"""Crawl the CloudFerro documentation site and mirror it as local Markdown.

Pages under BASE_URL are fetched, their images downloaded into
``docs/_images/``, and the page content (the ``<main>`` element, or the whole
body as a fallback) converted to Markdown under ``docs/``.
"""

import os
from urllib.parse import urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

BASE_URL = "https://docs.cloudferro.com/en/latest/"
OUTPUT_DIR = "docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "_images")
REQUEST_TIMEOUT = 10  # seconds; applied to every HTTP request so a dead server can't hang the crawl

# URLs already crawled, stored in normalized form (no query/fragment,
# no trailing slash) so the same page is never processed twice.
visited = set()


def normalize_url(url):
    """Return *url* reduced to scheme://host/path for dedup purposes.

    Query string and fragment are dropped and a trailing slash is stripped
    (a bare host keeps "/" as its path), so e.g. ``.../page/``,
    ``.../page?x=1`` and ``.../page#top`` all normalize identically.
    """
    parsed = urlparse(url)
    path = parsed.path.rstrip("/") or "/"
    return urlunparse((parsed.scheme, parsed.netloc, path, "", "", ""))


def download_image(img_url):
    """Download *img_url* into IMAGES_DIR (skipping files already cached).

    Returns the document-relative path ("_images/<name>") to substitute into
    the rewritten ``<img src>``. Download failures are logged and tolerated:
    the Markdown still references the expected local path.
    """
    os.makedirs(IMAGES_DIR, exist_ok=True)
    filename = os.path.basename(urlparse(img_url).path)
    local_path = os.path.join(IMAGES_DIR, filename)
    if not os.path.exists(local_path):
        try:
            r = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            print(f"Failed to download {img_url}: {e}")
    # BUG FIX: the original returned the literal string "_images/(unknown)"
    # for every image, breaking every image link in the generated Markdown.
    return f"_images/{filename}"


def process_page(url, rel_path):
    """Fetch *url*, localize its images, and save it as Markdown.

    The output file is OUTPUT_DIR/rel_path. Returns the parsed
    BeautifulSoup document so the caller can reuse it for link extraction
    instead of fetching the page a second time.
    """
    print(f"Processing: {url}")
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.text, "html.parser")

    # Download images and point their src at the local copies.
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            # BUG FIX: the original did img["src"] and raised KeyError on
            # <img> tags without a src attribute.
            continue
        img["src"] = download_image(urljoin(url, src))

    # Prefer the semantic <main> element; fall back to the whole body.
    main = soup.find("main") or soup.body
    md_content = md(str(main))

    out_path = os.path.join(OUTPUT_DIR, rel_path)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(md_content)
    return soup


def crawl_iterative(start_url, start_rel_path="index.md"):
    """Depth-first crawl from *start_url*, saving every in-site page.

    Uses an explicit stack (no recursion limit concerns) and the module-level
    ``visited`` set to avoid revisiting pages. Links leaving BASE_URL are
    ignored. A failure on one page is logged and skipped rather than
    aborting the whole crawl.
    """
    base_prefix = BASE_URL.rstrip("/")
    stack = [(start_url, start_rel_path)]
    while stack:
        url, rel_path = stack.pop()
        norm_url = normalize_url(url)
        if norm_url in visited:
            continue
        visited.add(norm_url)

        try:
            # Reuse the soup parsed by process_page for link extraction —
            # the original fetched every page twice.
            soup = process_page(url, rel_path)
        except Exception as e:
            # ROBUSTNESS FIX: the original called process_page outside any
            # try, so a single network error killed the entire crawl.
            print(f"Failed to process {url}: {e}")
            continue

        try:
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # Skip absolute links that leave the documentation site.
                if href.startswith("http") and not href.startswith(BASE_URL):
                    continue
                abs_url = urljoin(url, href)
                norm_abs_url = normalize_url(abs_url)
                if norm_abs_url.startswith(base_prefix) and norm_abs_url not in visited:
                    # Map the site path to a local Markdown path:
                    # directory-style URLs become <dir>/index.md,
                    # page URLs become <page>.md.
                    path = urlparse(abs_url).path.replace("/en/latest/", "")
                    if path.endswith("/") or path == "":
                        rel_md = os.path.join(path, "index.md")
                    else:
                        rel_md = path.rstrip("/") + ".md"
                    stack.append((abs_url, rel_md.lstrip("/")))
        except Exception as e:
            print(f"Failed to process links on {url}: {e}")


if __name__ == "__main__":
    crawl_iterative(BASE_URL)
    print("Scraping complete. All Markdown and images are in the 'docs/' folder.")