before changing links
This commit is contained in:
83
scrape_docs.py
Normal file
83
scrape_docs.py
Normal file
@ -0,0 +1,83 @@
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse, urlunparse
|
||||
from markdownify import markdownify as md
|
||||
|
||||
# Root of the documentation site to crawl; only URLs under this prefix are followed.
BASE_URL = "https://docs.cloudferro.com/en/latest/"
# Local directory that receives the converted Markdown tree.
OUTPUT_DIR = "docs"
# Shared image cache; generated Markdown references files here as "_images/<name>".
IMAGES_DIR = os.path.join(OUTPUT_DIR, "_images")
# Canonicalized URLs already processed (see normalize_url()); module-level crawl state.
visited = set()
|
||||
|
||||
def normalize_url(url):
    """Return a canonical form of *url* for deduplication.

    Drops the query string and fragment and strips any trailing slash
    from the path (an empty path becomes "/"), so variants such as
    ``.../page/``, ``.../page?x=1`` and ``.../page#top`` all collapse
    to the same key in the ``visited`` set.
    """
    parts = urlparse(url)
    canonical_path = parts.path.rstrip("/") or "/"
    return urlunparse((parts.scheme, parts.netloc, canonical_path, "", "", ""))
|
||||
|
||||
def download_image(img_url):
    """Download *img_url* into IMAGES_DIR unless it is already cached.

    Returns the Markdown-relative path ``_images/<filename>`` whether or
    not the download succeeded, so page conversion can proceed
    best-effort; download failures are only logged to stdout.
    """
    os.makedirs(IMAGES_DIR, exist_ok=True)
    filename = os.path.basename(urlparse(img_url).path)
    local_path = os.path.join(IMAGES_DIR, filename)
    if not os.path.exists(local_path):
        try:
            r = requests.get(img_url, timeout=10)
            r.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            # Best-effort: a missing image should not abort the crawl.
            print(f"Failed to download {img_url}: {e}")
    # BUG FIX: the previous version returned the literal string
    # "_images/(unknown)" for every image instead of the cached filename.
    return f"_images/{filename}"
|
||||
|
||||
def process_page(url, rel_path):
    """Fetch *url*, localize its images, convert it to Markdown, and save.

    The converted page is written to ``OUTPUT_DIR/rel_path``; any
    intermediate directories are created on demand.
    """
    print(f"Processing: {url}")
    # Timeout added for consistency with download_image(); the original
    # call could hang indefinitely on a stalled connection.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")

    # Download images and rewrite each src to point at the local copy.
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            # BUG FIX: <img> tags without a src attribute previously
            # raised KeyError and aborted the whole page.
            continue
        img["src"] = download_image(urljoin(url, src))

    # Prefer the <main> element; fall back to the whole <body>.
    main = soup.find("main") or soup.body
    md_content = md(str(main))

    # Save the Markdown output.
    out_path = os.path.join(OUTPUT_DIR, rel_path)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(md_content)
|
||||
|
||||
def crawl_iterative(start_url, start_rel_path="index.md"):
    """Depth-first crawl of the documentation site starting at *start_url*.

    Each page not yet in the module-level ``visited`` set is rendered to
    Markdown via process_page(), then scanned for further in-site links,
    which are pushed onto the stack. Deduplication keys on
    normalize_url(), so query/fragment/trailing-slash variants are
    visited only once.
    """
    stack = [(start_url, start_rel_path)]
    while stack:
        url, rel_path = stack.pop()
        norm_url = normalize_url(url)
        if norm_url in visited:
            continue
        # Mark before processing so a failing page is never retried.
        visited.add(norm_url)
        process_page(url, rel_path)
        try:
            # NOTE(review): this refetches the page process_page() just
            # downloaded — wasteful but harmless for a small docs site.
            # Timeout added for consistency with download_image().
            r = requests.get(url, timeout=10)
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # Skip absolute links that leave the documentation site.
                if href.startswith("http") and not href.startswith(BASE_URL):
                    continue
                abs_url = urljoin(url, href)
                norm_abs_url = normalize_url(abs_url)
                if norm_abs_url.startswith(BASE_URL.rstrip("/")) and norm_abs_url not in visited:
                    # Map the URL path to a Markdown path under OUTPUT_DIR:
                    # directory-style URLs become <dir>/index.md, page URLs
                    # become <page>.md.
                    path = urlparse(abs_url).path.replace("/en/latest/", "")
                    if path.endswith("/") or path == "":
                        rel_md = os.path.join(path, "index.md")
                    else:
                        rel_md = path.rstrip("/") + ".md"
                    stack.append((abs_url, rel_md.lstrip("/")))
        except Exception as e:
            # Link extraction is best-effort; the page itself was saved.
            print(f"Failed to process links on {url}: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: crawl the entire documentation tree from the site root.
    crawl_iterative(BASE_URL)
    print("Scraping complete. All Markdown and images are in the 'docs/' folder.")
|
||||
Reference in New Issue
Block a user