before changing links
This commit is contained in:
83
scrape_docs.py
Normal file
83
scrape_docs.py
Normal file
@ -0,0 +1,83 @@
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse, urlunparse
|
||||
from markdownify import markdownify as md
|
||||
|
||||
# Root of the documentation site to crawl; only URLs under this prefix are followed.
BASE_URL = "https://docs.cloudferro.com/en/latest/"
# Local directory that receives the converted Markdown tree.
OUTPUT_DIR = "docs"
# Shared image cache; generated Markdown references files here as "_images/<name>".
IMAGES_DIR = os.path.join(OUTPUT_DIR, "_images")
# Canonicalized URLs already processed (see normalize_url()); module-level crawl state.
visited = set()
|
||||
|
||||
def normalize_url(url):
    """Return a canonical form of *url* for deduplication.

    Drops the query string and fragment and strips any trailing slash
    from the path (an empty path becomes "/"), so variants such as
    ``.../page/``, ``.../page?x=1`` and ``.../page#top`` all collapse
    to the same key in the ``visited`` set.
    """
    parts = urlparse(url)
    canonical_path = parts.path.rstrip("/") or "/"
    return urlunparse((parts.scheme, parts.netloc, canonical_path, "", "", ""))
|
||||
|
||||
def download_image(img_url):
    """Download *img_url* into IMAGES_DIR unless it is already cached.

    Returns the Markdown-relative path ``_images/<filename>`` whether or
    not the download succeeded, so page conversion can proceed
    best-effort; download failures are only logged to stdout.
    """
    os.makedirs(IMAGES_DIR, exist_ok=True)
    filename = os.path.basename(urlparse(img_url).path)
    local_path = os.path.join(IMAGES_DIR, filename)
    if not os.path.exists(local_path):
        try:
            r = requests.get(img_url, timeout=10)
            r.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            # Best-effort: a missing image should not abort the crawl.
            print(f"Failed to download {img_url}: {e}")
    # BUG FIX: the previous version returned the literal string
    # "_images/(unknown)" for every image instead of the cached filename.
    return f"_images/{filename}"
|
||||
|
||||
def process_page(url, rel_path):
    """Fetch *url*, localize its images, convert it to Markdown, and save.

    The converted page is written to ``OUTPUT_DIR/rel_path``; any
    intermediate directories are created on demand.
    """
    print(f"Processing: {url}")
    # Timeout added for consistency with download_image(); the original
    # call could hang indefinitely on a stalled connection.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")

    # Download images and rewrite each src to point at the local copy.
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            # BUG FIX: <img> tags without a src attribute previously
            # raised KeyError and aborted the whole page.
            continue
        img["src"] = download_image(urljoin(url, src))

    # Prefer the <main> element; fall back to the whole <body>.
    main = soup.find("main") or soup.body
    md_content = md(str(main))

    # Save the Markdown output.
    out_path = os.path.join(OUTPUT_DIR, rel_path)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(md_content)
|
||||
|
||||
def crawl_iterative(start_url, start_rel_path="index.md"):
    """Depth-first crawl of the documentation site starting at *start_url*.

    Each page not yet in the module-level ``visited`` set is rendered to
    Markdown via process_page(), then scanned for further in-site links,
    which are pushed onto the stack. Deduplication keys on
    normalize_url(), so query/fragment/trailing-slash variants are
    visited only once.
    """
    stack = [(start_url, start_rel_path)]
    while stack:
        url, rel_path = stack.pop()
        norm_url = normalize_url(url)
        if norm_url in visited:
            continue
        # Mark before processing so a failing page is never retried.
        visited.add(norm_url)
        process_page(url, rel_path)
        try:
            # NOTE(review): this refetches the page process_page() just
            # downloaded — wasteful but harmless for a small docs site.
            # Timeout added for consistency with download_image().
            r = requests.get(url, timeout=10)
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # Skip absolute links that leave the documentation site.
                if href.startswith("http") and not href.startswith(BASE_URL):
                    continue
                abs_url = urljoin(url, href)
                norm_abs_url = normalize_url(abs_url)
                if norm_abs_url.startswith(BASE_URL.rstrip("/")) and norm_abs_url not in visited:
                    # Map the URL path to a Markdown path under OUTPUT_DIR:
                    # directory-style URLs become <dir>/index.md, page URLs
                    # become <page>.md.
                    path = urlparse(abs_url).path.replace("/en/latest/", "")
                    if path.endswith("/") or path == "":
                        rel_md = os.path.join(path, "index.md")
                    else:
                        rel_md = path.rstrip("/") + ".md"
                    stack.append((abs_url, rel_md.lstrip("/")))
        except Exception as e:
            # Link extraction is best-effort; the page itself was saved.
            print(f"Failed to process links on {url}: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: crawl the entire documentation tree from the site root.
    crawl_iterative(BASE_URL)
    print("Scraping complete. All Markdown and images are in the 'docs/' folder.")
|
||||
Reference in New Issue
Block a user