before changing links
This commit is contained in:
169
cloudferro_docs_scraper_new.py
Normal file
169
cloudferro_docs_scraper_new.py
Normal file
@ -0,0 +1,169 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time
|
||||
import yaml
|
||||
import markdownify
|
||||
|
||||
def scrape_cloudferro_docs(base_url):
    """Scrape CloudFerro documentation and save it as a MkDocs project.

    Crawls pages reachable from *base_url* via the Read-the-Docs theme's
    sidebar menu, converts each page's main content to Markdown under
    ``cloudferro-docs/docs/``, then writes ``mkdocs.yml`` (with the
    collected navigation structure) and ``requirements.txt``.

    Args:
        base_url: Root URL of the documentation site to mirror. Only links
            on the same host as this URL are followed.
    """
    # Create project directory
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Keep track of visited URLs and the nested navigation structure
    visited_urls = set()
    nav_structure = []

    def clean_filename(text):
        """Convert text to a clean, lowercase filename component."""
        return text.strip().replace(' ', '_').replace('/', '_').replace('?', '').lower()

    def scrape_page(url, parent_path=""):
        """Fetch one page, save it as Markdown, and recurse into menu links."""
        if url in visited_urls:
            return

        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            # Add delay to be respectful to the server
            time.sleep(1)
            # FIX: requests has no default timeout, so a single stalled
            # connection would hang the entire crawl indefinitely.
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract main content (Read-the-Docs themed page body)
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return

            # Get title: prefer the first <h1>, fall back to <title>, then URL
            title = soup.select_one('h1') or soup.select_one('title')
            page_title = title.text.strip() if title else url.split('/')[-1]

            # Convert the extracted HTML content to Markdown
            md_content = markdownify.markdownify(str(content), heading_style="ATX")

            # Create file path. FIX: clean_filename() can yield an empty
            # string (title made only of stripped chars), which would
            # produce a hidden ".md" file — fall back to 'index'.
            filename = (clean_filename(page_title) or 'index') + '.md'
            relative_path = os.path.join(parent_path, filename)
            file_path = os.path.join(docs_dir, relative_path)

            # Ensure directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Save content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# {page_title}\n\n")
                f.write(md_content)

            print(f"Saved: {file_path}")

            # Update navigation structure: walk (creating as needed) the
            # nested section lists that mirror parent_path, then register
            # this page in the innermost section.
            current_section = nav_structure
            if parent_path:
                for section in parent_path.split('/'):
                    if section:
                        section_dict = next((item for item in current_section
                                             if isinstance(item, dict) and section in item), None)
                        if not section_dict:
                            section_dict = {section: []}
                            current_section.append(section_dict)
                        current_section = section_dict[section]

            current_section.append({page_title: relative_path})

            # Find and process links from the sidebar navigation menu.
            # NOTE(review): recursion depth equals crawl depth; fine for a
            # docs sidebar, but an unusually deep link graph could hit the
            # interpreter recursion limit.
            menu = soup.select_one('.wy-menu-vertical')
            if menu:
                for link in menu.select('a'):
                    href = link.get('href')
                    if not href or href.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
                        continue

                    next_url = urljoin(url, href)
                    # Stay on the same host as the documentation root
                    if urlparse(next_url).netloc == urlparse(base_url).netloc:
                        # A toctree-l2 item belongs to the preceding
                        # toctree-l1 section; nest its output accordingly.
                        parent = link.find_parent('li', class_='toctree-l2')
                        next_parent_path = parent_path
                        if parent:
                            section = parent.find_previous_sibling('li', class_='toctree-l1')
                            if section:
                                section_name = clean_filename(section.get_text().strip())
                                next_parent_path = os.path.join(parent_path, section_name)
                        scrape_page(next_url, next_parent_path)

        except Exception as e:
            # Best-effort crawl: log the failure and continue with the
            # remaining pages rather than aborting the whole run.
            print(f"Error processing {url}: {str(e)}")

    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    # Generate mkdocs.yml from the navigation collected during the crawl
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_description': 'CloudFerro Documentation Mirror',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': nav_structure,
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'pymdownx.snippets',
            'pymdownx.tabbed',
            'footnotes',
            'toc',
            'tables',
            'attr_list'
        ]
    }

    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt for serving the mirrored docs
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]

    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("\nTo preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")
||||
if __name__ == "__main__":
    # Entry point: mirror the CloudFerro documentation rooted at this URL.
    docs_root = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(docs_root)
Reference in New Issue
Block a user