"""Clone the CloudFerro documentation site into a local MkDocs project.

Crawls the rendered HTML docs starting from a base URL, converts every page
to Markdown, records a navigation tree, and writes the mkdocs.yml,
.readthedocs.yml and requirements.txt needed to host the clone on
Read the Docs.
"""

import os
import time
from urllib.parse import urljoin


def convert_nav_structure(nav):
    """Normalize the crawler's nav tree into the list-of-dicts shape MkDocs expects.

    *nav* is a list whose entries are ``{title: path}`` (leaf page) or
    ``{title: sublist}`` (section, recursed into).  Entries that are not
    dicts are silently dropped.
    """
    result = []
    for item in nav:
        if not isinstance(item, dict):
            continue
        for key, value in item.items():
            if isinstance(value, str):
                result.append({key: value})
            else:
                result.append({key: convert_nav_structure(value)})
    return result


def scrape_cloudferro_docs(base_url):
    """Crawl the documentation at *base_url* into a ./cloudferro-docs project.

    Side effects: creates the project directory tree, writes one Markdown
    file per scraped page, and emits mkdocs.yml, .readthedocs.yml and
    requirements.txt.  A 1 s delay per request keeps the crawl polite to
    the server, so this is slow by design.
    """
    # Third-party deps are imported lazily so the module can be imported
    # (e.g. to reuse convert_nav_structure) without them installed.
    import requests
    import yaml
    import markdownify
    from bs4 import BeautifulSoup

    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Nav tree: a list of {title: path} leaves / {title: sublist} sections.
    # (Must be a list — pages are appended to it and convert_nav_structure
    # iterates it as a list of dicts.)
    nav_structure = []
    visited_urls = set()  # guards against re-scraping and link cycles

    def descend_nav(nav_list, part):
        # Return the child list registered under *part*, creating the
        # section entry on first sight so no page is silently dropped.
        for item in nav_list:
            if isinstance(item, dict) and part in item:
                return item[part]
        child = []
        nav_list.append({part: child})
        return child

    def scrape_page(url, parent_path=""):
        if url in visited_urls:
            return
        visited_urls.add(url)
        try:
            time.sleep(1)  # be respectful to the server
            response = requests.get(url, timeout=30)  # timeout: never hang forever
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Main-content selector for the Sphinx/RTD-themed CloudFerro docs.
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return

        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Derive a filesystem-safe page slug from the <h1>/<title>.
        title = soup.select_one('.md-content h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        # Register the page under its section path in the nav tree.
        # Split on os.sep because parent_path is built with os.path.join.
        current_nav = nav_structure
        for part in (parent_path.split(os.sep) if parent_path else []):
            current_nav = descend_nav(current_nav, part)
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Follow in-site links found in the page body.
        for link in content.select('a[href]'):
            href = link['href']
            if href.startswith(('#', 'mailto:', 'javascript:')):
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                new_parent_path = parent_path
                if 'section' in link.get('class', []):  # adjust based on actual class names
                    new_parent_path = os.path.join(parent_path, page_title)
                scrape_page(absolute_url, new_parent_path)

    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")

    # --- MkDocs configuration -------------------------------------------
    mkdocs_config = {
        'site_name': 'CloudFerro Cloud Documentation',
        'site_url': 'https://your-readthedocs-subdomain.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {'primary': 'blue', 'accent': 'blue'},
            'features': ['content.code.copy', 'navigation.sections'],
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': ['admonition', 'pymdownx.details', 'pymdownx.superfences'],
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True)

    # --- Read the Docs configuration ------------------------------------
    readthedocs_config = {
        'version': 2,
        'build': {'os': 'ubuntu-24.04', 'tools': {'python': '3.11'}},
        'python': {'install': [{'requirements': 'requirements.txt'}]},
        'mkdocs': {'configuration': 'mkdocs.yml'},
    }
    with open(os.path.join(project_dir, '.readthedocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(readthedocs_config, f, allow_unicode=True)

    # --- Build requirements ---------------------------------------------
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write("mkdocs>=1.5.0\nmkdocs-material>=9.6.0\n")

    print(f"Documentation cloned to {project_dir}. Run 'mkdocs serve' to preview locally.")
    print("Push to a Git repository and import to Read the Docs to host.")


if __name__ == "__main__":
    # Guarded so importing this module does not start a network crawl.
    scrape_cloudferro_docs('https://docs.cloudferro.com/en/latest/')