"""Scrape the CloudFerro documentation site into a local MkDocs project.

Renders each page with a headless Brave/Chromium browser via Selenium,
converts the main Read-the-Docs content area to Markdown, and writes a
ready-to-serve MkDocs Material project (docs/, mkdocs.yml,
requirements.txt) under ./cloudferro-docs.
"""

import os
import time
from urllib.parse import urljoin

import markdownify
import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By  # noqa: F401 (kept from original)
from webdriver_manager.chrome import ChromeDriverManager


def scrape_cloudferro_docs(base_url):
    """Crawl *base_url* recursively and emit a MkDocs project.

    Parameters
    ----------
    base_url : str
        Root URL of the documentation site; only links under this prefix
        are followed.

    Side effects: creates the ``cloudferro-docs`` directory tree, writes
    one ``.md`` file per scraped page plus ``mkdocs.yml`` and
    ``requirements.txt``, and prints progress to stdout. Returns None.
    """
    # Set up Brave (Chromium-based) options for headless rendering.
    chrome_options = Options()
    # NOTE(review): hard-coded macOS Brave path — adjust for other hosts.
    chrome_options.binary_location = (
        "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
    )
    chrome_options.add_argument("--headless")  # Run in headless mode

    # Initialize the WebDriver; bail out entirely if the browser can't start.
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        return

    # Create project directory layout.
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Navigation structure for mkdocs.yml.
    # FIX: this must be a LIST (entries are appended below); the original
    # used a dict, which has no .append() and crashed on the first page.
    nav_structure = []
    visited_urls = set()

    def scrape_page(url, parent_path=""):
        """Fetch one page, save it as Markdown, and recurse into its links."""
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            driver.get(url)
            time.sleep(2)  # Crude wait for JS-rendered content to settle.
        except Exception as e:
            print(f"Error loading {url}: {e}")
            return

        # Parse the rendered page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract main content (Read-the-Docs theme selectors).
        content = soup.select_one('.rst-content .document') or soup.select_one('.document')
        if not content:
            print(f"No content found at {url}")
            return

        # Convert the content HTML to Markdown.
        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Derive a filesystem-safe page title for the file name and nav entry.
        title = soup.select_one('h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        # Build the output path and save the Markdown file.
        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"Saved: {file_path}")

        # Walk nav_structure down to the section matching parent_path,
        # creating intermediate sections as needed.
        nav_path = parent_path.split('/') if parent_path else []
        current_nav = nav_structure
        for part in nav_path:
            for item in current_nav:
                if isinstance(item, dict) and part in item:
                    current_nav = item[part]
                    break
            else:
                # FIX: section not seen yet — create it so the descent
                # cannot silently stay at the wrong level.
                new_section = []
                current_nav.append({part: new_section})
                current_nav = new_section
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Follow sidebar links (Read-the-Docs theme menu) within the site.
        links = soup.select('.wy-menu-vertical li a')  # CloudFerro uses Read the Docs theme
        for link in links:
            href = link.get('href')
            if not href or href.startswith('#') or href.startswith('mailto:') or href.startswith('javascript:'):
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                # Links marked 'current' nest under this page; others stay siblings.
                new_parent_path = (
                    os.path.join(parent_path, page_title)
                    if 'current' in link.get('class', [])
                    else parent_path
                )
                scrape_page(absolute_url, new_parent_path)

    # Start scraping from the base URL; always release the browser.
    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()

    # Generate mkdocs.yml from the collected navigation structure.
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_url': 'https://cloudferro-docs.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'toc',
            'tables'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt for serving the generated docs.
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("To preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")


def convert_nav_structure(nav):
    """Normalize the collected nav list into MkDocs ``nav:`` format.

    Each element is a single-key dict mapping a title either to a file
    path (leaf page) or to a nested list (section), which is converted
    recursively.
    """
    result = []
    for item in nav:
        if isinstance(item, dict):
            for key, value in item.items():
                if isinstance(value, str):
                    result.append({key: value})
                else:
                    result.append({key: convert_nav_structure(value)})
    return result


if __name__ == "__main__":
    url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(url)