before changing links

This commit is contained in:
govardhan
2025-06-19 09:01:18 +05:30
commit 6686208bf1
1277 changed files with 29692 additions and 0 deletions

173
extract_docs_brave.py Normal file
View File

@ -0,0 +1,173 @@
import time
import os
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import yaml
from bs4 import BeautifulSoup
import markdownify
def scrape_cloudferro_docs(base_url):
    """Crawl the CloudFerro documentation site and emit a MkDocs project.

    Starting from ``base_url``, recursively follows sidebar links (Read the
    Docs theme selector ``.wy-menu-vertical li a``), converts each page's main
    content to Markdown, and writes a ``cloudferro-docs/`` directory containing
    ``docs/*.md``, a generated ``mkdocs.yml`` and a ``requirements.txt``.

    Parameters
    ----------
    base_url : str
        Root URL of the documentation; only links under this prefix are followed.

    Returns
    -------
    None. All output is written to disk; progress and errors go to stdout.
    """
    # Set up Brave options (Brave is Chromium-based, so chromedriver works).
    chrome_options = Options()
    # Specify Brave binary location (macOS path — adjust on other platforms).
    chrome_options.binary_location = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
    chrome_options.add_argument("--headless")  # Run in headless mode

    # Initialize the WebDriver; bail out early if the driver cannot start.
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        return

    # Create project directory layout: cloudferro-docs/docs/
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Navigation structure accumulated while crawling.
    # BUG FIX: this was `{}` (a dict), but scrape_page() calls
    # `current_nav.append(...)` on it, which would raise AttributeError on the
    # very first page. It must be a list of {title: target} entries — the
    # exact shape convert_nav_structure() consumes.
    nav_structure = []
    visited_urls = set()

    def scrape_page(url, parent_path=""):
        # Skip pages we have already saved (the sidebar repeats links heavily).
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            driver.get(url)
            time.sleep(2)  # Wait for page to load (no explicit-wait condition used)
        except Exception as e:
            print(f"Error loading {url}: {e}")
            return

        # Parse the rendered page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract main content (selectors for the CloudFerro / RTD theme).
        content = soup.select_one('.rst-content .document') or soup.select_one('.document')
        if not content:
            print(f"No content found at {url}")
            return

        # Convert HTML to Markdown.
        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Derive a filesystem-safe page title for the file name and nav entry.
        title = soup.select_one('h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        # Save the Markdown content under docs/, mirroring the nav hierarchy.
        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"Saved: {file_path}")

        # Update the navigation structure: walk down to the list that
        # corresponds to parent_path, then append this page's entry.
        nav_path = parent_path.split('/') if parent_path else []
        current_nav = nav_structure
        for part in nav_path:
            for item in current_nav:
                if isinstance(item, dict) and part in item:
                    current_nav = item[part]
                    break
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Find and scrape linked pages (sidebar links only, same-site only).
        links = soup.select('.wy-menu-vertical li a')  # CloudFerro uses Read the Docs theme
        for link in links:
            href = link.get('href')
            if not href or href.startswith('#') or href.startswith('mailto:') or href.startswith('javascript:'):
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                # Nest under the current page only when the link is marked
                # 'current' (i.e. a child of the page being viewed).
                new_parent_path = os.path.join(parent_path, page_title) if 'current' in link.get('class', []) else parent_path
                scrape_page(absolute_url, new_parent_path)

    # Start crawling from the base URL; always release the browser.
    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()

    # Generate mkdocs.yml from the collected navigation tree.
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_url': 'https://cloudferro-docs.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'toc',
            'tables'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt for building the generated site.
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("To preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")
def convert_nav_structure(nav):
    """Normalise the crawled navigation tree into MkDocs ``nav:`` form.

    Each entry is expected to be a ``{title: target}`` mapping where the
    target is either a Markdown path (str) or a nested list of the same
    shape. Non-dict entries are silently dropped.
    """
    converted = []
    for entry in nav:
        if not isinstance(entry, dict):
            continue  # ignore anything that is not a {title: target} mapping
        for title, target in entry.items():
            # Leaf pages stay as-is; sub-sections are converted recursively.
            value = target if isinstance(target, str) else convert_nav_structure(target)
            converted.append({title: value})
    return converted
if __name__ == "__main__":
    # Script entry point: crawl the latest CloudFerro docs and build the
    # MkDocs project in ./cloudferro-docs.
    start_url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(start_url)