# Source listing metadata (preserved as comments so the file stays valid Python):
#   File:      3engines_doc/cloudferro_docs_scraper.py
#   Retrieved: 2025-06-19 09:01:18 +05:30
#   Size:      151 lines, 5.5 KiB, Python
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import yaml
import markdownify
def scrape_cloudferro_docs(base_url):
    """Mirror the CloudFerro docs site under ./cloudferro-docs as an MkDocs project.

    Crawls pages reachable from *base_url* (staying on the same site),
    converts each page's main content block to Markdown, records a nav
    structure, and writes ``mkdocs.yml``, ``.readthedocs.yml`` and
    ``requirements.txt`` so the mirror can be previewed with
    ``mkdocs serve`` or hosted on Read the Docs.

    Args:
        base_url: Root URL of the documentation site to mirror.
    """
    # Project layout: cloudferro-docs/docs/<page>.md
    # (the original created these directories twice; once is enough)
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Nav structure must be a *list* of {title: path-or-sublist} entries:
    # both scrape_page() below and convert_nav_structure() treat it as a
    # list. (The original initialized it as a dict, which made the later
    # current_nav.append(...) raise AttributeError.)
    nav_structure = []
    visited_urls = set()

    def scrape_page(url, parent_path=""):
        # Fetch one page, save it as Markdown, then recurse into same-site links.
        if url in visited_urls:
            return
        visited_urls.add(url)
        try:
            # Be respectful to the server: throttle to one request per second.
            time.sleep(1)
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Main content container for Sphinx/RTD-themed pages.
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return
        except Exception as e:
            # Best-effort crawl: log and skip pages that fail to fetch/parse.
            print(f"Error processing {url}: {str(e)}")
            return

        # Convert the extracted HTML fragment to Markdown with ATX (#) headings.
        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Derive a filesystem-safe page name from the page heading/title,
        # falling back to the last URL segment.
        # NOTE(review): '.md-content h1' targets the Material theme while the
        # content selector above targets the RTD theme — confirm which theme
        # the live site actually uses.
        title = soup.select_one('.md-content h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        # Walk down the nav list following parent_path segments, creating
        # missing section sublists on the way. (The original never created
        # them, so nested pages silently landed at the top level.)
        current_nav = nav_structure
        for part in (parent_path.split('/') if parent_path else []):
            for item in current_nav:
                if isinstance(item, dict) and part in item and isinstance(item[part], list):
                    current_nav = item[part]
                    break
            else:
                section = {part: []}
                current_nav.append(section)
                current_nav = section[part]
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Follow in-content links that stay within the documentation site.
        for link in content.select('a[href]'):
            href = link['href']
            # Skip fragments and non-HTTP schemes.
            if href.startswith(('#', 'mailto:', 'javascript:')):
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                new_parent_path = parent_path
                # NOTE(review): the 'section' class name is a guess — verify
                # against the real markup before relying on nested sections.
                if 'section' in link.get('class', []):
                    new_parent_path = os.path.join(parent_path, page_title)
                scrape_page(absolute_url, new_parent_path)

    # Start crawling from the site root.
    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")

    # Generate mkdocs.yml for the mirrored content.
    mkdocs_config = {
        'site_name': 'CloudFerro Cloud Documentation',
        'site_url': 'https://your-readthedocs-subdomain.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': ['content.code.copy', 'navigation.sections']
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True)

    # Create .readthedocs.yml so the project can be imported to Read the Docs.
    readthedocs_config = {
        'version': 2,
        'build': {
            'os': 'ubuntu-24.04',
            'tools': {'python': '3.11'}
        },
        'python': {
            'install': [{'requirements': 'requirements.txt'}]
        },
        'mkdocs': {'configuration': 'mkdocs.yml'}
    }
    with open(os.path.join(project_dir, '.readthedocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(readthedocs_config, f, allow_unicode=True)

    # Pin the build-time requirements for the hosted docs.
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write("mkdocs>=1.5.0\nmkdocs-material>=9.6.0\n")

    print(f"Documentation cloned to {project_dir}. Run 'mkdocs serve' to preview locally.")
    print("Push to a Git repository and import to Read the Docs to host.")
def convert_nav_structure(nav):
    """Recursively normalize a collected nav tree into MkDocs' nav format.

    Each element of *nav* is expected to be a single-entry dict mapping a
    section/page title to either a Markdown path (string) or a nested list
    of further entries. Non-dict elements are silently skipped.

    Args:
        nav: List of {title: path-or-sublist} entries.

    Returns:
        A new list in the same shape, with nested lists converted recursively.
    """
    converted = []
    for entry in nav:
        if not isinstance(entry, dict):
            continue  # ignore anything that isn't a titled entry
        for title, target in entry.items():
            # Leaf pages keep their path; sections recurse into their children.
            node = target if isinstance(target, str) else convert_nav_structure(target)
            converted.append({title: node})
    return converted
# URL of the CloudFerro documentation
# NOTE(review): this kicks off the full crawl at import time — consider
# wrapping in `if __name__ == "__main__":` so importing the module is safe.
url = 'https://docs.cloudferro.com/en/latest/'
scrape_cloudferro_docs(url)