# 3engines_doc/cloudferro_docs_scraper_new.py

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import yaml
import markdownify

def scrape_cloudferro_docs(base_url):
    """Scrape CloudFerro documentation and save it as an MkDocs project.

    Starting at ``base_url``, follows the links found in each page's
    ``.wy-menu-vertical`` menu (same network location only), converts the
    page's ``.document`` element to Markdown and writes it below
    ``cloudferro-docs/docs``. Afterwards ``mkdocs.yml`` (including the
    collected navigation) and ``requirements.txt`` are written to
    ``cloudferro-docs``.

    Args:
        base_url: URL of the documentation start page. Only links whose
            network location matches this URL's are followed.

    Pages that cannot be downloaded or processed are reported on stdout and
    skipped; the function itself returns ``None``.
    """

    # Create project directory
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Keep track of visited URLs and navigation structure
    visited_urls = set()
    nav_structure = []
    base_netloc = urlparse(base_url).netloc
    # Without a timeout a single stalled connection would block the crawl forever.
    request_timeout = 30

    def clean_filename(text):
        """Convert text to a clean filename"""
        return text.strip().replace(' ', '_').replace('/', '_').replace('?', '').lower()

    def strip_fragment(url):
        """Drop the ``#fragment`` so anchors of one page map to a single URL."""
        return urlparse(url)._replace(fragment='').geturl()

    def join_nav_path(parent_path, name):
        """Join docs-relative path parts with '/', independent of the OS."""
        return f"{parent_path}/{name}" if parent_path else name

    def register_in_nav(parent_path, page_title, relative_path):
        """Append the page to ``nav_structure`` below its (nested) sections."""
        current_section = nav_structure
        for section in parent_path.split('/'):
            if not section:
                continue
            # Only reuse entries that really are sections (name -> list); a
            # page entry with the same title maps to a string, not a list.
            section_dict = next(
                (item for item in current_section
                 if isinstance(item, dict) and isinstance(item.get(section), list)),
                None,
            )
            if section_dict is None:
                section_dict = {section: []}
                current_section.append(section_dict)
            current_section = section_dict[section]
        current_section.append({page_title: relative_path})

    def collect_links(soup, page_url, parent_path):
        """Return ``(url, parent_path)`` pairs for unvisited menu links."""
        menu = soup.select_one('.wy-menu-vertical')
        if not menu:
            return []

        found = []
        for link in menu.select('a'):
            href = link.get('href')
            if not href or href.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
                continue

            next_url = strip_fragment(urljoin(page_url, href))
            if urlparse(next_url).netloc != base_netloc or next_url in visited_urls:
                continue

            next_parent_path = parent_path
            parent = link.find_parent('li', class_='toctree-l2')
            if parent:
                # NOTE(review): in a nested toctree the l2 item usually sits
                # inside its l1 item rather than next to it, so this sibling
                # lookup may never match - confirm against the site's HTML.
                section = parent.find_previous_sibling('li', class_='toctree-l1')
                if section:
                    section_name = clean_filename(section.get_text().strip())
                    if section_name:
                        next_parent_path = join_nav_path(parent_path, section_name)
            found.append((next_url, next_parent_path))
        return found

    def scrape_page(url, parent_path=""):
        """Download and save one page; return the links to visit next."""
        if url in visited_urls:
            return []

        visited_urls.add(url)
        print(f"Scraping: {url}")

        # Best-effort per page: any failure is reported and the crawl goes on.
        try:
            # Add delay to be respectful to the server
            time.sleep(1)
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract main content
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return []

            # Get title, falling back to the last URL segment
            title = soup.select_one('h1')
            if not title:
                title = soup.select_one('title')
            page_title = title.text.strip() if title else ''
            if not page_title:
                # An empty title would produce a file named ".md" and an
                # empty navigation key.
                page_title = url.rstrip('/').split('/')[-1] or 'index'

            # Convert content to markdown
            md_content = markdownify.markdownify(str(content), heading_style="ATX")

            # Docs-relative path always uses '/', which is also what the
            # navigation in mkdocs.yml is written with; the on-disk path is
            # derived from it with the OS separator.
            filename = clean_filename(page_title) + '.md'
            relative_path = join_nav_path(parent_path, filename)
            file_path = os.path.join(docs_dir, *relative_path.split('/'))

            # Ensure directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Save content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# {page_title}\n\n")
                f.write(md_content)

            print(f"Saved: {file_path}")

            register_in_nav(parent_path, page_title, relative_path)
            return collect_links(soup, url, parent_path)

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return []

    # Depth-first crawl with an explicit stack instead of recursion: a
    # recursive crawl nests one call per page along a chain of links, which
    # can exceed the interpreter's recursion limit and keeps every parsed
    # page alive on the call stack.
    pending = [(strip_fragment(base_url), "")]
    try:
        while pending:
            url, parent_path = pending.pop()
            # Reversed so that the first link of a page is visited first.
            pending.extend(reversed(scrape_page(url, parent_path)))
    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    # Generate mkdocs.yml
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_description': 'CloudFerro Documentation Mirror',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': nav_structure,
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'pymdownx.snippets',
            'pymdownx.tabbed',
            'footnotes',
            'toc',
            'tables',
            'attr_list'
        ]
    }

    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]

    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("\nTo preview the documentation locally:")
    print(f"1. Install requirements: pip install -r {project_dir}/requirements.txt")
    print(f"2. Run local server: cd {project_dir} && mkdocs serve")

if __name__ == "__main__":
    # Script entry point: mirror the latest English CloudFerro documentation.
    scrape_cloudferro_docs('https://docs.cloudferro.com/en/latest/')