before changing links
This commit is contained in:
169
cloudferro_docs_scraper_new.py
Normal file
169
cloudferro_docs_scraper_new.py
Normal file
@ -0,0 +1,169 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time
|
||||
import yaml
|
||||
import markdownify
|
||||
|
||||
def scrape_cloudferro_docs(base_url):
    """Scrape CloudFerro documentation and save it as a MkDocs project.

    Crawls pages reachable from *base_url* via the Read-the-Docs theme's
    sidebar menu, converts each page's main content to Markdown under
    ``cloudferro-docs/docs/``, then writes ``mkdocs.yml`` (with the
    collected navigation structure) and ``requirements.txt``.

    Args:
        base_url: Root URL of the documentation site to mirror. Only links
            on the same host as this URL are followed.
    """
    # Create project directory
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Keep track of visited URLs and the nested navigation structure
    visited_urls = set()
    nav_structure = []

    def clean_filename(text):
        """Convert text to a clean, lowercase filename component."""
        return text.strip().replace(' ', '_').replace('/', '_').replace('?', '').lower()

    def scrape_page(url, parent_path=""):
        """Fetch one page, save it as Markdown, and recurse into menu links."""
        if url in visited_urls:
            return

        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            # Add delay to be respectful to the server
            time.sleep(1)
            # FIX: requests has no default timeout, so a single stalled
            # connection would hang the entire crawl indefinitely.
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract main content (Read-the-Docs themed page body)
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return

            # Get title: prefer the first <h1>, fall back to <title>, then URL
            title = soup.select_one('h1') or soup.select_one('title')
            page_title = title.text.strip() if title else url.split('/')[-1]

            # Convert the extracted HTML content to Markdown
            md_content = markdownify.markdownify(str(content), heading_style="ATX")

            # Create file path. FIX: clean_filename() can yield an empty
            # string (title made only of stripped chars), which would
            # produce a hidden ".md" file — fall back to 'index'.
            filename = (clean_filename(page_title) or 'index') + '.md'
            relative_path = os.path.join(parent_path, filename)
            file_path = os.path.join(docs_dir, relative_path)

            # Ensure directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Save content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# {page_title}\n\n")
                f.write(md_content)

            print(f"Saved: {file_path}")

            # Update navigation structure: walk (creating as needed) the
            # nested section lists that mirror parent_path, then register
            # this page in the innermost section.
            current_section = nav_structure
            if parent_path:
                for section in parent_path.split('/'):
                    if section:
                        section_dict = next((item for item in current_section
                                             if isinstance(item, dict) and section in item), None)
                        if not section_dict:
                            section_dict = {section: []}
                            current_section.append(section_dict)
                        current_section = section_dict[section]

            current_section.append({page_title: relative_path})

            # Find and process links from the sidebar navigation menu.
            # NOTE(review): recursion depth equals crawl depth; fine for a
            # docs sidebar, but an unusually deep link graph could hit the
            # interpreter recursion limit.
            menu = soup.select_one('.wy-menu-vertical')
            if menu:
                for link in menu.select('a'):
                    href = link.get('href')
                    if not href or href.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
                        continue

                    next_url = urljoin(url, href)
                    # Stay on the same host as the documentation root
                    if urlparse(next_url).netloc == urlparse(base_url).netloc:
                        # A toctree-l2 item belongs to the preceding
                        # toctree-l1 section; nest its output accordingly.
                        parent = link.find_parent('li', class_='toctree-l2')
                        next_parent_path = parent_path
                        if parent:
                            section = parent.find_previous_sibling('li', class_='toctree-l1')
                            if section:
                                section_name = clean_filename(section.get_text().strip())
                                next_parent_path = os.path.join(parent_path, section_name)
                        scrape_page(next_url, next_parent_path)

        except Exception as e:
            # Best-effort crawl: log the failure and continue with the
            # remaining pages rather than aborting the whole run.
            print(f"Error processing {url}: {str(e)}")

    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    # Generate mkdocs.yml from the navigation collected during the crawl
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_description': 'CloudFerro Documentation Mirror',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': nav_structure,
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'pymdownx.snippets',
            'pymdownx.tabbed',
            'footnotes',
            'toc',
            'tables',
            'attr_list'
        ]
    }

    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt for serving the mirrored docs
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]

    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("\nTo preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")
||||
if __name__ == "__main__":
    # Entry point: mirror the CloudFerro documentation rooted at this URL.
    docs_root = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(docs_root)
Reference in New Issue
Block a user