"""Mirror the CloudFerro documentation site as a local MkDocs Material project."""

import os
import time
from urllib.parse import urljoin, urlparse, urldefrag

import requests
import yaml
import markdownify
from bs4 import BeautifulSoup


def scrape_cloudferro_docs(base_url):
    """Scrape CloudFerro documentation and save as markdown files.

    Crawls the sidebar navigation starting at ``base_url`` (a Read the Docs
    style site), converts each page's main content to Markdown under
    ``cloudferro-docs/docs``, and writes ``mkdocs.yml`` plus
    ``requirements.txt`` so the mirror can be served with MkDocs Material.

    Args:
        base_url: Root URL of the documentation site to crawl. Only links
            on the same host are followed.
    """
    # Create the project directory layout.
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Track visited URLs (avoid re-scraping) and the nav tree for mkdocs.yml.
    visited_urls = set()
    nav_structure = []

    def clean_filename(text):
        """Convert arbitrary heading text to a safe, lowercase filename stem."""
        return text.strip().replace(' ', '_').replace('/', '_').replace('?', '').lower()

    def scrape_page(url, parent_path=""):
        """Fetch one page, save it as Markdown, then recurse into menu links.

        ``parent_path`` is the docs-relative directory (derived from the
        sidebar section names) that this page's file should live under.
        """
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            # Add delay to be respectful to the server.
            time.sleep(1)
            # FIX: requests has no default timeout, so a single stalled
            # connection would hang the entire crawl indefinitely.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the main content area (Read the Docs theme layout).
            content = (soup.select_one('.rst-content .document')
                       or soup.select_one('.document'))
            if not content:
                print(f"No content found at {url}")
                return

            # Page title: first <h1>, falling back to <title>, then URL slug.
            title = soup.select_one('h1') or soup.select_one('title')
            page_title = title.text.strip() if title else url.split('/')[-1]

            # Convert the extracted HTML to Markdown.
            md_content = markdownify.markdownify(str(content), heading_style="ATX")

            # Build the output path and make sure its directory exists.
            filename = clean_filename(page_title) + '.md'
            relative_path = os.path.join(parent_path, filename)
            file_path = os.path.join(docs_dir, relative_path)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# {page_title}\n\n")
                f.write(md_content)
            print(f"Saved: {file_path}")

            # Insert this page into the nested nav structure, creating
            # intermediate section dicts as needed.
            current_section = nav_structure
            if parent_path:
                for section in parent_path.split('/'):
                    if not section:
                        continue
                    section_dict = next(
                        (item for item in current_section
                         if isinstance(item, dict) and section in item),
                        None,
                    )
                    if not section_dict:
                        section_dict = {section: []}
                        current_section.append(section_dict)
                    current_section = section_dict[section]
            current_section.append({page_title: relative_path})

            # Follow same-site links from the sidebar menu.
            menu = soup.select_one('.wy-menu-vertical')
            if menu:
                for link in menu.select('a'):
                    href = link.get('href')
                    if not href or href.startswith(
                            ('#', 'mailto:', 'javascript:', 'tel:')):
                        continue
                    # FIX: drop the #fragment so the same page reached via
                    # different anchors is not scraped more than once (the
                    # startswith filter above only catches pure-fragment hrefs).
                    next_url, _ = urldefrag(urljoin(url, href))
                    if urlparse(next_url).netloc != urlparse(base_url).netloc:
                        continue
                    # Nest level-2 toctree entries under their level-1
                    # section's cleaned name.
                    parent = link.find_parent('li', class_='toctree-l2')
                    next_parent_path = parent_path
                    if parent:
                        section = parent.find_previous_sibling(
                            'li', class_='toctree-l1')
                        if section:
                            section_name = clean_filename(section.get_text().strip())
                            next_parent_path = os.path.join(parent_path, section_name)
                    scrape_page(next_url, next_parent_path)
        except Exception as e:
            # Best-effort crawl: log the failure and keep going.
            print(f"Error processing {url}: {str(e)}")

    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    # Generate mkdocs.yml for a Material-themed mirror of the crawled nav.
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_description': 'CloudFerro Documentation Mirror',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': nav_structure,
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'pymdownx.snippets',
            'pymdownx.tabbed',
            'footnotes',
            'toc',
            'tables',
            'attr_list'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt for serving the mirror locally.
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("\nTo preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")


if __name__ == "__main__":
    url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(url)