# Source: 3engines_doc/extract_docs_brave.py
# Last modified: 2025-06-19 09:01:18 +05:30 (174 lines, 6.2 KiB, Python)
import time
import os
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import yaml
from bs4 import BeautifulSoup
import markdownify
def scrape_cloudferro_docs(base_url):
    """Mirror a CloudFerro (Read the Docs) site into a local MkDocs project.

    Crawls ``base_url`` with a headless Brave browser (driven via
    chromedriver), converts each page's main content to Markdown, and
    generates ``cloudferro-docs/`` containing ``docs/``, ``mkdocs.yml``
    and ``requirements.txt``.

    Args:
        base_url: Root URL of the documentation site; only links that
            start with this prefix are followed.

    Side effects: creates files/directories under ``cloudferro-docs``,
    launches and quits a Selenium browser session, prints progress.
    """
    # Brave is Chromium-based, so we reuse the Chrome driver and just
    # point it at the Brave binary (macOS install path).
    chrome_options = Options()
    chrome_options.binary_location = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
    chrome_options.add_argument("--headless")  # no visible browser window

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        return

    # Output layout: <project_dir>/docs/**/*.md plus generated config files.
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Nav tree in the shape convert_nav_structure()/MkDocs expect: a list of
    # {title: path} leaves and {section: [children]} branches.
    # BUG FIX: this was initialised as a dict ({}), which has no .append(),
    # so the first scraped page raised AttributeError (swallowed by the
    # outer try/except, aborting the whole crawl).
    nav_structure = []
    visited_urls = set()

    def scrape_page(url, parent_path=""):
        """Fetch one page, save it as Markdown, register it in the nav
        tree, then recurse into same-site sidebar links."""
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")
        try:
            driver.get(url)
            time.sleep(2)  # crude wait for JS-rendered content to settle
        except Exception as e:
            print(f"Error loading {url}: {e}")
            return

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Read the Docs themes wrap the article body in '.document'.
        content = soup.select_one('.rst-content .document') or soup.select_one('.document')
        if not content:
            print(f"No content found at {url}")
            return

        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Derive a filesystem-safe slug for the file name and nav label.
        title = soup.select_one('h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"Saved: {file_path}")

        # Walk down the nav tree along parent_path, creating any missing
        # section on the way.
        # BUG FIX: missing sections were never created before, so nested
        # pages silently landed at whatever level the walk stopped at.
        nav_path = parent_path.split('/') if parent_path else []
        current_nav = nav_structure
        for part in nav_path:
            for item in current_nav:
                if isinstance(item, dict) and part in item:
                    current_nav = item[part]
                    break
            else:
                section = {part: []}
                current_nav.append(section)
                current_nav = section[part]
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Follow sidebar links (Read the Docs vertical menu), staying on-site.
        links = soup.select('.wy-menu-vertical li a')
        for link in links:
            href = link.get('href')
            if not href or href.startswith('#') or href.startswith('mailto:') or href.startswith('javascript:'):
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                # Only links marked 'current' (inside this page's expanded
                # menu section) are nested under the current page.
                new_parent_path = os.path.join(parent_path, page_title) if 'current' in link.get('class', []) else parent_path
                scrape_page(absolute_url, new_parent_path)

    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()  # always release the browser process

    # Generate mkdocs.yml for the mkdocs-material theme.
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_url': 'https://cloudferro-docs.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'toc',
            'tables'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # requirements.txt so the generated project can be served standalone.
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("To preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")
def convert_nav_structure(nav):
    """Normalize the scraped nav tree into MkDocs ``nav`` entries.

    Each dict item is kept: string values pass through as ``{title: path}``
    leaves, list values are converted recursively into nested sections.
    Non-dict items are silently dropped.
    """
    converted = []
    for entry in nav:
        if not isinstance(entry, dict):
            continue
        for label, target in entry.items():
            if isinstance(target, str):
                converted.append({label: target})
            else:
                converted.append({label: convert_nav_structure(target)})
    return converted
if __name__ == "__main__":
    # Entry point: mirror the CloudFerro docs starting from the site root.
    start_url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(start_url)