before changing links

This commit is contained in:
govardhan
2025-06-19 09:01:18 +05:30
commit 6686208bf1
1277 changed files with 29692 additions and 0 deletions

173
extract_docs_brave.py Normal file
View File

@ -0,0 +1,173 @@
import time
import os
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import yaml
from bs4 import BeautifulSoup
import markdownify
def scrape_cloudferro_docs(base_url):
    """Crawl the CloudFerro documentation site and emit a MkDocs project.

    Starting from ``base_url``, recursively follows sidebar links (Read the
    Docs theme selector ``.wy-menu-vertical li a``), converts each page's main
    content to Markdown, and writes a ``cloudferro-docs/`` directory containing
    ``docs/*.md``, a generated ``mkdocs.yml`` and a ``requirements.txt``.

    Parameters
    ----------
    base_url : str
        Root URL of the documentation; only links under this prefix are followed.

    Returns
    -------
    None. All output is written to disk; progress and errors go to stdout.
    """
    # Set up Brave options (Brave is Chromium-based, so chromedriver works).
    chrome_options = Options()
    # Specify Brave binary location (macOS path — adjust on other platforms).
    chrome_options.binary_location = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
    chrome_options.add_argument("--headless")  # Run in headless mode

    # Initialize the WebDriver; bail out early if the driver cannot start.
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        return

    # Create project directory layout: cloudferro-docs/docs/
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Navigation structure accumulated while crawling.
    # BUG FIX: this was `{}` (a dict), but scrape_page() calls
    # `current_nav.append(...)` on it, which would raise AttributeError on the
    # very first page. It must be a list of {title: target} entries — the
    # exact shape convert_nav_structure() consumes.
    nav_structure = []
    visited_urls = set()

    def scrape_page(url, parent_path=""):
        # Skip pages we have already saved (the sidebar repeats links heavily).
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            driver.get(url)
            time.sleep(2)  # Wait for page to load (no explicit-wait condition used)
        except Exception as e:
            print(f"Error loading {url}: {e}")
            return

        # Parse the rendered page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract main content (selectors for the CloudFerro / RTD theme).
        content = soup.select_one('.rst-content .document') or soup.select_one('.document')
        if not content:
            print(f"No content found at {url}")
            return

        # Convert HTML to Markdown.
        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Derive a filesystem-safe page title for the file name and nav entry.
        title = soup.select_one('h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        # Save the Markdown content under docs/, mirroring the nav hierarchy.
        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"Saved: {file_path}")

        # Update the navigation structure: walk down to the list that
        # corresponds to parent_path, then append this page's entry.
        nav_path = parent_path.split('/') if parent_path else []
        current_nav = nav_structure
        for part in nav_path:
            for item in current_nav:
                if isinstance(item, dict) and part in item:
                    current_nav = item[part]
                    break
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Find and scrape linked pages (sidebar links only, same-site only).
        links = soup.select('.wy-menu-vertical li a')  # CloudFerro uses Read the Docs theme
        for link in links:
            href = link.get('href')
            if not href or href.startswith('#') or href.startswith('mailto:') or href.startswith('javascript:'):
                continue
            absolute_url = urljoin(base_url, href)
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                # Nest under the current page only when the link is marked
                # 'current' (i.e. a child of the page being viewed).
                new_parent_path = os.path.join(parent_path, page_title) if 'current' in link.get('class', []) else parent_path
                scrape_page(absolute_url, new_parent_path)

    # Start crawling from the base URL; always release the browser.
    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()

    # Generate mkdocs.yml from the collected navigation tree.
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_url': 'https://cloudferro-docs.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'toc',
            'tables'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt for building the generated site.
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("To preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")
def convert_nav_structure(nav):
    """Normalise the crawled navigation tree into MkDocs ``nav:`` form.

    Each entry is expected to be a ``{title: target}`` mapping where the
    target is either a Markdown path (str) or a nested list of the same
    shape. Non-dict entries are silently dropped.
    """
    converted = []
    for entry in nav:
        if not isinstance(entry, dict):
            continue  # ignore anything that is not a {title: target} mapping
        for title, target in entry.items():
            # Leaf pages stay as-is; sub-sections are converted recursively.
            value = target if isinstance(target, str) else convert_nav_structure(target)
            converted.append({title: value})
    return converted
if __name__ == "__main__":
    # Script entry point: crawl the latest CloudFerro docs and build the
    # MkDocs project in ./cloudferro-docs.
    start_url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(start_url)