# Listing metadata from the original paste: 151 lines, 5.5 KiB, Python.
import os
import time
from urllib.parse import urldefrag, urljoin, urlparse

import markdownify
import requests
import yaml
from bs4 import BeautifulSoup


def scrape_cloudferro_docs(base_url):
    """Mirror the documentation site at *base_url* into a local MkDocs project.

    Starting at ``base_url``, every reachable page whose URL starts with
    ``base_url`` is downloaded, its main content is converted to Markdown and
    written below ``cloudferro-docs/docs``.  Afterwards ``mkdocs.yml``,
    ``.readthedocs.yml`` and ``requirements.txt`` are generated next to the
    ``docs`` directory.

    Args:
        base_url: Root URL of the documentation.  It is both the first page
            fetched and the prefix a link must have to be followed.

    Returns:
        None.  Results are written to disk and progress/errors are printed.
    """
    request_timeout = 30  # seconds; without a timeout a stalled server hangs the crawl

    # Create project directories
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Navigation is a list of single-key mappings: ``{title: path}`` for a
    # page and ``{title: [children]}`` for a section.  It must be a list
    # because entries are appended in crawl order.
    nav_structure = []
    visited_urls = set()

    def nav_children(parent_path):
        """Return the nav list for *parent_path*, creating sections as needed."""
        current_nav = nav_structure
        for part in (parent_path.split(os.sep) if parent_path else []):
            section_title = part.replace('_', ' ').title()
            for item in current_nav:
                if isinstance(item, dict) and isinstance(item.get(section_title), list):
                    current_nav = item[section_title]
                    break
            else:
                # No section with this title yet: create it so the page is
                # not silently attached to the wrong level.
                children = []
                current_nav.append({section_title: children})
                current_nav = children
        return current_nav

    def scrape_page(url, parent_path=""):
        """Fetch one page, save it as Markdown and return its outgoing links.

        Returns a list of ``(absolute_url, parent_path)`` pairs still to be
        visited, in document order.  The list is empty when the page was
        already visited, could not be fetched, or has no main content.
        """
        if url in visited_urls:
            return []
        visited_urls.add(url)

        try:
            # Add delay to be respectful to the server
            time.sleep(1)
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error processing {url}: {e}")
            return []

        # Parse page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract main content (using the correct selector for CloudFerro docs)
        content = soup.select_one('.rst-content .document') or soup.select_one('.document')
        if not content:
            print(f"No content found at {url}")
            return []

        # Convert HTML to Markdown
        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Extract page title for file name and nav
        title = soup.select_one('.md-content h1') or soup.select_one('title')
        raw_title = (title.text.strip() if title else "") or url.rstrip('/').split('/')[-1]
        page_title = raw_title.replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')
        # An empty name would produce a file called just ".md".
        page_title = page_title or "index"

        # Create file path
        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # Save Markdown content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        # Update navigation structure (forward slashes regardless of platform)
        nav_children(parent_path).append(
            {page_title.replace('_', ' ').title(): relative_path.replace(os.sep, '/')}
        )

        # Collect linked pages (links inside the main content only)
        discovered = []
        for link in content.select('a[href]'):
            href = link['href']
            if href.startswith(('#', 'mailto:', 'javascript:')):
                continue
            # Resolve against the page the link appears on (not the site
            # root) and drop the fragment so "page.html#a" and "page.html#b"
            # are recognised as the same page.
            absolute_url, _fragment = urldefrag(urljoin(url, href))
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                new_parent_path = parent_path
                if 'section' in link.get('class', []):  # Adjust based on actual class names
                    new_parent_path = os.path.join(parent_path, page_title)
                discovered.append((absolute_url, new_parent_path))
        return discovered

    # Start scraping from the base URL.  An explicit stack replaces recursion
    # so a long chain of pages cannot exhaust the interpreter's recursion
    # limit; pushing links reversed keeps the depth-first, document-order
    # traversal.
    pending = [(base_url, "")]
    try:
        while pending:
            page_url, page_parent = pending.pop()
            pending.extend(reversed(scrape_page(page_url, page_parent)))
    except Exception as e:
        # Top-level boundary: report the failure but still emit the project
        # files for whatever was scraped so far.
        print(f"Error during scraping: {e}")

    # Generate mkdocs.yml
    mkdocs_config = {
        'site_name': 'CloudFerro Cloud Documentation',
        'site_url': 'https://your-readthedocs-subdomain.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': ['content.code.copy', 'navigation.sections']
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences'
        ]
    }

    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True)

    # Create .readthedocs.yml
    readthedocs_config = {
        'version': 2,
        'build': {
            'os': 'ubuntu-24.04',
            'tools': {'python': '3.11'}
        },
        'python': {
            'install': [{'requirements': 'requirements.txt'}]
        },
        'mkdocs': {'configuration': 'mkdocs.yml'}
    }

    with open(os.path.join(project_dir, '.readthedocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(readthedocs_config, f, allow_unicode=True)

    # Create requirements.txt
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write("mkdocs>=1.5.0\nmkdocs-material>=9.6.0\n")

    print(f"Documentation cloned to {project_dir}. Run 'mkdocs serve' to preview locally.")
    print("Push to a Git repository and import to Read the Docs to host.")


def convert_nav_structure(nav):
    """Flatten *nav* into a list of single-key mappings for the MkDocs ``nav``.

    Each dict found in *nav* is split so that every key becomes its own
    one-entry mapping.  String values are kept as they are; any other value
    is treated as a nested nav and converted recursively.  Entries of *nav*
    that are not dicts are dropped.

    Args:
        nav: Iterable of nav entries.  ``None`` or an empty container yields
            an empty list.

    Returns:
        A new list of ``{title: path}`` / ``{title: [children]}`` mappings.
    """
    result = []
    # ``or ()`` lets a missing nav (None) behave like an empty one.
    for item in nav or ():
        if not isinstance(item, dict):
            continue
        for key, value in item.items():
            if isinstance(value, str):
                result.append({key: value})
            else:
                result.append({key: convert_nav_structure(value)})
    return result


# URL of the CloudFerro documentation
url = 'https://docs.cloudferro.com/en/latest/'

if __name__ == "__main__":
    # Crawl only when run as a script, so importing this module does not
    # trigger network requests and file writes.
    scrape_cloudferro_docs(url)