before changing links

This commit is contained in:
govardhan
2025-06-19 09:01:18 +05:30
commit 6686208bf1
1277 changed files with 29692 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
venv/

38
clean_md_files.py Normal file
View File

@ -0,0 +1,38 @@
import os
import re
def clean_md_file(file_path):
    """Strip scraper residue from one Markdown file, rewriting it in place.

    Cleanup steps, applied in this order:

    1. Drop everything up to and including the first ``---``, if present.
    2. Drop the first line that starts with ``[Previous]`` and everything
       after it (navigation footer).
    3. Drop a *trailing* run of ``* [label]...`` bullet lines (leftover
       navigation). Bullet lists that are followed by more content are kept.
    4. Collapse runs of three or more newlines into a single blank line.
    5. Strip leading and trailing whitespace.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to clean.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Remove content before (and including) the first "---" if it exists.
    _, marker, remainder = content.partition('---')
    if marker:
        content = remainder

    # Remove navigation links at the bottom.
    content = re.sub(r'\n\[Previous\].*$', '', content, flags=re.DOTALL)

    # Remove remaining navigation bullets, but only when they are the last
    # thing in the file. Matching from the first "* [" to end-of-file would
    # also delete every ordinary link list in the body together with all the
    # text that follows it.
    content = re.sub(r'(?:\n+[ \t]*\* \[[^\]\n]*\][^\n]*)+\s*$', '', content)

    # Clean up multiple blank lines.
    content = re.sub(r'\n{3,}', '\n\n', content)

    # Trim leading/trailing whitespace while preserving content.
    content = content.strip()

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)
def process_directory(dir_path):
    """Clean every ``.md`` file found anywhere below ``dir_path``.

    The tree is traversed with ``os.walk``; each file whose name ends in
    ``.md`` is announced on stdout and handed to ``clean_md_file``.

    Args:
        dir_path: Root directory of the tree to process.
    """
    for current_dir, _subdirs, filenames in os.walk(dir_path):
        markdown_names = (name for name in filenames if name.endswith('.md'))
        for name in markdown_names:
            target = os.path.join(current_dir, name)
            print(f"Processing: {target}")
            clean_md_file(target)
if __name__ == "__main__":
    import sys

    # An optional first command-line argument selects the directory to clean.
    # Without it the script falls back to the location it was written for.
    default_docs_dir = "/Users/dhanraj/Desktop/kpme_scraper/docs"
    docs_dir = sys.argv[1] if len(sys.argv) > 1 else default_docs_dir
    process_directory(docs_dir)
    print("Completed cleaning markdown files.")

View File

View File

View File

151
cloudferro_docs_scraper.py Normal file
View File

@ -0,0 +1,151 @@
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import yaml
import markdownify
def scrape_cloudferro_docs(base_url):
    """Mirror the documentation at ``base_url`` into a local MkDocs project.

    Starting at ``base_url``, each page is downloaded, its main content is
    converted to Markdown and written below ``cloudferro-docs/docs``. Links
    found inside that content are followed when their absolute URL starts
    with ``base_url``. Afterwards ``mkdocs.yml``, ``.readthedocs.yml`` and
    ``requirements.txt`` are written into ``cloudferro-docs``.

    Pages that fail to download, or that have no recognizable content
    element, are reported on stdout and skipped. Any other error raised while
    crawling is reported once and ends the crawl; the configuration files are
    still written.

    Args:
        base_url: Root URL of the documentation. Also the prefix a link must
            have in order to be followed.
    """
    # Function-scope import: urldefrag is not among the file's top-level imports.
    from urllib.parse import urldefrag

    # Create project directories
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Navigation tree as a list of {label: path-or-sublist} entries. It must
    # be a list: pages are appended to it below and convert_nav_structure()
    # iterates over it. (A dict here makes the first append raise, which ends
    # the crawl after a single page.)
    nav_structure = []
    visited_urls = set()

    def scrape_page(url, parent_path=""):
        """Download one page, save it as Markdown and follow its links."""
        if url in visited_urls:
            return
        visited_urls.add(url)
        try:
            # Add delay to be respectful to the server
            time.sleep(1)
            # The timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            # Parse page content with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            # Extract main content (using the correct selector for CloudFerro docs)
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return

        # Convert HTML to Markdown
        md_content = markdownify.markdownify(str(content), heading_style="ATX")

        # Extract page title for file name and nav
        title = soup.select_one('.md-content h1') or soup.select_one('title')
        page_title = (title.text.strip() if title else url.split('/')[-1]).replace(' ', '_').lower()
        page_title = page_title.replace('/', '_').replace('?', '')

        # Create file path
        relative_path = os.path.join(parent_path, f"{page_title}.md")
        file_path = os.path.join(docs_dir, relative_path)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # Save Markdown content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        # Update navigation structure: walk down one section per component of
        # parent_path, creating a section the first time it is needed.
        current_nav = nav_structure
        for part in (parent_path.split('/') if parent_path else []):
            if not part:
                continue
            children = next(
                (item[part] for item in current_nav
                 if isinstance(item, dict) and isinstance(item.get(part), list)),
                None,
            )
            if children is None:
                children = []
                current_nav.append({part: children})
            current_nav = children
        current_nav.append({page_title.replace('_', ' ').title(): relative_path})

        # Find and scrape linked pages (e.g., sidebar or content links)
        for link in content.select('a[href]'):
            href = link['href']
            if href.startswith(('#', 'mailto:', 'javascript:')):
                continue
            # Resolve against the page the link was found on (relative links
            # are relative to that page, not to the site root) and drop any
            # "#fragment" so one page is not fetched once per anchor.
            absolute_url = urldefrag(urljoin(url, href)).url
            if absolute_url.startswith(base_url) and absolute_url not in visited_urls:
                new_parent_path = parent_path
                if 'section' in link.get('class', []):  # Adjust based on actual class names
                    new_parent_path = os.path.join(parent_path, page_title)
                scrape_page(absolute_url, new_parent_path)

    # Start scraping from the base URL
    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {e}")

    # Generate mkdocs.yml
    mkdocs_config = {
        'site_name': 'CloudFerro Cloud Documentation',
        'site_url': 'https://your-readthedocs-subdomain.readthedocs.io',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': ['content.code.copy', 'navigation.sections']
        },
        'nav': convert_nav_structure(nav_structure),
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences'
        ]
    }
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True)

    # Create .readthedocs.yml
    readthedocs_config = {
        'version': 2,
        'build': {
            'os': 'ubuntu-24.04',
            'tools': {'python': '3.11'}
        },
        'python': {
            'install': [{'requirements': 'requirements.txt'}]
        },
        'mkdocs': {'configuration': 'mkdocs.yml'}
    }
    with open(os.path.join(project_dir, '.readthedocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(readthedocs_config, f, allow_unicode=True)

    # Create requirements.txt
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write("mkdocs>=1.5.0\nmkdocs-material>=9.6.0\n")

    print(f"Documentation cloned to {project_dir}. Run 'mkdocs serve' to preview locally.")
    print("Push to a Git repository and import to Read the Docs to host.")
def convert_nav_structure(nav):
    """Return a cleaned copy of a navigation tree.

    Each dict entry of ``nav`` contributes one single-key dict per key/value
    pair. String values are copied as they are; any other value is treated as
    a nested navigation list and converted recursively. Entries of ``nav``
    that are not dicts are left out.

    Args:
        nav: Iterable of navigation entries.

    Returns:
        A new list of single-key dicts.
    """
    return [
        {label: target if isinstance(target, str) else convert_nav_structure(target)}
        for entry in nav
        if isinstance(entry, dict)
        for label, target in entry.items()
    ]
# Run the crawl only when executed as a script, so that importing this module
# (for example to reuse convert_nav_structure) does not start a network crawl.
if __name__ == "__main__":
    # URL of the CloudFerro documentation
    url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(url)

View File

@ -0,0 +1,169 @@
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import yaml
import markdownify
def scrape_cloudferro_docs(base_url):
    """Scrape CloudFerro documentation and save as markdown files.

    Starting at ``base_url``, each page is downloaded, its main content is
    converted to Markdown and written below ``cloudferro-docs/docs``. Links
    in the page's ``.wy-menu-vertical`` sidebar are followed when they point
    to the same host as ``base_url``. Afterwards ``mkdocs.yml`` and
    ``requirements.txt`` are written into ``cloudferro-docs`` and usage hints
    are printed.

    A failure while processing one page is reported on stdout and that page
    is skipped.

    Args:
        base_url: URL of the first page to scrape; its host decides which
            links are followed.
    """
    # Function-scope import: urldefrag is not among the file's top-level imports.
    from urllib.parse import urldefrag

    # Create project directory
    project_dir = "cloudferro-docs"
    docs_dir = os.path.join(project_dir, "docs")
    os.makedirs(docs_dir, exist_ok=True)

    # Keep track of visited URLs and navigation structure
    visited_urls = set()
    nav_structure = []

    def clean_filename(text):
        """Convert text to a clean filename"""
        return text.strip().replace(' ', '_').replace('/', '_').replace('?', '').lower()

    def scrape_page(url, parent_path=""):
        """Download one page, save it as Markdown and follow its menu links."""
        # A page is identified by its URL without the "#fragment"; otherwise
        # every sidebar link of the form "page.html#anchor" would be fetched
        # and added to the navigation as if it were a separate page.
        url = urldefrag(url).url
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")
        try:
            # Add delay to be respectful to the server
            time.sleep(1)
            # The timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract main content
            content = soup.select_one('.rst-content .document') or soup.select_one('.document')
            if not content:
                print(f"No content found at {url}")
                return

            # Get title
            title = soup.select_one('h1')
            if not title:
                title = soup.select_one('title')
            page_title = title.text.strip() if title else url.split('/')[-1]

            # Convert content to markdown
            md_content = markdownify.markdownify(str(content), heading_style="ATX")

            # Create file path
            filename = clean_filename(page_title) + '.md'
            relative_path = os.path.join(parent_path, filename)
            file_path = os.path.join(docs_dir, relative_path)

            # Ensure directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Save content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# {page_title}\n\n")
                f.write(md_content)
            print(f"Saved: {file_path}")

            # Update navigation structure: descend one section per component
            # of parent_path, creating sections that do not exist yet.
            current_section = nav_structure
            if parent_path:
                for section in parent_path.split('/'):
                    if section:
                        section_dict = next((item for item in current_section
                                             if isinstance(item, dict) and section in item), None)
                        if not section_dict:
                            section_dict = {section: []}
                            current_section.append(section_dict)
                        current_section = section_dict[section]
            current_section.append({page_title: relative_path})

            # Find and process links
            menu = soup.select_one('.wy-menu-vertical')
            if menu:
                for link in menu.select('a'):
                    href = link.get('href')
                    if not href or href.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
                        continue
                    next_url = urldefrag(urljoin(url, href)).url
                    if next_url in visited_urls:
                        # Already handled; skip the section lookup below.
                        continue
                    if urlparse(next_url).netloc == urlparse(base_url).netloc:
                        parent = link.find_parent('li', class_='toctree-l2')
                        next_parent_path = parent_path
                        if parent:
                            # NOTE(review): this looks for a toctree-l1 item that is a
                            # *sibling* of the toctree-l2 item. If the site nests l2
                            # items inside their l1 item, this never matches and pages
                            # are never grouped into sections - confirm against the
                            # actual sidebar markup.
                            section = parent.find_previous_sibling('li', class_='toctree-l1')
                            if section:
                                section_name = clean_filename(section.get_text().strip())
                                next_parent_path = os.path.join(parent_path, section_name)
                        scrape_page(next_url, next_parent_path)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

    try:
        scrape_page(base_url)
    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    # Generate mkdocs.yml
    mkdocs_config = {
        'site_name': 'CloudFerro Documentation',
        'site_description': 'CloudFerro Documentation Mirror',
        'theme': {
            'name': 'material',
            'palette': {
                'primary': 'blue',
                'accent': 'blue'
            },
            'features': [
                'navigation.instant',
                'navigation.tracking',
                'navigation.tabs',
                'navigation.sections',
                'navigation.expand',
                'navigation.indexes',
                'toc.integrate',
                'content.code.copy'
            ]
        },
        'nav': nav_structure,
        'markdown_extensions': [
            'admonition',
            'pymdownx.details',
            'pymdownx.superfences',
            'pymdownx.highlight',
            'pymdownx.inlinehilite',
            'pymdownx.snippets',
            'pymdownx.tabbed',
            'footnotes',
            'toc',
            'tables',
            'attr_list'
        ]
    }
    # sort_keys=False keeps the keys in the order written above.
    with open(os.path.join(project_dir, 'mkdocs.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(mkdocs_config, f, allow_unicode=True, sort_keys=False)

    # Create requirements.txt
    requirements = [
        "mkdocs>=1.5.0",
        "mkdocs-material>=9.6.0",
        "pymdown-extensions>=10.7",
    ]
    with open(os.path.join(project_dir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements))

    print("\nScraping completed!")
    print(f"Documentation has been generated in the {project_dir} directory")
    print("\nTo preview the documentation locally:")
    print("1. Install requirements: pip install -r cloudferro-docs/requirements.txt")
    print("2. Run local server: cd cloudferro-docs && mkdocs serve")
if __name__ == "__main__":
    # Script entry point: crawl starting from the latest English docs.
    url = 'https://docs.cloudferro.com/en/latest/'
    scrape_cloudferro_docs(base_url=url)

BIN
docs/_images/01.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 78 KiB

BIN
docs/_images/02.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
docs/_images/03.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

BIN
docs/_images/04.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
docs/_images/05.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

BIN
docs/_images/06.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

BIN
docs/_images/07.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

BIN
docs/_images/08.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
docs/_images/09.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

BIN
docs/_images/10.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

BIN
docs/_images/11.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

BIN
docs/_images/12.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

BIN
docs/_images/13.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

BIN
docs/_images/14.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.5 KiB

BIN
docs/_images/15.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 161 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 106 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

BIN
docs/_images/accessvm2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 109 KiB

BIN
docs/_images/accessvm3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 74 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 55 KiB

BIN
docs/_images/accessvm5.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 642 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.4 KiB

BIN
docs/_images/add_ticket.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

BIN
docs/_images/all_nodes.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 827 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 74 KiB

BIN
docs/_images/apache_ip.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 387 KiB

BIN
docs/_images/api_access.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
docs/_images/apply_yaml.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

BIN
docs/_images/boto1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

BIN
docs/_images/boto2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.5 KiB

BIN
docs/_images/boto3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

BIN
docs/_images/boto4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

BIN
docs/_images/c1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

Some files were not shown because too many files have changed in this diff Show More