# Listing metadata from the file viewer: 39 lines, 1.2 KiB, Python
import os
|
|
import re
|
|
|
|
# Everything up to and including the first line that consists solely of three
# or more dashes.  Requiring the dashes to fill a whole line keeps table
# separator rows such as "|---|---|" (or any inline "---") from being treated
# as the header boundary, which would delete the document body above them.
_HEADER_RE = re.compile(r'\A.*?^-{3,}[ \t]*$', re.DOTALL | re.MULTILINE)

# From the first newline immediately followed by "[Previous]" to the end.
_PREVIOUS_NAV_RE = re.compile(r'\n\[Previous\].*$', re.DOTALL)

# From the first newline immediately followed by "* [" (with a "]" somewhere
# after it) to the end.  NOTE: this truncates at the *first* such bullet, so
# everything after it is dropped, not only a trailing list.
_BULLET_NAV_RE = re.compile(r'\n\* \[.*?\].*$', re.DOTALL)

# Runs of three or more consecutive newlines.
_BLANK_RUN_RE = re.compile(r'\n{3,}')


def _clean_md_text(content):
    """Return *content* with the header, navigation tail and extra blank lines removed."""
    # The pattern is anchored at the start of the text, so at most one header
    # is removed; text without a dash-only line is left untouched.
    content = _HEADER_RE.sub('', content, count=1)
    content = _PREVIOUS_NAV_RE.sub('', content)
    content = _BULLET_NAV_RE.sub('', content)
    # Collapse 3+ newlines into exactly one blank line.
    content = _BLANK_RUN_RE.sub('\n\n', content)
    return content.strip()


def clean_md_file(file_path):
    """Clean a Markdown file in place.

    The file is read as UTF-8, then:

    1. everything up to and including the first dash-only line (``---``)
       is removed, if such a line exists;
    2. everything from the first line starting with ``[Previous]`` is removed;
    3. everything from the first line starting with ``* [`` is removed;
    4. runs of blank lines are collapsed to a single blank line;
    5. leading and trailing whitespace is stripped.

    The result is written back to the same path (without a trailing newline).

    NOTE: the operation is not idempotent when the remaining body itself
    contains a dash-only line; a second run would treat that line as the
    header delimiter.

    Args:
        file_path: Path of the Markdown file to rewrite.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    cleaned = _clean_md_text(content)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(cleaned)
|
|
|
|
def process_directory(dir_path):
    """Clean every ``.md`` file under *dir_path*, descending into subdirectories.

    Each matching file's path is printed and then handed to
    ``clean_md_file``, which rewrites the file in place.

    Args:
        dir_path: Root directory to walk.
    """
    for current_dir, _subdirs, names in os.walk(dir_path):
        for name in names:
            # Only Markdown files are touched; the suffix check is case-sensitive.
            if not name.endswith('.md'):
                continue
            file_path = os.path.join(current_dir, name)
            print(f"Processing: {file_path}")
            clean_md_file(file_path)
|
|
|
|
if __name__ == "__main__":
    import sys

    # Directory to clean: the first command-line argument when given,
    # otherwise the original hard-coded location.
    docs_dir = sys.argv[1] if len(sys.argv) > 1 else "/Users/dhanraj/Desktop/kpme_scraper/docs"

    # os.walk() silently yields nothing for a missing directory, which would
    # otherwise end in a misleading "Completed" message with no work done.
    if not os.path.isdir(docs_dir):
        sys.exit(f"Not a directory: {docs_dir}")

    process_directory(docs_dir)
    print("Completed cleaning markdown files.")
|