first commit

This commit is contained in:
govardhan
2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions

411
run_parallel_scraper.py Normal file
View File

@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Run Parallel Scraper - Execution Script
Various strategies for scraping ZaubaCorp data efficiently
"""
import asyncio
import os
import sys
import time
import argparse
from datetime import datetime
from typing import Dict, Any, Optional
import logging
# Import our modules
from zaubacorp_parallel_scraper import ZaubaCorpParallelScraper
from parallel_config import ParallelConfig, print_config_summary, validate_config
class ScrapingStrategy:
    """Different scraping strategies for various use cases.

    Every strategy is an async ``staticmethod`` that builds a config via
    ``ParallelConfig``, drives a ``ZaubaCorpParallelScraper`` run, and returns
    nothing — results are written to each strategy's output directory by the
    scraper itself.
    """

    # Total number of listing pages on ZaubaCorp (also hard-coded upstream).
    TOTAL_PAGES = 90769

    @staticmethod
    async def quick_sample(num_pages: int = 100):
        """Quick sample scraping for testing.

        Args:
            num_pages: number of listing pages to fetch (basic info only).
        """
        print(f"\n🚀 QUICK SAMPLE STRATEGY - {num_pages} pages")
        print("=" * 60)
        # Conservative profile keeps the sample run gentle on the site.
        config = ParallelConfig.get_config(
            'conservative',
            max_workers=5,
            batch_size=20,
            output_dir='sample_output'
        )
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        await scraper.scrape_all_companies(
            start_page=1,
            end_page=num_pages,
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def full_basic_scrape():
        """Full scraping of all companies list (basic info only)."""
        print("\n🔥 FULL BASIC SCRAPE STRATEGY - All 90,769 pages")
        print("=" * 60)
        # Aggressive profile: high worker count for the long full crawl.
        config = ParallelConfig.get_config(
            'aggressive',
            max_workers=25,
            batch_size=250,
            output_dir='full_basic_output'
        )
        print_config_summary(config)
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        await scraper.scrape_all_companies(
            start_page=1,
            end_page=None,  # None means "all pages" to the scraper
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def detailed_scrape(max_pages: int = 1000):
        """Detailed scraping including company detail pages.

        Args:
            max_pages: number of listing pages to process; each company found
                also gets its detail page scraped.
        """
        print(f"\n🔍 DETAILED SCRAPE STRATEGY - {max_pages} pages with details")
        print("=" * 60)
        config = ParallelConfig.get_config(
            'balanced',
            max_workers=10,
            batch_size=50,
            output_dir='detailed_output'
        )
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        await scraper.scrape_all_companies(
            start_page=1,
            end_page=max_pages,
            batch_size=config['batch_size'],
            scrape_details=True  # This will scrape company detail pages
        )

    @staticmethod
    async def resume_scrape(failed_pages_file: str):
        """Resume scraping from a JSON list of previously failed page numbers.

        Args:
            failed_pages_file: path to a JSON file containing a list of page
                numbers to retry.
        """
        print(f"\n🔄 RESUME SCRAPE STRATEGY - From {failed_pages_file}")
        print("=" * 60)
        import json
        # Load failed pages; bail out with a message on any read/parse error.
        try:
            with open(failed_pages_file, 'r') as f:
                failed_pages = json.load(f)
            print(f"Found {len(failed_pages)} failed pages to retry")
        except Exception as e:
            print(f"Error loading failed pages file: {e}")
            return
        config = ParallelConfig.get_config(
            'conservative',  # More conservative for retries
            max_workers=8,
            batch_size=25,
            output_dir='resume_output'
        )
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        # Process failed pages in smaller batches.
        batch_size = 25
        for i in range(0, len(failed_pages), batch_size):
            batch = failed_pages[i:i + batch_size]
            print(f"Processing retry batch {i//batch_size + 1}")
            # session=None: the batch method creates its own HTTP session.
            await scraper.scrape_pages_batch(
                session=None,
                page_numbers=batch,
                batch_num=i//batch_size + 1,
                scrape_details=False
            )

    @staticmethod
    async def segmented_scrape(segments: int = 10):
        """Divide scraping into segments for distributed processing.

        Args:
            segments: number of contiguous page ranges; each gets its own
                output directory (``segment_N_output``).
        """
        print(f"\n📊 SEGMENTED SCRAPE STRATEGY - {segments} segments")
        print("=" * 60)
        total_pages = ScrapingStrategy.TOTAL_PAGES
        pages_per_segment = total_pages // segments
        for segment in range(segments):
            start_page = segment * pages_per_segment + 1
            end_page = (segment + 1) * pages_per_segment
            if segment == segments - 1:  # Last segment gets remaining pages
                end_page = total_pages
            print(f"\n--- Segment {segment + 1}/{segments}: Pages {start_page}-{end_page} ---")
            config = ParallelConfig.get_config(
                'balanced',
                output_dir=f'segment_{segment + 1}_output'
            )
            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )
            await scraper.scrape_all_companies(
                start_page=start_page,
                end_page=end_page,
                batch_size=config['batch_size'],
                scrape_details=False
            )
            print(f"Completed segment {segment + 1}")

    @staticmethod
    async def smart_adaptive_scrape():
        """Adaptive scraping that adjusts workers/batch size per phase.

        Each phase scrapes a small page range, measures the success rate, and
        scales the settings up on success or down on failure.
        """
        print("\n🧠 SMART ADAPTIVE SCRAPE STRATEGY")
        print("=" * 60)
        # Start with conservative settings and tune between phases.
        current_workers = 5
        current_batch_size = 50
        success_threshold = 0.8  # 80% success rate required to scale up
        total_processed = 0
        current_page = 1
        total_pages = ScrapingStrategy.TOTAL_PAGES
        while current_page <= total_pages:
            print(f"\nAdaptive phase: workers={current_workers}, batch_size={current_batch_size}")
            config = ParallelConfig.get_config(
                'balanced',
                max_workers=current_workers,
                batch_size=current_batch_size,
                output_dir='adaptive_output'
            )
            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )
            # Process a test batch (twice the batch size, clamped to the end).
            test_batch_size = min(current_batch_size * 2, total_pages - current_page + 1)
            end_page = min(current_page + test_batch_size - 1, total_pages)
            start_time = time.time()
            await scraper.scrape_all_companies(
                start_page=current_page,
                end_page=end_page,
                batch_size=current_batch_size,
                scrape_details=False
            )
            elapsed = time.time() - start_time
            # Guard against ZeroDivisionError when nothing was attempted or
            # when the phase finished in (effectively) zero time.
            attempted = scraper.stats['pages_processed'] + scraper.stats['failed_pages']
            success_rate = scraper.stats['pages_processed'] / attempted if attempted else 0.0
            processing_speed = scraper.stats['pages_processed'] / elapsed if elapsed > 0 else 0.0
            print(f"Success rate: {success_rate:.2%}, Speed: {processing_speed:.1f} pages/sec")
            # Adaptive adjustments, clamped to sane bounds.
            if success_rate >= success_threshold:
                # Increase performance if successful
                current_workers = min(current_workers + 2, 30)
                current_batch_size = min(current_batch_size + 25, 300)
                print("📈 Increasing performance settings")
            else:
                # Decrease performance if too many failures
                current_workers = max(current_workers - 1, 3)
                current_batch_size = max(current_batch_size - 10, 25)
                print("📉 Decreasing performance settings")
            current_page = end_page + 1
            total_processed += test_batch_size
class ScraperRunner:
    """Main runner class for executing scraping strategies."""

    # Maps each strategy name to the single optional kwarg it accepts;
    # strategies absent from this table take no arguments.
    _STRATEGY_ARG = {
        'quick': 'pages',
        'detailed': 'pages',
        'resume': 'failed_file',
        'segmented': 'segments',
    }

    def __init__(self):
        # Strategy name -> async callable (all are staticmethods, no binding
        # issues).
        self.available_strategies = {
            'quick': ScrapingStrategy.quick_sample,
            'full': ScrapingStrategy.full_basic_scrape,
            'detailed': ScrapingStrategy.detailed_scrape,
            'resume': ScrapingStrategy.resume_scrape,
            'segmented': ScrapingStrategy.segmented_scrape,
            'adaptive': ScrapingStrategy.smart_adaptive_scrape
        }

    def list_strategies(self):
        """Print all available strategies to stdout."""
        print("\nAvailable Scraping Strategies:")
        print("=" * 50)
        print("1. quick - Quick sample (100 pages)")
        print("2. full - Full basic scrape (all pages)")
        print("3. detailed - Detailed scrape with company pages")
        print("4. resume - Resume from failed pages")
        print("5. segmented - Segmented scraping")
        print("6. adaptive - Smart adaptive scraping")
        print("=" * 50)

    async def run_strategy(self, strategy_name: str, **kwargs):
        """Run a specific strategy by name.

        Args:
            strategy_name: key into ``available_strategies``.
            **kwargs: optional per-strategy argument (see ``_STRATEGY_ARG``);
                unrelated keys are ignored.
        """
        if strategy_name not in self.available_strategies:
            print(f"Error: Unknown strategy '{strategy_name}'")
            self.list_strategies()
            return
        strategy_func = self.available_strategies[strategy_name]
        print(f"\n🎯 Starting strategy: {strategy_name.upper()}")
        print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        start_time = time.time()
        completed = False
        try:
            # Pass the strategy's single supported kwarg positionally when
            # the caller supplied it; otherwise run with defaults.
            arg_key = self._STRATEGY_ARG.get(strategy_name)
            if arg_key is not None and arg_key in kwargs:
                await strategy_func(kwargs[arg_key])
            else:
                await strategy_func()
            completed = True
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
        except Exception as e:
            print(f"\n❌ Error during scraping: {e}")
            logging.error(f"Strategy {strategy_name} failed: {e}")
        duration = time.time() - start_time
        # Only claim success when the strategy actually ran to completion.
        if completed:
            print(f"\n✅ Strategy completed")
        else:
            print(f"\n🛑 Strategy did not complete")
        print(f"⏱️ Total duration: {duration/3600:.1f} hours")
        print(f"🏁 End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
def main():
    """Main function with command line interface.

    Parses CLI arguments and dispatches to ``ScraperRunner``. ``--list``
    works on its own; otherwise a strategy is required.
    """
    parser = argparse.ArgumentParser(description='ZaubaCorp Parallel Scraper Runner')
    # strategy is optional (nargs='?') so that `--list` is usable by itself;
    # with a required positional, argparse would error out before --list
    # could ever be checked.
    parser.add_argument('strategy', nargs='?',
                        choices=['quick', 'full', 'detailed', 'resume', 'segmented', 'adaptive'],
                        help='Scraping strategy to use')
    parser.add_argument('--pages', type=int, help='Number of pages for quick/detailed strategies')
    parser.add_argument('--segments', type=int, default=10, help='Number of segments for segmented strategy')
    parser.add_argument('--failed-file', type=str, help='Failed pages file for resume strategy')
    parser.add_argument('--list', action='store_true', help='List available strategies')
    args = parser.parse_args()
    runner = ScraperRunner()
    if args.list:
        runner.list_strategies()
        return
    if args.strategy is None:
        # Mirror argparse's usual behavior for a missing required argument.
        parser.error('strategy is required unless --list is given')
    # Validate arguments
    if args.strategy == 'resume' and not args.failed_file:
        print("Error: --failed-file required for resume strategy")
        return
    if args.strategy == 'resume' and not os.path.exists(args.failed_file):
        print(f"Error: Failed file '{args.failed_file}' not found")
        return
    # Prepare kwargs; compare against None so an explicit `--pages 0` is
    # still forwarded rather than silently dropped.
    kwargs = {}
    if args.pages is not None:
        kwargs['pages'] = args.pages
    if args.segments is not None:
        kwargs['segments'] = args.segments
    if args.failed_file:
        kwargs['failed_file'] = args.failed_file
    # Run the strategy
    print("🚀 ZaubaCorp Parallel Scraper Runner")
    print("=" * 50)
    try:
        asyncio.run(runner.run_strategy(args.strategy, **kwargs))
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n💥 Unexpected error: {e}")
if __name__ == "__main__":
    # If no command line arguments, run interactive mode; otherwise hand off
    # to the argparse-based CLI in main().
    if len(sys.argv) == 1:
        def _ask_int(prompt: str, default: int) -> int:
            """Prompt for an integer, falling back to *default* on blank or
            non-numeric input instead of crashing with ValueError."""
            raw = input(prompt).strip()
            if not raw:
                return default
            try:
                return int(raw)
            except ValueError:
                print(f"Invalid number '{raw}', using default {default}")
                return default

        print("🚀 ZaubaCorp Parallel Scraper - Interactive Mode")
        print("=" * 60)
        runner = ScraperRunner()
        runner.list_strategies()
        choice = input("\nSelect strategy (1-6): ").strip()
        # Menu number -> strategy name (mirrors list_strategies()).
        strategy_map = {
            '1': 'quick',
            '2': 'full',
            '3': 'detailed',
            '4': 'resume',
            '5': 'segmented',
            '6': 'adaptive'
        }
        if choice in strategy_map:
            strategy = strategy_map[choice]
            kwargs = {}
            if strategy == 'quick':
                kwargs['pages'] = _ask_int("Number of pages (default 100): ", 100)
            elif strategy == 'detailed':
                kwargs['pages'] = _ask_int("Number of pages (default 1000): ", 1000)
            elif strategy == 'resume':
                failed_file = input("Path to failed pages file: ").strip()
                if not failed_file or not os.path.exists(failed_file):
                    print("Invalid file path")
                    sys.exit(1)
                kwargs['failed_file'] = failed_file
            elif strategy == 'segmented':
                kwargs['segments'] = _ask_int("Number of segments (default 10): ", 10)
            print(f"\n🎯 Running strategy: {strategy}")
            asyncio.run(runner.run_strategy(strategy, **kwargs))
        else:
            print("Invalid choice")
    else:
        main()