#!/usr/bin/env python3
"""
Run Parallel Scraper - Execution Script

Various strategies for scraping ZaubaCorp data efficiently
"""

# Standard library
import argparse
import asyncio
import logging
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Optional

# Import our modules
from zaubacorp_parallel_scraper import ZaubaCorpParallelScraper
from parallel_config import ParallelConfig, print_config_summary, validate_config
class ScrapingStrategy:
    """Different scraping strategies for various use cases.

    Each strategy is a stateless coroutine (exposed as a staticmethod) that
    builds a settings dict via ``ParallelConfig.get_config``, constructs a
    ``ZaubaCorpParallelScraper`` and drives it over some page range.
    """

    # Total number of listing pages on ZaubaCorp.  Hard-coded site constant,
    # previously duplicated in three methods — TODO confirm it is current.
    TOTAL_PAGES = 90769

    @staticmethod
    async def quick_sample(num_pages: int = 100):
        """Quick sample scraping for testing.

        Args:
            num_pages: Number of listing pages to scrape (basic info only).
        """
        print(f"\n🚀 QUICK SAMPLE STRATEGY - {num_pages} pages")
        print("=" * 60)

        # Conservative profile with a small worker pool — this is a smoke test.
        config = ParallelConfig.get_config(
            'conservative',
            max_workers=5,
            batch_size=20,
            output_dir='sample_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=num_pages,
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def full_basic_scrape():
        """Full scraping of all companies list (basic info only)."""
        print("\n🔥 FULL BASIC SCRAPE STRATEGY - All 90,769 pages")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'aggressive',
            max_workers=25,
            batch_size=250,
            output_dir='full_basic_output'
        )

        print_config_summary(config)

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=None,  # None means "every page" to the scraper
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def detailed_scrape(max_pages: int = 1000):
        """Detailed scraping including company detail pages.

        Args:
            max_pages: Number of listing pages to scrape; each listed company's
                detail page is fetched as well (scrape_details=True).
        """
        print(f"\n🔍 DETAILED SCRAPE STRATEGY - {max_pages} pages with details")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'balanced',
            max_workers=10,
            batch_size=50,
            output_dir='detailed_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=max_pages,
            batch_size=config['batch_size'],
            scrape_details=True  # This will scrape company detail pages
        )

    @staticmethod
    async def resume_scrape(failed_pages_file: str):
        """Resume scraping from failed pages.

        Args:
            failed_pages_file: Path to a JSON file containing a list of page
                numbers that previously failed.
        """
        print(f"\n🔄 RESUME SCRAPE STRATEGY - From {failed_pages_file}")
        print("=" * 60)

        import json

        # Load failed pages; bail out (do not crash) on any load problem.
        try:
            with open(failed_pages_file, 'r') as f:
                failed_pages = json.load(f)
        except Exception as e:
            print(f"Error loading failed pages file: {e}")
            return

        # Fix: validate the payload shape before iterating/slicing it.
        if not isinstance(failed_pages, list):
            print(f"Error loading failed pages file: expected a JSON list, got {type(failed_pages).__name__}")
            return

        print(f"Found {len(failed_pages)} failed pages to retry")
        if not failed_pages:
            return  # nothing to retry

        config = ParallelConfig.get_config(
            'conservative',  # More conservative for retries
            max_workers=8,
            batch_size=25,
            output_dir='resume_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        # Process failed pages in smaller batches
        batch_size = 25
        for i in range(0, len(failed_pages), batch_size):
            batch = failed_pages[i:i + batch_size]
            print(f"Processing retry batch {i//batch_size + 1}")

            await scraper.scrape_pages_batch(
                session=None,  # Will create its own session
                page_numbers=batch,
                batch_num=i//batch_size + 1,
                scrape_details=False
            )

    @staticmethod
    async def segmented_scrape(segments: int = 10):
        """Divide scraping into segments for distributed processing.

        Args:
            segments: Number of roughly-equal page ranges to process in turn.
        """
        print(f"\n📊 SEGMENTED SCRAPE STRATEGY - {segments} segments")
        print("=" * 60)

        total_pages = ScrapingStrategy.TOTAL_PAGES
        # Fix: clamp so pages_per_segment can never be 0 (which previously
        # produced empty/negative page ranges when segments > total_pages).
        segments = max(1, min(segments, total_pages))
        pages_per_segment = total_pages // segments

        for segment in range(segments):
            start_page = segment * pages_per_segment + 1
            end_page = (segment + 1) * pages_per_segment

            if segment == segments - 1:  # Last segment gets remaining pages
                end_page = total_pages

            print(f"\n--- Segment {segment + 1}/{segments}: Pages {start_page}-{end_page} ---")

            config = ParallelConfig.get_config(
                'balanced',
                output_dir=f'segment_{segment + 1}_output'
            )

            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )

            await scraper.scrape_all_companies(
                start_page=start_page,
                end_page=end_page,
                batch_size=config['batch_size'],
                scrape_details=False
            )

            print(f"Completed segment {segment + 1}")

    @staticmethod
    async def smart_adaptive_scrape():
        """Adaptive scraping that adjusts worker/batch settings by success rate."""
        print("\n🧠 SMART ADAPTIVE SCRAPE STRATEGY")
        print("=" * 60)

        # Start with conservative settings and ramp up/down between phases.
        current_workers = 5
        current_batch_size = 50
        success_threshold = 0.8  # 80% success rate required to scale up

        total_processed = 0
        current_page = 1
        total_pages = ScrapingStrategy.TOTAL_PAGES

        while current_page <= total_pages:
            print(f"\nAdaptive phase: workers={current_workers}, batch_size={current_batch_size}")

            config = ParallelConfig.get_config(
                'balanced',
                max_workers=current_workers,
                batch_size=current_batch_size,
                output_dir='adaptive_output'
            )

            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )

            # Process a test batch (two batches' worth, capped at the end).
            test_batch_size = min(current_batch_size * 2, total_pages - current_page + 1)
            end_page = min(current_page + test_batch_size - 1, total_pages)

            start_time = time.time()
            await scraper.scrape_all_companies(
                start_page=current_page,
                end_page=end_page,
                batch_size=current_batch_size,
                scrape_details=False
            )
            end_time = time.time()

            # Calculate success rate.  Fix: guard both divisions — the phase can
            # end with zero attempted pages (stats counters both 0) or in ~0s.
            # NOTE(review): assumes scraper.stats carries integer counters
            # 'pages_processed' and 'failed_pages' — confirm against scraper.
            attempted = scraper.stats['pages_processed'] + scraper.stats['failed_pages']
            success_rate = (scraper.stats['pages_processed'] / attempted) if attempted else 0.0
            elapsed = max(end_time - start_time, 1e-9)
            processing_speed = scraper.stats['pages_processed'] / elapsed

            print(f"Success rate: {success_rate:.2%}, Speed: {processing_speed:.1f} pages/sec")

            # Adaptive adjustments
            if success_rate >= success_threshold:
                # Increase performance if successful (hard caps: 30 workers, 300/batch)
                current_workers = min(current_workers + 2, 30)
                current_batch_size = min(current_batch_size + 25, 300)
                print("📈 Increasing performance settings")
            else:
                # Decrease performance if too many failures (floors: 3 workers, 25/batch)
                current_workers = max(current_workers - 1, 3)
                current_batch_size = max(current_batch_size - 10, 25)
                print("📉 Decreasing performance settings")

            current_page = end_page + 1
            total_processed += test_batch_size
class ScraperRunner:
    """Main runner class for executing scraping strategies"""

    def __init__(self):
        # Map CLI strategy names to their coroutine implementations.
        self.available_strategies = {
            'quick': ScrapingStrategy.quick_sample,
            'full': ScrapingStrategy.full_basic_scrape,
            'detailed': ScrapingStrategy.detailed_scrape,
            'resume': ScrapingStrategy.resume_scrape,
            'segmented': ScrapingStrategy.segmented_scrape,
            'adaptive': ScrapingStrategy.smart_adaptive_scrape
        }

    def list_strategies(self):
        """List all available strategies"""
        divider = "=" * 50
        menu = (
            "\nAvailable Scraping Strategies:",
            divider,
            "1. quick - Quick sample (100 pages)",
            "2. full - Full basic scrape (all pages)",
            "3. detailed - Detailed scrape with company pages",
            "4. resume - Resume from failed pages",
            "5. segmented - Segmented scraping",
            "6. adaptive - Smart adaptive scraping",
            divider,
        )
        for entry in menu:
            print(entry)

    async def run_strategy(self, strategy_name: str, **kwargs):
        """Run a specific strategy.

        Args:
            strategy_name: Key into ``available_strategies``.
            **kwargs: Optional strategy argument under the CLI's keyword name
                ('pages', 'failed_file', or 'segments'); ignored by strategies
                that take no argument.
        """
        strategy_func = self.available_strategies.get(strategy_name)
        if strategy_func is None:
            print(f"Error: Unknown strategy '{strategy_name}'")
            self.list_strategies()
            return

        print(f"\n🎯 Starting strategy: {strategy_name.upper()}")
        print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        started_at = time.time()

        # Each strategy takes at most one positional argument; this table maps
        # the strategy to the kwargs key that carries it.
        positional_key = {
            'quick': 'pages',
            'detailed': 'pages',
            'resume': 'failed_file',
            'segmented': 'segments',
        }.get(strategy_name)

        try:
            if positional_key is not None and positional_key in kwargs:
                await strategy_func(kwargs[positional_key])
            else:
                await strategy_func()
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
        except Exception as e:
            print(f"\n❌ Error during scraping: {e}")
            logging.error(f"Strategy {strategy_name} failed: {e}")

        duration = time.time() - started_at

        print("\n✅ Strategy completed")
        print(f"⏱️ Total duration: {duration/3600:.1f} hours")
        print(f"🏁 End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
def main():
    """Main function with command line interface"""
    parser = argparse.ArgumentParser(description='ZaubaCorp Parallel Scraper Runner')
    parser.add_argument('strategy', choices=['quick', 'full', 'detailed', 'resume', 'segmented', 'adaptive'],
                        help='Scraping strategy to use')
    parser.add_argument('--pages', type=int, help='Number of pages for quick/detailed strategies')
    parser.add_argument('--segments', type=int, default=10, help='Number of segments for segmented strategy')
    parser.add_argument('--failed-file', type=str, help='Failed pages file for resume strategy')
    parser.add_argument('--list', action='store_true', help='List available strategies')

    args = parser.parse_args()

    runner = ScraperRunner()

    # --list short-circuits everything else.
    if args.list:
        runner.list_strategies()
        return

    # Validate resume-specific arguments up front (guard clauses).
    if args.strategy == 'resume':
        if not args.failed_file:
            print("Error: --failed-file required for resume strategy")
            return
        if not os.path.exists(args.failed_file):
            print(f"Error: Failed file '{args.failed_file}' not found")
            return

    # Forward only the options the user actually supplied (truthy values).
    kwargs = {
        name: value
        for name, value in (
            ('pages', args.pages),
            ('segments', args.segments),
            ('failed_file', args.failed_file),
        )
        if value
    }

    # Run the strategy
    print("🚀 ZaubaCorp Parallel Scraper Runner")
    print("=" * 50)

    try:
        asyncio.run(runner.run_strategy(args.strategy, **kwargs))
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n💥 Unexpected error: {e}")
if __name__ == "__main__":
    # If no command line arguments, run interactive mode
    if len(sys.argv) == 1:
        print("🚀 ZaubaCorp Parallel Scraper - Interactive Mode")
        print("=" * 60)

        runner = ScraperRunner()
        runner.list_strategies()

        choice = input("\nSelect strategy (1-6): ").strip()

        # Menu number -> strategy name (mirrors list_strategies()).
        strategy_map = {
            '1': 'quick',
            '2': 'full',
            '3': 'detailed',
            '4': 'resume',
            '5': 'segmented',
            '6': 'adaptive'
        }

        if choice in strategy_map:
            strategy = strategy_map[choice]
            kwargs = {}

            def _ask_int(prompt: str, default: int) -> int:
                """Prompt for an integer; return default on blank or invalid input.

                Fix: the original called int() directly on the raw input and
                crashed with an uncaught ValueError on non-numeric text.
                """
                raw = input(prompt).strip()
                if not raw:
                    return default
                try:
                    return int(raw)
                except ValueError:
                    print(f"Invalid number '{raw}', using default {default}")
                    return default

            if strategy == 'quick':
                kwargs['pages'] = _ask_int("Number of pages (default 100): ", 100)

            elif strategy == 'detailed':
                kwargs['pages'] = _ask_int("Number of pages (default 1000): ", 1000)

            elif strategy == 'resume':
                failed_file = input("Path to failed pages file: ").strip()
                if not failed_file or not os.path.exists(failed_file):
                    print("Invalid file path")
                    sys.exit(1)
                kwargs['failed_file'] = failed_file

            elif strategy == 'segmented':
                kwargs['segments'] = _ask_int("Number of segments (default 10): ", 10)

            print(f"\n🎯 Running strategy: {strategy}")
            asyncio.run(runner.run_strategy(strategy, **kwargs))
        else:
            print("Invalid choice")
    else:
        main()