#!/usr/bin/env python3
"""
Run Parallel Scraper - Execution Script
Various strategies for scraping ZaubaCorp data efficiently
"""

import asyncio
import os
import sys
import time
import argparse
from datetime import datetime
from typing import Dict, Any, Optional
import logging

# Import our modules
from zaubacorp_parallel_scraper import ZaubaCorpParallelScraper
from parallel_config import ParallelConfig, print_config_summary, validate_config

# Total number of company-listing pages on the site.  Used as the upper
# page bound by the segmented and adaptive strategies.
TOTAL_PAGES = 90769


class ScrapingStrategy:
    """Different scraping strategies for various use cases"""

    @staticmethod
    async def quick_sample(num_pages: int = 100):
        """Quick sample scraping for testing.

        Scrapes the first `num_pages` listing pages with conservative
        settings (5 workers) into `sample_output`; no detail pages.
        """
        print(f"\nšŸš€ QUICK SAMPLE STRATEGY - {num_pages} pages")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'conservative',
            max_workers=5,
            batch_size=20,
            output_dir='sample_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=num_pages,
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def full_basic_scrape():
        """Full scraping of all companies list (basic info only).

        Uses the aggressive profile (25 workers, 250-page batches) and
        passes end_page=None so the scraper covers every listing page.
        """
        print("\nšŸ”„ FULL BASIC SCRAPE STRATEGY - All 90,769 pages")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'aggressive',
            max_workers=25,
            batch_size=250,
            output_dir='full_basic_output'
        )

        print_config_summary(config)

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=None,  # All pages
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def detailed_scrape(max_pages: int = 1000):
        """Detailed scraping including company detail pages.

        Balanced profile; scrape_details=True makes the scraper follow
        each company's detail page in addition to the listing rows.
        """
        print(f"\nšŸ” DETAILED SCRAPE STRATEGY - {max_pages} pages with details")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'balanced',
            max_workers=10,
            batch_size=50,
            output_dir='detailed_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=max_pages,
            batch_size=config['batch_size'],
            scrape_details=True  # This will scrape company detail pages
        )

    @staticmethod
    async def resume_scrape(failed_pages_file: str):
        """Resume scraping from failed pages.

        Loads a JSON list of page numbers from `failed_pages_file` and
        retries them in small batches with conservative settings.
        Returns early (with a message) if the file cannot be loaded.
        """
        print(f"\nšŸ”„ RESUME SCRAPE STRATEGY - From {failed_pages_file}")
        print("=" * 60)

        import json

        # Load failed pages
        try:
            with open(failed_pages_file, 'r') as f:
                failed_pages = json.load(f)
            print(f"Found {len(failed_pages)} failed pages to retry")
        except Exception as e:
            print(f"Error loading failed pages file: {e}")
            return

        config = ParallelConfig.get_config(
            'conservative',  # More conservative for retries
            max_workers=8,
            batch_size=25,
            output_dir='resume_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        # Process failed pages in smaller batches
        batch_size = 25
        for i in range(0, len(failed_pages), batch_size):
            batch = failed_pages[i:i + batch_size]
            print(f"Processing retry batch {i//batch_size + 1}")

            # Create a temporary scraper for this batch
            await scraper.scrape_pages_batch(
                session=None,  # Will create its own session -- TODO confirm against scraper impl
                page_numbers=batch,
                batch_num=i//batch_size + 1,
                scrape_details=False
            )

    @staticmethod
    async def segmented_scrape(segments: int = 10):
        """Divide scraping into segments for distributed processing.

        Splits TOTAL_PAGES into `segments` contiguous ranges, each with
        its own output directory; the last segment absorbs the remainder
        left by integer division.
        """
        print(f"\nšŸ“Š SEGMENTED SCRAPE STRATEGY - {segments} segments")
        print("=" * 60)

        total_pages = TOTAL_PAGES
        pages_per_segment = total_pages // segments

        for segment in range(segments):
            start_page = segment * pages_per_segment + 1
            end_page = (segment + 1) * pages_per_segment
            if segment == segments - 1:  # Last segment gets remaining pages
                end_page = total_pages

            print(f"\n--- Segment {segment + 1}/{segments}: Pages {start_page}-{end_page} ---")

            config = ParallelConfig.get_config(
                'balanced',
                output_dir=f'segment_{segment + 1}_output'
            )

            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )

            await scraper.scrape_all_companies(
                start_page=start_page,
                end_page=end_page,
                batch_size=config['batch_size'],
                scrape_details=False
            )

            print(f"Completed segment {segment + 1}")

    @staticmethod
    async def smart_adaptive_scrape():
        """Adaptive scraping that adjusts based on success rate.

        Processes the page range in phases.  After each phase it computes
        the success rate from the scraper's stats and ramps worker/batch
        settings up (success >= 80%) or down (below 80%) within fixed
        bounds, then advances to the next page window.
        """
        print("\n🧠 SMART ADAPTIVE SCRAPE STRATEGY")
        print("=" * 60)

        # Start with conservative settings
        current_workers = 5
        current_batch_size = 50
        success_threshold = 0.8  # 80% success rate required

        total_processed = 0
        current_page = 1

        while current_page <= TOTAL_PAGES:
            print(f"\nAdaptive phase: workers={current_workers}, batch_size={current_batch_size}")

            config = ParallelConfig.get_config(
                'balanced',
                max_workers=current_workers,
                batch_size=current_batch_size,
                output_dir='adaptive_output'
            )

            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )

            # Process a test batch (twice the batch size, clamped to the
            # remaining page range).
            test_batch_size = min(current_batch_size * 2, TOTAL_PAGES - current_page + 1)
            end_page = min(current_page + test_batch_size - 1, TOTAL_PAGES)

            start_time = time.time()
            await scraper.scrape_all_companies(
                start_page=current_page,
                end_page=end_page,
                batch_size=current_batch_size,
                scrape_details=False
            )
            end_time = time.time()

            # Calculate success rate and speed.  Guard both divisions:
            # nothing attempted (all-zero stats) or a near-instant phase
            # would otherwise raise ZeroDivisionError.
            attempted = scraper.stats['pages_processed'] + scraper.stats['failed_pages']
            success_rate = scraper.stats['pages_processed'] / attempted if attempted else 0.0
            elapsed = end_time - start_time
            processing_speed = scraper.stats['pages_processed'] / elapsed if elapsed > 0 else 0.0

            print(f"Success rate: {success_rate:.2%}, Speed: {processing_speed:.1f} pages/sec")

            # Adaptive adjustments
            if success_rate >= success_threshold:
                # Increase performance if successful
                current_workers = min(current_workers + 2, 30)
                current_batch_size = min(current_batch_size + 25, 300)
                print("šŸ“ˆ Increasing performance settings")
            else:
                # Decrease performance if too many failures
                current_workers = max(current_workers - 1, 3)
                current_batch_size = max(current_batch_size - 10, 25)
                print("šŸ“‰ Decreasing performance settings")

            current_page = end_page + 1
            total_processed += test_batch_size
class ScraperRunner:
    """Main runner class for executing scraping strategies"""

    def __init__(self):
        # Maps CLI strategy names to the coroutine functions that run them.
        self.available_strategies = {
            'quick': ScrapingStrategy.quick_sample,
            'full': ScrapingStrategy.full_basic_scrape,
            'detailed': ScrapingStrategy.detailed_scrape,
            'resume': ScrapingStrategy.resume_scrape,
            'segmented': ScrapingStrategy.segmented_scrape,
            'adaptive': ScrapingStrategy.smart_adaptive_scrape
        }

    def list_strategies(self):
        """List all available strategies"""
        print("\nAvailable Scraping Strategies:")
        print("=" * 50)
        print("1. quick - Quick sample (100 pages)")
        print("2. full - Full basic scrape (all pages)")
        print("3. detailed - Detailed scrape with company pages")
        print("4. resume - Resume from failed pages")
        print("5. segmented - Segmented scraping")
        print("6. adaptive - Smart adaptive scraping")
        print("=" * 50)

    async def run_strategy(self, strategy_name: str, **kwargs):
        """Run a specific strategy.

        Dispatches `strategy_name` to its coroutine, forwarding the
        strategy-specific keyword argument when supplied (pages /
        failed_file / segments).  Swallows KeyboardInterrupt and logs
        any other exception so the timing summary is always printed.
        """
        if strategy_name not in self.available_strategies:
            print(f"Error: Unknown strategy '{strategy_name}'")
            self.list_strategies()
            return

        strategy_func = self.available_strategies[strategy_name]

        print(f"\nšŸŽÆ Starting strategy: {strategy_name.upper()}")
        print(f"ā° Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        start_time = time.time()

        try:
            if strategy_name == 'quick' and 'pages' in kwargs:
                await strategy_func(kwargs['pages'])
            elif strategy_name == 'detailed' and 'pages' in kwargs:
                await strategy_func(kwargs['pages'])
            elif strategy_name == 'resume' and 'failed_file' in kwargs:
                await strategy_func(kwargs['failed_file'])
            elif strategy_name == 'segmented' and 'segments' in kwargs:
                await strategy_func(kwargs['segments'])
            else:
                await strategy_func()
        except KeyboardInterrupt:
            print("\nāš ļø Scraping interrupted by user")
        except Exception as e:
            print(f"\nāŒ Error during scraping: {e}")
            # Lazy %-style args: formatting is deferred until the record
            # is actually emitted.
            logging.error("Strategy %s failed: %s", strategy_name, e)

        end_time = time.time()
        duration = end_time - start_time

        print(f"\nāœ… Strategy completed")
        print(f"ā±ļø Total duration: {duration/3600:.1f} hours")
        print(f"šŸ End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


def main():
    """Main function with command line interface"""
    parser = argparse.ArgumentParser(description='ZaubaCorp Parallel Scraper Runner')
    parser.add_argument('strategy',
                        choices=['quick', 'full', 'detailed', 'resume', 'segmented', 'adaptive'],
                        help='Scraping strategy to use')
    parser.add_argument('--pages', type=int,
                        help='Number of pages for quick/detailed strategies')
    parser.add_argument('--segments', type=int, default=10,
                        help='Number of segments for segmented strategy')
    parser.add_argument('--failed-file', type=str,
                        help='Failed pages file for resume strategy')
    parser.add_argument('--list', action='store_true',
                        help='List available strategies')

    args = parser.parse_args()

    runner = ScraperRunner()

    if args.list:
        runner.list_strategies()
        return

    # Validate arguments
    if args.strategy == 'resume' and not args.failed_file:
        print("Error: --failed-file required for resume strategy")
        return

    if args.strategy == 'resume' and not os.path.exists(args.failed_file):
        print(f"Error: Failed file '{args.failed_file}' not found")
        return

    # Prepare kwargs.  Compare against None (not truthiness) so an
    # explicit `--pages 0` is forwarded rather than silently dropped.
    kwargs = {}
    if args.pages is not None:
        kwargs['pages'] = args.pages
    if args.segments is not None:
        kwargs['segments'] = args.segments
    if args.failed_file:
        kwargs['failed_file'] = args.failed_file

    # Run the strategy
    print("šŸš€ ZaubaCorp Parallel Scraper Runner")
    print("=" * 50)

    try:
        asyncio.run(runner.run_strategy(args.strategy, **kwargs))
    except KeyboardInterrupt:
        print("\nšŸ‘‹ Goodbye!")
    except Exception as e:
        print(f"\nšŸ’„ Unexpected error: {e}")


if __name__ == "__main__":
    # If no command line arguments, run interactive mode
    if len(sys.argv) == 1:
        print("šŸš€ ZaubaCorp Parallel Scraper - Interactive Mode")
        print("=" * 60)

        runner = ScraperRunner()
        runner.list_strategies()

        choice = input("\nSelect strategy (1-6): ").strip()
        strategy_map = {
            '1': 'quick',
            '2': 'full',
            '3': 'detailed',
            '4': 'resume',
            '5': 'segmented',
            '6': 'adaptive'
        }

        if choice in strategy_map:
            strategy = strategy_map[choice]
            kwargs = {}

            # Collect the strategy-specific parameter, falling back to
            # the documented default on empty input.
            if strategy == 'quick':
                pages = input("Number of pages (default 100): ").strip()
                kwargs['pages'] = int(pages) if pages else 100
            elif strategy == 'detailed':
                pages = input("Number of pages (default 1000): ").strip()
                kwargs['pages'] = int(pages) if pages else 1000
            elif strategy == 'resume':
                failed_file = input("Path to failed pages file: ").strip()
                if not failed_file or not os.path.exists(failed_file):
                    print("Invalid file path")
                    sys.exit(1)
                kwargs['failed_file'] = failed_file
            elif strategy == 'segmented':
                segments = input("Number of segments (default 10): ").strip()
                kwargs['segments'] = int(segments) if segments else 10

            print(f"\nšŸŽÆ Running strategy: {strategy}")
            asyncio.run(runner.run_strategy(strategy, **kwargs))
        else:
            print("Invalid choice")
    else:
        main()