#!/usr/bin/env python3
"""
Parallel Configuration for ZaubaCorp Scraper

Optimized settings for high-performance parallel scraping.
"""

import copy
from typing import Dict, Any


class ParallelConfig:
    """Configuration class for parallel scraping"""

    # Basic settings
    BASE_CONFIG = {
        'base_url': 'https://www.zaubacorp.com',
        'companies_list_base': 'https://www.zaubacorp.com/companies-list',
        'total_pages': 90769,
        'output_dir': 'zaubacorp_parallel_data'
    }

    # Performance settings for different use cases
    PERFORMANCE_PROFILES = {
        'conservative': {
            'max_workers': 5,
            'batch_size': 50,
            'request_delay': (0.5, 1.0),  # Random delay range in seconds
            'connection_limit': 10,
            'timeout': 45,
            'retries': 3,
            'semaphore_limit': 3
        },
        'balanced': {
            'max_workers': 15,
            'batch_size': 100,
            'request_delay': (0.2, 0.5),
            'connection_limit': 30,
            'timeout': 30,
            'retries': 3,
            'semaphore_limit': 5
        },
        'aggressive': {
            'max_workers': 25,
            'batch_size': 200,
            'request_delay': (0.1, 0.3),
            'connection_limit': 50,
            'timeout': 20,
            'retries': 2,
            'semaphore_limit': 8
        },
        'maximum': {
            'max_workers': 40,
            'batch_size': 300,
            'request_delay': (0.05, 0.2),
            'connection_limit': 80,
            'timeout': 15,
            'retries': 2,
            'semaphore_limit': 10
        }
    }

    # HTTP settings
    HTTP_CONFIG = {
        'user_agents': [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
        ],
        'default_headers': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }
    }

    # Logging configuration
    LOGGING_CONFIG = {
        'level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        'console_output': True,
        'file_output': True,
        'log_filename': 'parallel_scraper.log'
    }

    # Output configuration
    OUTPUT_CONFIG = {
        'save_formats': ['csv', 'json'],
        'batch_save': True,
        'final_consolidation': True,
        'compression': False,  # Set to True to compress large files
        'encoding': 'utf-8'
    }

    # Error handling
    ERROR_CONFIG = {
        'max_retries': 3,
        'retry_delay_base': 1.0,  # Base delay (seconds) for exponential backoff
        'rate_limit_wait': 10,  # Wait time (seconds) when rate limited
        'ignore_ssl_errors': False,
        'continue_on_error': True
    }

    # Data extraction settings
    EXTRACTION_CONFIG = {
        'required_fields': ['cin', 'company_name'],
        'optional_fields': ['status', 'paid_up_capital', 'address', 'company_url'],
        'validate_data': True,
        'skip_empty_records': True,
        'clean_text': True
    }
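    # --- Illustrative helper (not part of the original config) ---
    # A minimal sketch of how a worker might consume a profile's
    # 'request_delay' (min, max) range: draw a random sleep duration inside
    # the configured window before each request. The method name and
    # signature are assumptions for illustration, not an existing API.
    @staticmethod
    def sample_request_delay(profile_config: Dict[str, Any]) -> float:
        """Return a random delay drawn from the profile's (min, max) range."""
        import random
        low, high = profile_config['request_delay']
        return random.uniform(low, high)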
    @classmethod
    def get_config(cls, profile: str = 'balanced', **overrides) -> Dict[str, Any]:
        """
        Get the configuration for a specific performance profile.

        Args:
            profile: Performance profile name ('conservative', 'balanced',
                'aggressive', 'maximum')
            **overrides: Override specific configuration values; dotted keys
                such as 'logging.level' target nested sections

        Returns:
            Complete configuration dictionary
        """
        if profile not in cls.PERFORMANCE_PROFILES:
            raise ValueError(
                f"Unknown profile: {profile}. "
                f"Available: {list(cls.PERFORMANCE_PROFILES.keys())}"
            )

        config = {
            **cls.BASE_CONFIG,
            **cls.PERFORMANCE_PROFILES[profile],
            # Deep-copy the nested sections so dotted overrides mutate this
            # config only, not the shared class-level dictionaries.
            'http': copy.deepcopy(cls.HTTP_CONFIG),
            'logging': copy.deepcopy(cls.LOGGING_CONFIG),
            'output': copy.deepcopy(cls.OUTPUT_CONFIG),
            'error_handling': copy.deepcopy(cls.ERROR_CONFIG),
            'extraction': copy.deepcopy(cls.EXTRACTION_CONFIG)
        }

        # Apply overrides
        for key, value in overrides.items():
            if '.' in key:
                # Handle nested keys like 'logging.level'
                keys = key.split('.')
                target = config
                for k in keys[:-1]:
                    target = target[k]
                target[keys[-1]] = value
            else:
                config[key] = value

        return config

    @classmethod
    def get_optimized_config(cls, system_cores: int = None,
                             available_memory_gb: float = None) -> Dict[str, Any]:
        """
        Get an optimized configuration based on system resources.

        Args:
            system_cores: Number of CPU cores available
            available_memory_gb: Available memory in GB

        Returns:
            Optimized configuration dictionary
        """
        import psutil

        if system_cores is None:
            # psutil.cpu_count() can return None; fall back to a single core
            system_cores = psutil.cpu_count() or 1
        if available_memory_gb is None:
            available_memory_gb = psutil.virtual_memory().available / (1024 ** 3)

        # Determine the best profile for the available resources
        if system_cores >= 16 and available_memory_gb >= 16:
            profile = 'maximum'
        elif system_cores >= 8 and available_memory_gb >= 8:
            profile = 'aggressive'
        elif system_cores >= 4 and available_memory_gb >= 4:
            profile = 'balanced'
        else:
            profile = 'conservative'

        # Scale workers with cores (the workload is I/O-bound), capped at 40
        optimal_workers = min(system_cores * 2, 40)

        # Scale batch size with available memory, capped at 500
        optimal_batch_size = min(int(available_memory_gb * 50), 500)

        return cls.get_config(
            profile,
            max_workers=optimal_workers,
            batch_size=optimal_batch_size
        )


# Predefined configurations for common scenarios
QUICK_TEST_CONFIG = ParallelConfig.get_config(
    'conservative',
    max_workers=3,
    batch_size=10,
    total_pages=50,
    output_dir='test_output'
)

PRODUCTION_CONFIG = ParallelConfig.get_config(
    'balanced',
    output_dir='production_output',
    **{
        'logging.level': 'INFO',
        'output.compression': True,
        'error_handling.continue_on_error': True
    }
)

HIGH_SPEED_CONFIG = ParallelConfig.get_config(
    'aggressive',
    output_dir='high_speed_output',
    **{
        'logging.level': 'WARNING',  # Reduce logging overhead
        'output.batch_save': True
    }
)

# Custom configuration for detail-page scraping
DETAIL_SCRAPING_CONFIG = ParallelConfig.get_config(
    'conservative',  # More conservative for detail scraping
    max_workers=8,
    batch_size=25,
    request_delay=(1.0, 2.0),  # Longer delays for detail pages
    output_dir='detailed_output'
)

# Configuration for resuming failed scrapes
RETRY_CONFIG = ParallelConfig.get_config(
    'balanced',
    **{
        'error_handling.max_retries': 5,
        'error_handling.retry_delay_base': 2.0,
        'error_handling.continue_on_error': True
    }
)


def print_config_summary(config: Dict[str, Any]) -> None:
    """Print a summary of the configuration"""
    print("Configuration Summary:")
    print("=" * 50)
    print(f"Max Workers: {config['max_workers']}")
    print(f"Batch Size: {config['batch_size']}")
    print(f"Request Delay: {config['request_delay']}")
    print(f"Connection Limit: {config['connection_limit']}")
    print(f"Timeout: {config['timeout']}s")
    print(f"Retries: {config['retries']}")
    print(f"Output Directory: {config['output_dir']}")
    print(f"Total Pages: {config.get('total_pages', 'All')}")
    print("=" * 50)


def validate_config(config: Dict[str, Any]) -> bool:
    """Validate configuration parameters"""
    required_keys = ['max_workers', 'batch_size', 'output_dir']
    for key in required_keys:
        if key not in config:
            print(f"Error: Missing required configuration key: {key}")
            return False

    if config['max_workers'] < 1:
        print("Error: max_workers must be at least 1")
        return False

    if config['batch_size'] < 1:
        print("Error: batch_size must be at least 1")
        return False

    return True
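# --- Illustrative usage (not part of the original file) ---
# A minimal sketch showing how the dotted-override syntax and
# validate_config are meant to work together; it uses only the module's own
# API. The function name and the 'example_output' directory are assumptions
# chosen for the example.
def example_build_and_validate() -> Dict[str, Any]:
    """Build a 'balanced' config with a nested override, then validate it."""
    config = ParallelConfig.get_config(
        'balanced',
        output_dir='example_output',
        **{'logging.level': 'DEBUG'}  # dotted key sets config['logging']['level']
    )
    if not validate_config(config):
        raise ValueError("Generated configuration failed validation")
    return config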
# Example usage
if __name__ == "__main__":
    # Example: get a configuration optimized for this machine
    optimized = ParallelConfig.get_optimized_config()
    print("Optimized Configuration:")
    print_config_summary(optimized)

    print("\nAvailable Profiles:")
    for profile in ParallelConfig.PERFORMANCE_PROFILES:
        config = ParallelConfig.get_config(profile)
        print(f"\n{profile.upper()}:")
        print_config_summary(config)
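    # --- Illustrative extension of the demo (not in the original file) ---
    # Show validate_config rejecting an incomplete config: 'output_dir' is
    # missing, so the required-key check prints an error and returns False
    # before the numeric checks run. The 'broken' dict is a made-up example.
    print("\nValidation demo (expected to fail):")
    broken = {'max_workers': 0, 'batch_size': 100}
    print("Result:", validate_config(broken))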