first commit
This commit is contained in:
297
parallel_config.py
Normal file
297
parallel_config.py
Normal file
@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parallel Configuration for ZaubaCorp Scraper
|
||||
Optimized settings for high-performance parallel scraping
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
class ParallelConfig:
    """Configuration class for parallel scraping.

    Holds the base scraper settings, four performance-tuning profiles, and
    the shared HTTP/logging/output/error/extraction sections, plus class
    methods that assemble a complete, self-contained configuration dict.
    """

    # Basic settings shared by every profile.
    BASE_CONFIG = {
        'base_url': 'https://www.zaubacorp.com',
        'companies_list_base': 'https://www.zaubacorp.com/companies-list',
        'total_pages': 90769,
        'output_dir': 'zaubacorp_parallel_data'
    }

    # Performance settings for different use cases.
    PERFORMANCE_PROFILES = {
        'conservative': {
            'max_workers': 5,
            'batch_size': 50,
            'request_delay': (0.5, 1.0),  # Random delay range (seconds)
            'connection_limit': 10,
            'timeout': 45,
            'retries': 3,
            'semaphore_limit': 3
        },
        'balanced': {
            'max_workers': 15,
            'batch_size': 100,
            'request_delay': (0.2, 0.5),
            'connection_limit': 30,
            'timeout': 30,
            'retries': 3,
            'semaphore_limit': 5
        },
        'aggressive': {
            'max_workers': 25,
            'batch_size': 200,
            'request_delay': (0.1, 0.3),
            'connection_limit': 50,
            'timeout': 20,
            'retries': 2,
            'semaphore_limit': 8
        },
        'maximum': {
            'max_workers': 40,
            'batch_size': 300,
            'request_delay': (0.05, 0.2),
            'connection_limit': 80,
            'timeout': 15,
            'retries': 2,
            'semaphore_limit': 10
        }
    }

    # HTTP settings: rotated user agents plus browser-like default headers.
    HTTP_CONFIG = {
        'user_agents': [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
        ],
        'default_headers': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }
    }

    # Logging configuration
    LOGGING_CONFIG = {
        'level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        'console_output': True,
        'file_output': True,
        'log_filename': 'parallel_scraper.log'
    }

    # Output configuration
    OUTPUT_CONFIG = {
        'save_formats': ['csv', 'json'],
        'batch_save': True,
        'final_consolidation': True,
        'compression': False,  # Set to True to compress large files
        'encoding': 'utf-8'
    }

    # Error handling
    ERROR_CONFIG = {
        'max_retries': 3,
        'retry_delay_base': 1.0,  # Base delay for exponential backoff
        'rate_limit_wait': 10,  # Wait time when rate limited
        'ignore_ssl_errors': False,
        'continue_on_error': True
    }

    # Data extraction settings
    EXTRACTION_CONFIG = {
        'required_fields': ['cin', 'company_name'],
        'optional_fields': ['status', 'paid_up_capital', 'address', 'company_url'],
        'validate_data': True,
        'skip_empty_records': True,
        'clean_text': True
    }

    @classmethod
    def get_config(cls, profile: str = 'balanced', **overrides) -> Dict[str, Any]:
        """
        Get configuration for a specific performance profile

        Args:
            profile: Performance profile name ('conservative', 'balanced', 'aggressive', 'maximum')
            **overrides: Override specific configuration values; dotted keys
                such as 'logging.level' address nested sections

        Returns:
            Complete configuration dictionary (independent of class state)

        Raises:
            ValueError: If *profile* is not a known profile name
        """
        if profile not in cls.PERFORMANCE_PROFILES:
            raise ValueError(f"Unknown profile: {profile}. Available: {list(cls.PERFORMANCE_PROFILES.keys())}")

        import copy

        # Deep-copy the shared sections: previously they were inserted by
        # reference, so a dotted override like 'logging.level' mutated the
        # class-level dict and leaked into every config built afterwards.
        config = {
            **cls.BASE_CONFIG,
            **cls.PERFORMANCE_PROFILES[profile],
            'http': copy.deepcopy(cls.HTTP_CONFIG),
            'logging': copy.deepcopy(cls.LOGGING_CONFIG),
            'output': copy.deepcopy(cls.OUTPUT_CONFIG),
            'error_handling': copy.deepcopy(cls.ERROR_CONFIG),
            'extraction': copy.deepcopy(cls.EXTRACTION_CONFIG)
        }

        # Apply overrides, descending into nested sections for dotted keys.
        for key, value in overrides.items():
            if '.' in key:
                # Handle nested keys like 'logging.level'
                *parents, leaf = key.split('.')
                target = config
                for part in parents:
                    target = target[part]
                target[leaf] = value
            else:
                config[key] = value

        return config

    @classmethod
    def get_optimized_config(cls, system_cores: int = None, available_memory_gb: int = None) -> Dict[str, Any]:
        """
        Get optimized configuration based on system resources

        Args:
            system_cores: Number of CPU cores available (auto-detected via
                psutil when omitted)
            available_memory_gb: Available memory in GB (auto-detected via
                psutil when omitted)

        Returns:
            Optimized configuration
        """
        # psutil is third-party and only needed for auto-detection, so
        # import it lazily; explicit arguments work without it installed.
        if system_cores is None:
            import psutil
            system_cores = psutil.cpu_count()

        if available_memory_gb is None:
            import psutil
            available_memory_gb = psutil.virtual_memory().available / (1024**3)

        # Determine optimal profile based on resources
        if system_cores >= 16 and available_memory_gb >= 16:
            profile = 'maximum'
        elif system_cores >= 8 and available_memory_gb >= 8:
            profile = 'aggressive'
        elif system_cores >= 4 and available_memory_gb >= 4:
            profile = 'balanced'
        else:
            profile = 'conservative'

        # Calculate optimal workers based on cores (capped at 40).
        optimal_workers = min(system_cores * 2, 40)

        # Calculate optimal batch size based on memory (capped at 500).
        optimal_batch_size = min(int(available_memory_gb * 50), 500)

        config = cls.get_config(
            profile,
            max_workers=optimal_workers,
            batch_size=optimal_batch_size
        )

        return config
|
||||
|
||||
# Predefined configurations for common scenarios.
# NOTE(review): get_config inserts the class-level section dicts by
# reference and dotted overrides mutate them in place, so the overrides
# applied by one of these module-level calls (e.g. 'logging.level') appear
# to leak into the configs built after it — the effective values are
# order-dependent. Confirm and fix in get_config.

# Small, slow run for smoke-testing the pipeline end to end.
QUICK_TEST_CONFIG = ParallelConfig.get_config(
    'conservative',
    max_workers=3,
    batch_size=10,
    total_pages=50,
    output_dir='test_output'
)

# Default production run: balanced profile with compressed output and
# errors logged but not fatal.
PRODUCTION_CONFIG = ParallelConfig.get_config(
    'balanced',
    output_dir='production_output',
    **{
        'logging.level': 'INFO',
        'output.compression': True,
        'error_handling.continue_on_error': True
    }
)

# Fast run that trades log verbosity for throughput.
HIGH_SPEED_CONFIG = ParallelConfig.get_config(
    'aggressive',
    output_dir='high_speed_output',
    **{
        'logging.level': 'WARNING',  # Reduce logging overhead
        'output.batch_save': True
    }
)

# Custom configurations for specific use cases
DETAIL_SCRAPING_CONFIG = ParallelConfig.get_config(
    'conservative',  # More conservative for detail scraping
    max_workers=8,
    batch_size=25,
    request_delay=(1.0, 2.0),  # Longer delays for detail pages
    output_dir='detailed_output'
)

# Configuration for resuming failed scrapes
RETRY_CONFIG = ParallelConfig.get_config(
    'balanced',
    **{
        'error_handling.max_retries': 5,
        'error_handling.retry_delay_base': 2.0,
        'error_handling.continue_on_error': True
    }
)
|
||||
|
||||
def print_config_summary(config: Dict[str, Any]):
    """Print a human-readable summary of a scraper configuration."""
    divider = "=" * 50
    summary_lines = [
        f"Max Workers: {config['max_workers']}",
        f"Batch Size: {config['batch_size']}",
        f"Request Delay: {config['request_delay']}",
        f"Connection Limit: {config['connection_limit']}",
        f"Timeout: {config['timeout']}s",
        f"Retries: {config['retries']}",
        f"Output Directory: {config['output_dir']}",
        # 'total_pages' may be absent; fall back to 'All'.
        f"Total Pages: {config.get('total_pages', 'All')}",
    ]
    print("Configuration Summary:")
    print(divider)
    for line in summary_lines:
        print(line)
    print(divider)
|
||||
|
||||
def validate_config(config: Dict[str, Any]) -> bool:
    """Validate configuration parameters.

    Prints a message and returns False at the first problem found;
    returns True when all checks pass.
    """
    # Keys that must be present before any value checks run.
    for required in ('max_workers', 'batch_size', 'output_dir'):
        if required not in config:
            print(f"Error: Missing required configuration key: {required}")
            return False

    # Numeric fields that must be positive, with their error messages.
    minimum_checks = (
        ('max_workers', "Error: max_workers must be at least 1"),
        ('batch_size', "Error: batch_size must be at least 1"),
    )
    for field, message in minimum_checks:
        if config[field] < 1:
            print(message)
            return False

    return True
|
||||
|
||||
# Example usage configurations
if __name__ == "__main__":
    # Show the configuration auto-tuned from this machine's resources.
    auto_tuned = ParallelConfig.get_optimized_config()
    print("Optimized Configuration:")
    print_config_summary(auto_tuned)

    # Then summarize every predefined performance profile.
    print("\nAvailable Profiles:")
    for profile_name in ParallelConfig.PERFORMANCE_PROFILES:
        print(f"\n{profile_name.upper()}:")
        print_config_summary(ParallelConfig.get_config(profile_name))
|
||||
Reference in New Issue
Block a user