first commit
This commit is contained in:
297
parallel_config.py
Normal file
297
parallel_config.py
Normal file
@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parallel Configuration for ZaubaCorp Scraper
|
||||
Optimized settings for high-performance parallel scraping
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
class ParallelConfig:
    """Configuration class for parallel scraping.

    Holds the base scraper settings, four performance-tuning profiles, and
    the shared HTTP/logging/output/error/extraction sections, plus class
    methods that assemble a complete, self-contained configuration dict.
    """

    # Basic settings shared by every profile.
    BASE_CONFIG = {
        'base_url': 'https://www.zaubacorp.com',
        'companies_list_base': 'https://www.zaubacorp.com/companies-list',
        'total_pages': 90769,
        'output_dir': 'zaubacorp_parallel_data'
    }

    # Performance settings for different use cases.
    PERFORMANCE_PROFILES = {
        'conservative': {
            'max_workers': 5,
            'batch_size': 50,
            'request_delay': (0.5, 1.0),  # Random delay range (seconds)
            'connection_limit': 10,
            'timeout': 45,
            'retries': 3,
            'semaphore_limit': 3
        },
        'balanced': {
            'max_workers': 15,
            'batch_size': 100,
            'request_delay': (0.2, 0.5),
            'connection_limit': 30,
            'timeout': 30,
            'retries': 3,
            'semaphore_limit': 5
        },
        'aggressive': {
            'max_workers': 25,
            'batch_size': 200,
            'request_delay': (0.1, 0.3),
            'connection_limit': 50,
            'timeout': 20,
            'retries': 2,
            'semaphore_limit': 8
        },
        'maximum': {
            'max_workers': 40,
            'batch_size': 300,
            'request_delay': (0.05, 0.2),
            'connection_limit': 80,
            'timeout': 15,
            'retries': 2,
            'semaphore_limit': 10
        }
    }

    # HTTP settings: rotated user agents plus browser-like default headers.
    HTTP_CONFIG = {
        'user_agents': [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
        ],
        'default_headers': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }
    }

    # Logging configuration
    LOGGING_CONFIG = {
        'level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        'console_output': True,
        'file_output': True,
        'log_filename': 'parallel_scraper.log'
    }

    # Output configuration
    OUTPUT_CONFIG = {
        'save_formats': ['csv', 'json'],
        'batch_save': True,
        'final_consolidation': True,
        'compression': False,  # Set to True to compress large files
        'encoding': 'utf-8'
    }

    # Error handling
    ERROR_CONFIG = {
        'max_retries': 3,
        'retry_delay_base': 1.0,  # Base delay for exponential backoff
        'rate_limit_wait': 10,  # Wait time when rate limited
        'ignore_ssl_errors': False,
        'continue_on_error': True
    }

    # Data extraction settings
    EXTRACTION_CONFIG = {
        'required_fields': ['cin', 'company_name'],
        'optional_fields': ['status', 'paid_up_capital', 'address', 'company_url'],
        'validate_data': True,
        'skip_empty_records': True,
        'clean_text': True
    }

    @classmethod
    def get_config(cls, profile: str = 'balanced', **overrides) -> Dict[str, Any]:
        """
        Get configuration for a specific performance profile

        Args:
            profile: Performance profile name ('conservative', 'balanced', 'aggressive', 'maximum')
            **overrides: Override specific configuration values; dotted keys
                such as 'logging.level' address nested sections

        Returns:
            Complete configuration dictionary (independent of class state)

        Raises:
            ValueError: If *profile* is not a known profile name
        """
        if profile not in cls.PERFORMANCE_PROFILES:
            raise ValueError(f"Unknown profile: {profile}. Available: {list(cls.PERFORMANCE_PROFILES.keys())}")

        import copy

        # Deep-copy the shared sections: previously they were inserted by
        # reference, so a dotted override like 'logging.level' mutated the
        # class-level dict and leaked into every config built afterwards.
        config = {
            **cls.BASE_CONFIG,
            **cls.PERFORMANCE_PROFILES[profile],
            'http': copy.deepcopy(cls.HTTP_CONFIG),
            'logging': copy.deepcopy(cls.LOGGING_CONFIG),
            'output': copy.deepcopy(cls.OUTPUT_CONFIG),
            'error_handling': copy.deepcopy(cls.ERROR_CONFIG),
            'extraction': copy.deepcopy(cls.EXTRACTION_CONFIG)
        }

        # Apply overrides, descending into nested sections for dotted keys.
        for key, value in overrides.items():
            if '.' in key:
                # Handle nested keys like 'logging.level'
                *parents, leaf = key.split('.')
                target = config
                for part in parents:
                    target = target[part]
                target[leaf] = value
            else:
                config[key] = value

        return config

    @classmethod
    def get_optimized_config(cls, system_cores: int = None, available_memory_gb: int = None) -> Dict[str, Any]:
        """
        Get optimized configuration based on system resources

        Args:
            system_cores: Number of CPU cores available (auto-detected via
                psutil when omitted)
            available_memory_gb: Available memory in GB (auto-detected via
                psutil when omitted)

        Returns:
            Optimized configuration
        """
        # psutil is third-party and only needed for auto-detection, so
        # import it lazily; explicit arguments work without it installed.
        if system_cores is None:
            import psutil
            system_cores = psutil.cpu_count()

        if available_memory_gb is None:
            import psutil
            available_memory_gb = psutil.virtual_memory().available / (1024**3)

        # Determine optimal profile based on resources
        if system_cores >= 16 and available_memory_gb >= 16:
            profile = 'maximum'
        elif system_cores >= 8 and available_memory_gb >= 8:
            profile = 'aggressive'
        elif system_cores >= 4 and available_memory_gb >= 4:
            profile = 'balanced'
        else:
            profile = 'conservative'

        # Calculate optimal workers based on cores (capped at 40).
        optimal_workers = min(system_cores * 2, 40)

        # Calculate optimal batch size based on memory (capped at 500).
        optimal_batch_size = min(int(available_memory_gb * 50), 500)

        config = cls.get_config(
            profile,
            max_workers=optimal_workers,
            batch_size=optimal_batch_size
        )

        return config
|
||||
|
||||
# Predefined configurations for common scenarios.
# NOTE(review): get_config inserts the class-level section dicts by
# reference and dotted overrides mutate them in place, so the overrides
# applied by one of these module-level calls (e.g. 'logging.level') appear
# to leak into the configs built after it — the effective values are
# order-dependent. Confirm and fix in get_config.

# Small, slow run for smoke-testing the pipeline end to end.
QUICK_TEST_CONFIG = ParallelConfig.get_config(
    'conservative',
    max_workers=3,
    batch_size=10,
    total_pages=50,
    output_dir='test_output'
)

# Default production run: balanced profile with compressed output and
# errors logged but not fatal.
PRODUCTION_CONFIG = ParallelConfig.get_config(
    'balanced',
    output_dir='production_output',
    **{
        'logging.level': 'INFO',
        'output.compression': True,
        'error_handling.continue_on_error': True
    }
)

# Fast run that trades log verbosity for throughput.
HIGH_SPEED_CONFIG = ParallelConfig.get_config(
    'aggressive',
    output_dir='high_speed_output',
    **{
        'logging.level': 'WARNING',  # Reduce logging overhead
        'output.batch_save': True
    }
)

# Custom configurations for specific use cases
DETAIL_SCRAPING_CONFIG = ParallelConfig.get_config(
    'conservative',  # More conservative for detail scraping
    max_workers=8,
    batch_size=25,
    request_delay=(1.0, 2.0),  # Longer delays for detail pages
    output_dir='detailed_output'
)

# Configuration for resuming failed scrapes
RETRY_CONFIG = ParallelConfig.get_config(
    'balanced',
    **{
        'error_handling.max_retries': 5,
        'error_handling.retry_delay_base': 2.0,
        'error_handling.continue_on_error': True
    }
)
|
||||
|
||||
def print_config_summary(config: Dict[str, Any]):
    """Print a human-readable summary of a scraper configuration."""
    divider = "=" * 50
    summary_lines = [
        f"Max Workers: {config['max_workers']}",
        f"Batch Size: {config['batch_size']}",
        f"Request Delay: {config['request_delay']}",
        f"Connection Limit: {config['connection_limit']}",
        f"Timeout: {config['timeout']}s",
        f"Retries: {config['retries']}",
        f"Output Directory: {config['output_dir']}",
        # 'total_pages' may be absent; fall back to 'All'.
        f"Total Pages: {config.get('total_pages', 'All')}",
    ]
    print("Configuration Summary:")
    print(divider)
    for line in summary_lines:
        print(line)
    print(divider)
|
||||
|
||||
def validate_config(config: Dict[str, Any]) -> bool:
    """Validate configuration parameters.

    Prints a message and returns False at the first problem found;
    returns True when all checks pass.
    """
    # Keys that must be present before any value checks run.
    for required in ('max_workers', 'batch_size', 'output_dir'):
        if required not in config:
            print(f"Error: Missing required configuration key: {required}")
            return False

    # Numeric fields that must be positive, with their error messages.
    minimum_checks = (
        ('max_workers', "Error: max_workers must be at least 1"),
        ('batch_size', "Error: batch_size must be at least 1"),
    )
    for field, message in minimum_checks:
        if config[field] < 1:
            print(message)
            return False

    return True
|
||||
|
||||
# Example usage configurations
if __name__ == "__main__":
    # Show the configuration auto-tuned from this machine's resources.
    auto_tuned = ParallelConfig.get_optimized_config()
    print("Optimized Configuration:")
    print_config_summary(auto_tuned)

    # Then summarize every predefined performance profile.
    print("\nAvailable Profiles:")
    for profile_name in ParallelConfig.PERFORMANCE_PROFILES:
        print(f"\n{profile_name.upper()}:")
        print_config_summary(ParallelConfig.get_config(profile_name))
|
||||
Reference in New Issue
Block a user