first commit

This commit is contained in:
govardhan
2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions

411
run_parallel_scraper.py Normal file
View File

@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Run Parallel Scraper - Execution Script
Various strategies for scraping ZaubaCorp data efficiently
"""
import asyncio
import os
import sys
import time
import argparse
from datetime import datetime
from typing import Dict, Any, Optional
import logging
# Import our modules
from zaubacorp_parallel_scraper import ZaubaCorpParallelScraper
from parallel_config import ParallelConfig, print_config_summary, validate_config
class ScrapingStrategy:
    """Different scraping strategies for various use cases.

    Every strategy is an async ``staticmethod`` that builds a config via
    ``ParallelConfig``, drives a ``ZaubaCorpParallelScraper`` run, and returns
    nothing — results are written to each strategy's output directory by the
    scraper itself.
    """

    # Total number of listing pages on ZaubaCorp (also hard-coded upstream).
    TOTAL_PAGES = 90769

    @staticmethod
    async def quick_sample(num_pages: int = 100):
        """Quick sample scraping for testing.

        Args:
            num_pages: number of listing pages to fetch (basic info only).
        """
        print(f"\n🚀 QUICK SAMPLE STRATEGY - {num_pages} pages")
        print("=" * 60)
        # Conservative profile keeps the sample run gentle on the site.
        config = ParallelConfig.get_config(
            'conservative',
            max_workers=5,
            batch_size=20,
            output_dir='sample_output'
        )
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        await scraper.scrape_all_companies(
            start_page=1,
            end_page=num_pages,
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def full_basic_scrape():
        """Full scraping of all companies list (basic info only)."""
        print("\n🔥 FULL BASIC SCRAPE STRATEGY - All 90,769 pages")
        print("=" * 60)
        # Aggressive profile: high worker count for the long full crawl.
        config = ParallelConfig.get_config(
            'aggressive',
            max_workers=25,
            batch_size=250,
            output_dir='full_basic_output'
        )
        print_config_summary(config)
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        await scraper.scrape_all_companies(
            start_page=1,
            end_page=None,  # None means "all pages" to the scraper
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def detailed_scrape(max_pages: int = 1000):
        """Detailed scraping including company detail pages.

        Args:
            max_pages: number of listing pages to process; each company found
                also gets its detail page scraped.
        """
        print(f"\n🔍 DETAILED SCRAPE STRATEGY - {max_pages} pages with details")
        print("=" * 60)
        config = ParallelConfig.get_config(
            'balanced',
            max_workers=10,
            batch_size=50,
            output_dir='detailed_output'
        )
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        await scraper.scrape_all_companies(
            start_page=1,
            end_page=max_pages,
            batch_size=config['batch_size'],
            scrape_details=True  # This will scrape company detail pages
        )

    @staticmethod
    async def resume_scrape(failed_pages_file: str):
        """Resume scraping from a JSON list of previously failed page numbers.

        Args:
            failed_pages_file: path to a JSON file containing a list of page
                numbers to retry.
        """
        print(f"\n🔄 RESUME SCRAPE STRATEGY - From {failed_pages_file}")
        print("=" * 60)
        import json
        # Load failed pages; bail out with a message on any read/parse error.
        try:
            with open(failed_pages_file, 'r') as f:
                failed_pages = json.load(f)
            print(f"Found {len(failed_pages)} failed pages to retry")
        except Exception as e:
            print(f"Error loading failed pages file: {e}")
            return
        config = ParallelConfig.get_config(
            'conservative',  # More conservative for retries
            max_workers=8,
            batch_size=25,
            output_dir='resume_output'
        )
        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )
        # Process failed pages in smaller batches.
        batch_size = 25
        for i in range(0, len(failed_pages), batch_size):
            batch = failed_pages[i:i + batch_size]
            print(f"Processing retry batch {i//batch_size + 1}")
            # session=None: the batch method creates its own HTTP session.
            await scraper.scrape_pages_batch(
                session=None,
                page_numbers=batch,
                batch_num=i//batch_size + 1,
                scrape_details=False
            )

    @staticmethod
    async def segmented_scrape(segments: int = 10):
        """Divide scraping into segments for distributed processing.

        Args:
            segments: number of contiguous page ranges; each gets its own
                output directory (``segment_N_output``).
        """
        print(f"\n📊 SEGMENTED SCRAPE STRATEGY - {segments} segments")
        print("=" * 60)
        total_pages = ScrapingStrategy.TOTAL_PAGES
        pages_per_segment = total_pages // segments
        for segment in range(segments):
            start_page = segment * pages_per_segment + 1
            end_page = (segment + 1) * pages_per_segment
            if segment == segments - 1:  # Last segment gets remaining pages
                end_page = total_pages
            print(f"\n--- Segment {segment + 1}/{segments}: Pages {start_page}-{end_page} ---")
            config = ParallelConfig.get_config(
                'balanced',
                output_dir=f'segment_{segment + 1}_output'
            )
            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )
            await scraper.scrape_all_companies(
                start_page=start_page,
                end_page=end_page,
                batch_size=config['batch_size'],
                scrape_details=False
            )
            print(f"Completed segment {segment + 1}")

    @staticmethod
    async def smart_adaptive_scrape():
        """Adaptive scraping that adjusts workers/batch size per phase.

        Each phase scrapes a small page range, measures the success rate, and
        scales the settings up on success or down on failure.
        """
        print("\n🧠 SMART ADAPTIVE SCRAPE STRATEGY")
        print("=" * 60)
        # Start with conservative settings and tune between phases.
        current_workers = 5
        current_batch_size = 50
        success_threshold = 0.8  # 80% success rate required to scale up
        total_processed = 0
        current_page = 1
        total_pages = ScrapingStrategy.TOTAL_PAGES
        while current_page <= total_pages:
            print(f"\nAdaptive phase: workers={current_workers}, batch_size={current_batch_size}")
            config = ParallelConfig.get_config(
                'balanced',
                max_workers=current_workers,
                batch_size=current_batch_size,
                output_dir='adaptive_output'
            )
            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )
            # Process a test batch (twice the batch size, clamped to the end).
            test_batch_size = min(current_batch_size * 2, total_pages - current_page + 1)
            end_page = min(current_page + test_batch_size - 1, total_pages)
            start_time = time.time()
            await scraper.scrape_all_companies(
                start_page=current_page,
                end_page=end_page,
                batch_size=current_batch_size,
                scrape_details=False
            )
            elapsed = time.time() - start_time
            # Guard against ZeroDivisionError when nothing was attempted or
            # when the phase finished in (effectively) zero time.
            attempted = scraper.stats['pages_processed'] + scraper.stats['failed_pages']
            success_rate = scraper.stats['pages_processed'] / attempted if attempted else 0.0
            processing_speed = scraper.stats['pages_processed'] / elapsed if elapsed > 0 else 0.0
            print(f"Success rate: {success_rate:.2%}, Speed: {processing_speed:.1f} pages/sec")
            # Adaptive adjustments, clamped to sane bounds.
            if success_rate >= success_threshold:
                # Increase performance if successful
                current_workers = min(current_workers + 2, 30)
                current_batch_size = min(current_batch_size + 25, 300)
                print("📈 Increasing performance settings")
            else:
                # Decrease performance if too many failures
                current_workers = max(current_workers - 1, 3)
                current_batch_size = max(current_batch_size - 10, 25)
                print("📉 Decreasing performance settings")
            current_page = end_page + 1
            total_processed += test_batch_size
class ScraperRunner:
    """Main runner class for executing scraping strategies."""

    # Maps each strategy name to the single optional kwarg it accepts;
    # strategies absent from this table take no arguments.
    _STRATEGY_ARG = {
        'quick': 'pages',
        'detailed': 'pages',
        'resume': 'failed_file',
        'segmented': 'segments',
    }

    def __init__(self):
        # Strategy name -> async callable (all are staticmethods, no binding
        # issues).
        self.available_strategies = {
            'quick': ScrapingStrategy.quick_sample,
            'full': ScrapingStrategy.full_basic_scrape,
            'detailed': ScrapingStrategy.detailed_scrape,
            'resume': ScrapingStrategy.resume_scrape,
            'segmented': ScrapingStrategy.segmented_scrape,
            'adaptive': ScrapingStrategy.smart_adaptive_scrape
        }

    def list_strategies(self):
        """Print all available strategies to stdout."""
        print("\nAvailable Scraping Strategies:")
        print("=" * 50)
        print("1. quick - Quick sample (100 pages)")
        print("2. full - Full basic scrape (all pages)")
        print("3. detailed - Detailed scrape with company pages")
        print("4. resume - Resume from failed pages")
        print("5. segmented - Segmented scraping")
        print("6. adaptive - Smart adaptive scraping")
        print("=" * 50)

    async def run_strategy(self, strategy_name: str, **kwargs):
        """Run a specific strategy by name.

        Args:
            strategy_name: key into ``available_strategies``.
            **kwargs: optional per-strategy argument (see ``_STRATEGY_ARG``);
                unrelated keys are ignored.
        """
        if strategy_name not in self.available_strategies:
            print(f"Error: Unknown strategy '{strategy_name}'")
            self.list_strategies()
            return
        strategy_func = self.available_strategies[strategy_name]
        print(f"\n🎯 Starting strategy: {strategy_name.upper()}")
        print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        start_time = time.time()
        completed = False
        try:
            # Pass the strategy's single supported kwarg positionally when
            # the caller supplied it; otherwise run with defaults.
            arg_key = self._STRATEGY_ARG.get(strategy_name)
            if arg_key is not None and arg_key in kwargs:
                await strategy_func(kwargs[arg_key])
            else:
                await strategy_func()
            completed = True
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
        except Exception as e:
            print(f"\n❌ Error during scraping: {e}")
            logging.error(f"Strategy {strategy_name} failed: {e}")
        duration = time.time() - start_time
        # Only claim success when the strategy actually ran to completion.
        if completed:
            print(f"\n✅ Strategy completed")
        else:
            print(f"\n🛑 Strategy did not complete")
        print(f"⏱️ Total duration: {duration/3600:.1f} hours")
        print(f"🏁 End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
def main():
    """Main function with command line interface.

    Parses CLI arguments and dispatches to ``ScraperRunner``. ``--list``
    works on its own; otherwise a strategy is required.
    """
    parser = argparse.ArgumentParser(description='ZaubaCorp Parallel Scraper Runner')
    # strategy is optional (nargs='?') so that `--list` is usable by itself;
    # with a required positional, argparse would error out before --list
    # could ever be checked.
    parser.add_argument('strategy', nargs='?',
                        choices=['quick', 'full', 'detailed', 'resume', 'segmented', 'adaptive'],
                        help='Scraping strategy to use')
    parser.add_argument('--pages', type=int, help='Number of pages for quick/detailed strategies')
    parser.add_argument('--segments', type=int, default=10, help='Number of segments for segmented strategy')
    parser.add_argument('--failed-file', type=str, help='Failed pages file for resume strategy')
    parser.add_argument('--list', action='store_true', help='List available strategies')
    args = parser.parse_args()
    runner = ScraperRunner()
    if args.list:
        runner.list_strategies()
        return
    if args.strategy is None:
        # Mirror argparse's usual behavior for a missing required argument.
        parser.error('strategy is required unless --list is given')
    # Validate arguments
    if args.strategy == 'resume' and not args.failed_file:
        print("Error: --failed-file required for resume strategy")
        return
    if args.strategy == 'resume' and not os.path.exists(args.failed_file):
        print(f"Error: Failed file '{args.failed_file}' not found")
        return
    # Prepare kwargs; compare against None so an explicit `--pages 0` is
    # still forwarded rather than silently dropped.
    kwargs = {}
    if args.pages is not None:
        kwargs['pages'] = args.pages
    if args.segments is not None:
        kwargs['segments'] = args.segments
    if args.failed_file:
        kwargs['failed_file'] = args.failed_file
    # Run the strategy
    print("🚀 ZaubaCorp Parallel Scraper Runner")
    print("=" * 50)
    try:
        asyncio.run(runner.run_strategy(args.strategy, **kwargs))
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n💥 Unexpected error: {e}")
if __name__ == "__main__":
    # If no command line arguments, run interactive mode; otherwise hand off
    # to the argparse-based CLI in main().
    if len(sys.argv) == 1:
        def _ask_int(prompt: str, default: int) -> int:
            """Prompt for an integer, falling back to *default* on blank or
            non-numeric input instead of crashing with ValueError."""
            raw = input(prompt).strip()
            if not raw:
                return default
            try:
                return int(raw)
            except ValueError:
                print(f"Invalid number '{raw}', using default {default}")
                return default

        print("🚀 ZaubaCorp Parallel Scraper - Interactive Mode")
        print("=" * 60)
        runner = ScraperRunner()
        runner.list_strategies()
        choice = input("\nSelect strategy (1-6): ").strip()
        # Menu number -> strategy name (mirrors list_strategies()).
        strategy_map = {
            '1': 'quick',
            '2': 'full',
            '3': 'detailed',
            '4': 'resume',
            '5': 'segmented',
            '6': 'adaptive'
        }
        if choice in strategy_map:
            strategy = strategy_map[choice]
            kwargs = {}
            if strategy == 'quick':
                kwargs['pages'] = _ask_int("Number of pages (default 100): ", 100)
            elif strategy == 'detailed':
                kwargs['pages'] = _ask_int("Number of pages (default 1000): ", 1000)
            elif strategy == 'resume':
                failed_file = input("Path to failed pages file: ").strip()
                if not failed_file or not os.path.exists(failed_file):
                    print("Invalid file path")
                    sys.exit(1)
                kwargs['failed_file'] = failed_file
            elif strategy == 'segmented':
                kwargs['segments'] = _ask_int("Number of segments (default 10): ", 10)
            print(f"\n🎯 Running strategy: {strategy}")
            asyncio.run(runner.run_strategy(strategy, **kwargs))
        else:
            print("Invalid choice")
    else:
        main()