#!/usr/bin/env python3
"""
Run Parallel Scraper - Execution Script

Various strategies for scraping ZaubaCorp data efficiently
"""

# Standard library
import argparse
import asyncio
import logging
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Optional

# Import our modules
from zaubacorp_parallel_scraper import ZaubaCorpParallelScraper
from parallel_config import ParallelConfig, print_config_summary, validate_config
class ScrapingStrategy:
    """Different scraping strategies for various use cases.

    Each strategy is a stateless coroutine (exposed as a staticmethod) that
    builds a settings dict via ``ParallelConfig.get_config``, constructs a
    ``ZaubaCorpParallelScraper`` and drives it over some page range.
    """

    # Total number of listing pages on ZaubaCorp.  Hard-coded site constant,
    # previously duplicated in three methods — TODO confirm it is current.
    TOTAL_PAGES = 90769

    @staticmethod
    async def quick_sample(num_pages: int = 100):
        """Quick sample scraping for testing.

        Args:
            num_pages: Number of listing pages to scrape (basic info only).
        """
        print(f"\n🚀 QUICK SAMPLE STRATEGY - {num_pages} pages")
        print("=" * 60)

        # Conservative profile with a small worker pool — this is a smoke test.
        config = ParallelConfig.get_config(
            'conservative',
            max_workers=5,
            batch_size=20,
            output_dir='sample_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=num_pages,
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def full_basic_scrape():
        """Full scraping of all companies list (basic info only)."""
        print("\n🔥 FULL BASIC SCRAPE STRATEGY - All 90,769 pages")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'aggressive',
            max_workers=25,
            batch_size=250,
            output_dir='full_basic_output'
        )

        print_config_summary(config)

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=None,  # None means "every page" to the scraper
            batch_size=config['batch_size'],
            scrape_details=False
        )

    @staticmethod
    async def detailed_scrape(max_pages: int = 1000):
        """Detailed scraping including company detail pages.

        Args:
            max_pages: Number of listing pages to scrape; each listed company's
                detail page is fetched as well (scrape_details=True).
        """
        print(f"\n🔍 DETAILED SCRAPE STRATEGY - {max_pages} pages with details")
        print("=" * 60)

        config = ParallelConfig.get_config(
            'balanced',
            max_workers=10,
            batch_size=50,
            output_dir='detailed_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        await scraper.scrape_all_companies(
            start_page=1,
            end_page=max_pages,
            batch_size=config['batch_size'],
            scrape_details=True  # This will scrape company detail pages
        )

    @staticmethod
    async def resume_scrape(failed_pages_file: str):
        """Resume scraping from failed pages.

        Args:
            failed_pages_file: Path to a JSON file containing a list of page
                numbers that previously failed.
        """
        print(f"\n🔄 RESUME SCRAPE STRATEGY - From {failed_pages_file}")
        print("=" * 60)

        import json

        # Load failed pages; bail out (do not crash) on any load problem.
        try:
            with open(failed_pages_file, 'r') as f:
                failed_pages = json.load(f)
        except Exception as e:
            print(f"Error loading failed pages file: {e}")
            return

        # Fix: validate the payload shape before iterating/slicing it.
        if not isinstance(failed_pages, list):
            print(f"Error loading failed pages file: expected a JSON list, got {type(failed_pages).__name__}")
            return

        print(f"Found {len(failed_pages)} failed pages to retry")
        if not failed_pages:
            return  # nothing to retry

        config = ParallelConfig.get_config(
            'conservative',  # More conservative for retries
            max_workers=8,
            batch_size=25,
            output_dir='resume_output'
        )

        scraper = ZaubaCorpParallelScraper(
            max_workers=config['max_workers'],
            output_dir=config['output_dir']
        )

        # Process failed pages in smaller batches
        batch_size = 25
        for i in range(0, len(failed_pages), batch_size):
            batch = failed_pages[i:i + batch_size]
            print(f"Processing retry batch {i//batch_size + 1}")

            await scraper.scrape_pages_batch(
                session=None,  # Will create its own session
                page_numbers=batch,
                batch_num=i//batch_size + 1,
                scrape_details=False
            )

    @staticmethod
    async def segmented_scrape(segments: int = 10):
        """Divide scraping into segments for distributed processing.

        Args:
            segments: Number of roughly-equal page ranges to process in turn.
        """
        print(f"\n📊 SEGMENTED SCRAPE STRATEGY - {segments} segments")
        print("=" * 60)

        total_pages = ScrapingStrategy.TOTAL_PAGES
        # Fix: clamp so pages_per_segment can never be 0 (which previously
        # produced empty/negative page ranges when segments > total_pages).
        segments = max(1, min(segments, total_pages))
        pages_per_segment = total_pages // segments

        for segment in range(segments):
            start_page = segment * pages_per_segment + 1
            end_page = (segment + 1) * pages_per_segment

            if segment == segments - 1:  # Last segment gets remaining pages
                end_page = total_pages

            print(f"\n--- Segment {segment + 1}/{segments}: Pages {start_page}-{end_page} ---")

            config = ParallelConfig.get_config(
                'balanced',
                output_dir=f'segment_{segment + 1}_output'
            )

            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )

            await scraper.scrape_all_companies(
                start_page=start_page,
                end_page=end_page,
                batch_size=config['batch_size'],
                scrape_details=False
            )

            print(f"Completed segment {segment + 1}")

    @staticmethod
    async def smart_adaptive_scrape():
        """Adaptive scraping that adjusts worker/batch settings by success rate."""
        print("\n🧠 SMART ADAPTIVE SCRAPE STRATEGY")
        print("=" * 60)

        # Start with conservative settings and ramp up/down between phases.
        current_workers = 5
        current_batch_size = 50
        success_threshold = 0.8  # 80% success rate required to scale up

        total_processed = 0
        current_page = 1
        total_pages = ScrapingStrategy.TOTAL_PAGES

        while current_page <= total_pages:
            print(f"\nAdaptive phase: workers={current_workers}, batch_size={current_batch_size}")

            config = ParallelConfig.get_config(
                'balanced',
                max_workers=current_workers,
                batch_size=current_batch_size,
                output_dir='adaptive_output'
            )

            scraper = ZaubaCorpParallelScraper(
                max_workers=config['max_workers'],
                output_dir=config['output_dir']
            )

            # Process a test batch (two batches' worth, capped at the end).
            test_batch_size = min(current_batch_size * 2, total_pages - current_page + 1)
            end_page = min(current_page + test_batch_size - 1, total_pages)

            start_time = time.time()
            await scraper.scrape_all_companies(
                start_page=current_page,
                end_page=end_page,
                batch_size=current_batch_size,
                scrape_details=False
            )
            end_time = time.time()

            # Calculate success rate.  Fix: guard both divisions — the phase can
            # end with zero attempted pages (stats counters both 0) or in ~0s.
            # NOTE(review): assumes scraper.stats carries integer counters
            # 'pages_processed' and 'failed_pages' — confirm against scraper.
            attempted = scraper.stats['pages_processed'] + scraper.stats['failed_pages']
            success_rate = (scraper.stats['pages_processed'] / attempted) if attempted else 0.0
            elapsed = max(end_time - start_time, 1e-9)
            processing_speed = scraper.stats['pages_processed'] / elapsed

            print(f"Success rate: {success_rate:.2%}, Speed: {processing_speed:.1f} pages/sec")

            # Adaptive adjustments
            if success_rate >= success_threshold:
                # Increase performance if successful (hard caps: 30 workers, 300/batch)
                current_workers = min(current_workers + 2, 30)
                current_batch_size = min(current_batch_size + 25, 300)
                print("📈 Increasing performance settings")
            else:
                # Decrease performance if too many failures (floors: 3 workers, 25/batch)
                current_workers = max(current_workers - 1, 3)
                current_batch_size = max(current_batch_size - 10, 25)
                print("📉 Decreasing performance settings")

            current_page = end_page + 1
            total_processed += test_batch_size
class ScraperRunner:
    """Main runner class for executing scraping strategies"""

    def __init__(self):
        # Map CLI strategy names to their coroutine implementations.
        self.available_strategies = {
            'quick': ScrapingStrategy.quick_sample,
            'full': ScrapingStrategy.full_basic_scrape,
            'detailed': ScrapingStrategy.detailed_scrape,
            'resume': ScrapingStrategy.resume_scrape,
            'segmented': ScrapingStrategy.segmented_scrape,
            'adaptive': ScrapingStrategy.smart_adaptive_scrape
        }

    def list_strategies(self):
        """List all available strategies"""
        divider = "=" * 50
        menu = (
            "\nAvailable Scraping Strategies:",
            divider,
            "1. quick - Quick sample (100 pages)",
            "2. full - Full basic scrape (all pages)",
            "3. detailed - Detailed scrape with company pages",
            "4. resume - Resume from failed pages",
            "5. segmented - Segmented scraping",
            "6. adaptive - Smart adaptive scraping",
            divider,
        )
        for entry in menu:
            print(entry)

    async def run_strategy(self, strategy_name: str, **kwargs):
        """Run a specific strategy.

        Args:
            strategy_name: Key into ``available_strategies``.
            **kwargs: Optional strategy argument under the CLI's keyword name
                ('pages', 'failed_file', or 'segments'); ignored by strategies
                that take no argument.
        """
        strategy_func = self.available_strategies.get(strategy_name)
        if strategy_func is None:
            print(f"Error: Unknown strategy '{strategy_name}'")
            self.list_strategies()
            return

        print(f"\n🎯 Starting strategy: {strategy_name.upper()}")
        print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        started_at = time.time()

        # Each strategy takes at most one positional argument; this table maps
        # the strategy to the kwargs key that carries it.
        positional_key = {
            'quick': 'pages',
            'detailed': 'pages',
            'resume': 'failed_file',
            'segmented': 'segments',
        }.get(strategy_name)

        try:
            if positional_key is not None and positional_key in kwargs:
                await strategy_func(kwargs[positional_key])
            else:
                await strategy_func()
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
        except Exception as e:
            print(f"\n❌ Error during scraping: {e}")
            logging.error(f"Strategy {strategy_name} failed: {e}")

        duration = time.time() - started_at

        print("\n✅ Strategy completed")
        print(f"⏱️ Total duration: {duration/3600:.1f} hours")
        print(f"🏁 End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
def main():
    """Main function with command line interface"""
    parser = argparse.ArgumentParser(description='ZaubaCorp Parallel Scraper Runner')
    parser.add_argument('strategy', choices=['quick', 'full', 'detailed', 'resume', 'segmented', 'adaptive'],
                        help='Scraping strategy to use')
    parser.add_argument('--pages', type=int, help='Number of pages for quick/detailed strategies')
    parser.add_argument('--segments', type=int, default=10, help='Number of segments for segmented strategy')
    parser.add_argument('--failed-file', type=str, help='Failed pages file for resume strategy')
    parser.add_argument('--list', action='store_true', help='List available strategies')

    args = parser.parse_args()

    runner = ScraperRunner()

    # --list short-circuits everything else.
    if args.list:
        runner.list_strategies()
        return

    # Validate resume-specific arguments up front (guard clauses).
    if args.strategy == 'resume':
        if not args.failed_file:
            print("Error: --failed-file required for resume strategy")
            return
        if not os.path.exists(args.failed_file):
            print(f"Error: Failed file '{args.failed_file}' not found")
            return

    # Forward only the options the user actually supplied (truthy values).
    kwargs = {
        name: value
        for name, value in (
            ('pages', args.pages),
            ('segments', args.segments),
            ('failed_file', args.failed_file),
        )
        if value
    }

    # Run the strategy
    print("🚀 ZaubaCorp Parallel Scraper Runner")
    print("=" * 50)

    try:
        asyncio.run(runner.run_strategy(args.strategy, **kwargs))
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n💥 Unexpected error: {e}")
if __name__ == "__main__":
    # If no command line arguments, run interactive mode
    if len(sys.argv) == 1:
        print("🚀 ZaubaCorp Parallel Scraper - Interactive Mode")
        print("=" * 60)

        runner = ScraperRunner()
        runner.list_strategies()

        choice = input("\nSelect strategy (1-6): ").strip()

        # Menu number -> strategy name (mirrors list_strategies()).
        strategy_map = {
            '1': 'quick',
            '2': 'full',
            '3': 'detailed',
            '4': 'resume',
            '5': 'segmented',
            '6': 'adaptive'
        }

        if choice in strategy_map:
            strategy = strategy_map[choice]
            kwargs = {}

            def _ask_int(prompt: str, default: int) -> int:
                """Prompt for an integer; return default on blank or invalid input.

                Fix: the original called int() directly on the raw input and
                crashed with an uncaught ValueError on non-numeric text.
                """
                raw = input(prompt).strip()
                if not raw:
                    return default
                try:
                    return int(raw)
                except ValueError:
                    print(f"Invalid number '{raw}', using default {default}")
                    return default

            if strategy == 'quick':
                kwargs['pages'] = _ask_int("Number of pages (default 100): ", 100)

            elif strategy == 'detailed':
                kwargs['pages'] = _ask_int("Number of pages (default 1000): ", 1000)

            elif strategy == 'resume':
                failed_file = input("Path to failed pages file: ").strip()
                if not failed_file or not os.path.exists(failed_file):
                    print("Invalid file path")
                    sys.exit(1)
                kwargs['failed_file'] = failed_file

            elif strategy == 'segmented':
                kwargs['segments'] = _ask_int("Number of segments (default 10): ", 10)

            print(f"\n🎯 Running strategy: {strategy}")
            asyncio.run(runner.run_strategy(strategy, **kwargs))
        else:
            print("Invalid choice")
    else:
        main()