# Files
# comp/zaubacorp_parallel_scraper.py
# 2025-08-18 23:16:46 +05:30
#
# 443 lines
# 18 KiB
# Python
import asyncio
import aiohttp
import aiofiles
import pandas as pd
import json
import time
import logging
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from typing import List, Dict, Optional
import random
class ZaubaCorpParallelScraper:
    """Asynchronously scrape company listings (and optionally per-company
    detail pages) from zaubacorp.com.

    Pages are processed in batches of concurrent requests; each batch is
    persisted as JSON + CSV, and a consolidated dump plus run statistics
    are written at the end.
    """

    def __init__(self, max_workers=10, output_dir="zaubacorp_parallel_data"):
        """Initialize the scraper.

        Args:
            max_workers: Upper bound on concurrently processed batches and
                per-host connections.
            output_dir: Directory for batch files, final dumps and the log.
        """
        self.base_url = "https://www.zaubacorp.com"
        self.companies_list_base = "https://www.zaubacorp.com/companies-list"
        self.max_workers = max_workers
        self.output_dir = output_dir
        self.scraped_companies = []
        self.failed_pages = []
        # Guards the shared stats/collections. No await happens while the
        # lock is held, so a threading.Lock is safe from coroutine code here.
        self.lock = threading.Lock()
        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)
        # Setup logging
        self.setup_logging()
        # Statistics
        self.stats = {
            'total_pages': 90769,  # Based on your observation
            'pages_processed': 0,
            'companies_found': 0,
            'companies_detailed': 0,
            'failed_pages': 0,
            'start_time': None,
            'end_time': None
        }
        # User agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0'
        ]

    def setup_logging(self):
        """Setup logging configuration (file inside output_dir + console)."""
        log_file = os.path.join(self.output_dir, 'parallel_scraper.log')
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def get_random_headers(self):
        """Get random headers to avoid detection.

        Returns:
            dict: Browser-like request headers with a randomly chosen
            User-Agent from ``self.user_agents``.
        """
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    async def fetch_page(self, session: aiohttp.ClientSession, url: str, retries: int = 3) -> Optional[str]:
        """Fetch a single page with retry logic.

        Retries with exponential backoff on HTTP 429 and with a short
        random delay after exceptions. Returns the body text on HTTP 200,
        otherwise None after all attempts are exhausted.
        """
        for attempt in range(retries):
            try:
                headers = self.get_random_headers()
                async with session.get(url, headers=headers,
                                       timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status == 200:
                        return await response.text()
                    elif response.status == 429:  # Rate limited
                        wait_time = (2 ** attempt) + random.uniform(0, 1)
                        await asyncio.sleep(wait_time)
                        continue
                    else:
                        self.logger.warning(f"HTTP {response.status} for {url}")
            except Exception as e:
                self.logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < retries - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return None

    def parse_companies_list_page(self, html: str, page_num: int) -> List[Dict]:
        """Parse companies from a list page.

        Scans every table row; rows with at least five cells are assumed to
        be (CIN, Name, Status, Paid-up capital, Address). Rows missing a
        name or CIN are skipped. Malformed rows are logged and ignored.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            companies = []
            # Find all table rows with company data
            rows = soup.find_all('tr')
            for row in rows:
                cells = row.find_all('td')
                if len(cells) >= 5:  # Should have CIN, Name, Status, Capital, Address
                    try:
                        # Extract CIN and company URL
                        cin_cell = cells[0]
                        cin_link = cin_cell.find('a')
                        cin = cin_link.text.strip() if cin_link else cin_cell.text.strip()
                        company_url = cin_link.get('href') if cin_link else ''
                        # Extract company name
                        name_cell = cells[1]
                        name_link = name_cell.find('a')
                        company_name = name_link.text.strip() if name_link else name_cell.text.strip()
                        # Extract status
                        status = cells[2].text.strip()
                        # Extract paid up capital
                        paid_up_capital = cells[3].text.strip()
                        # Extract address
                        address = cells[4].text.strip()
                        if company_name and cin:  # Only add if we have essential data
                            company_data = {
                                'cin': cin,
                                'company_name': company_name,
                                'status': status,
                                'paid_up_capital': paid_up_capital,
                                'address': address,
                                'company_url': urljoin(self.base_url, company_url) if company_url else '',
                                'page_number': page_num,
                                'scraped_at': datetime.now().isoformat()
                            }
                            companies.append(company_data)
                    except Exception as e:
                        self.logger.warning(f"Error parsing row on page {page_num}: {e}")
                        continue
            return companies
        except Exception as e:
            self.logger.error(f"Error parsing page {page_num}: {e}")
            return []

    async def scrape_companies_list_page(self, session: aiohttp.ClientSession, page_num: int) -> List[Dict]:
        """Scrape a single companies list page.

        Updates shared statistics under the lock; on failure records the
        page number in ``self.failed_pages`` and returns an empty list.
        """
        url = f"{self.companies_list_base}/p-{page_num}-company.html"
        html = await self.fetch_page(session, url)
        if html:
            companies = self.parse_companies_list_page(html, page_num)
            with self.lock:
                self.stats['pages_processed'] += 1
                self.stats['companies_found'] += len(companies)
            self.logger.info(f"Page {page_num}: Found {len(companies)} companies")
            return companies
        else:
            with self.lock:
                self.failed_pages.append(page_num)
                self.stats['failed_pages'] += 1
            self.logger.error(f"Failed to fetch page {page_num}")
            return []

    async def scrape_company_details(self, session: aiohttp.ClientSession, company: Dict) -> Dict:
        """Scrape detailed information from a company profile page.

        Mutates and returns ``company``, adding any of the known key/value
        fields found in the profile tables. Returns the dict unchanged if
        there is no URL or the fetch fails.
        """
        if not company.get('company_url'):
            return company
        try:
            html = await self.fetch_page(session, company['company_url'])
            if html:
                soup = BeautifulSoup(html, 'html.parser')
                # Extract additional details from company page
                # This is a basic extraction - can be enhanced based on page structure
                # Look for tables with company details
                tables = soup.find_all('table')
                for table in tables:
                    rows = table.find_all('tr')
                    for row in rows:
                        cells = row.find_all(['td', 'th'])
                        if len(cells) >= 2:
                            key = cells[0].get_text().strip().lower()
                            value = cells[1].get_text().strip()
                            # Map common fields
                            if 'registration number' in key:
                                company['registration_number'] = value
                            elif 'authorized capital' in key:
                                company['authorized_capital'] = value
                            elif 'company category' in key:
                                company['company_category'] = value
                            elif 'class of company' in key:
                                company['class_of_company'] = value
                            elif 'roc' in key:
                                company['roc'] = value
                            elif 'registration date' in key or 'incorporation' in key:
                                company['registration_date'] = value
                            elif 'email' in key:
                                company['email'] = value
                            elif 'phone' in key or 'mobile' in key:
                                company['phone'] = value
                with self.lock:
                    self.stats['companies_detailed'] += 1
            return company
        except Exception as e:
            self.logger.error(f"Error scraping company details for {company.get('company_name', 'Unknown')}: {e}")
            return company

    async def save_batch_data(self, companies: List[Dict], batch_num: int):
        """Save a batch of companies data as both JSON and CSV."""
        if not companies:
            return
        try:
            # Save as JSON
            json_file = os.path.join(self.output_dir, f'companies_batch_{batch_num}.json')
            async with aiofiles.open(json_file, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(companies, indent=2, ensure_ascii=False))
            # Save as CSV
            csv_file = os.path.join(self.output_dir, f'companies_batch_{batch_num}.csv')
            df = pd.DataFrame(companies)
            df.to_csv(csv_file, index=False, encoding='utf-8')
            self.logger.info(f"Saved batch {batch_num} with {len(companies)} companies")
        except Exception as e:
            self.logger.error(f"Error saving batch {batch_num}: {e}")

    async def scrape_pages_batch(self, session: aiohttp.ClientSession, page_numbers: List[int], batch_num: int, scrape_details: bool = False):
        """Scrape a batch of pages, persist the batch, and return its companies."""
        all_companies = []
        # Create semaphore to limit concurrent requests per batch
        semaphore = asyncio.Semaphore(5)  # Max 5 concurrent requests per batch

        async def scrape_single_page(page_num):
            # One list page, optionally followed by its companies' detail pages.
            async with semaphore:
                companies = await self.scrape_companies_list_page(session, page_num)
                if scrape_details and companies:
                    # Scrape detailed information for each company
                    detailed_companies = []
                    for company in companies:
                        detailed_company = await self.scrape_company_details(session, company)
                        detailed_companies.append(detailed_company)
                        # Small delay between detail requests
                        await asyncio.sleep(random.uniform(0.1, 0.3))
                    return detailed_companies
                return companies

        # Process all pages in this batch
        tasks = [scrape_single_page(page_num) for page_num in page_numbers]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Collect results
        for result in results:
            if isinstance(result, list):
                all_companies.extend(result)
            elif isinstance(result, Exception):
                self.logger.error(f"Batch {batch_num} task failed: {result}")
        # Save batch data
        if all_companies:
            await self.save_batch_data(all_companies, batch_num)
            # Also add to main collection
            with self.lock:
                self.scraped_companies.extend(all_companies)
        return all_companies

    async def scrape_all_companies(self, start_page: int = 1, end_page: int = None, batch_size: int = 100, scrape_details: bool = False):
        """Main method to scrape all companies using parallel processing.

        Splits [start_page, end_page] into batches of ``batch_size`` pages,
        runs up to ``max_workers`` batches concurrently, then writes the
        consolidated results and prints statistics.
        """
        self.stats['start_time'] = datetime.now()
        end_page = end_page or self.stats['total_pages']
        self.logger.info(f"Starting parallel scraping from page {start_page} to {end_page}")
        self.logger.info(f"Batch size: {batch_size}, Max workers: {self.max_workers}")
        self.logger.info(f"Scrape details: {scrape_details}")
        # Create page batches
        page_ranges = []
        for i in range(start_page, end_page + 1, batch_size):
            batch_end = min(i + batch_size - 1, end_page)
            page_ranges.append(list(range(i, batch_end + 1)))
        self.logger.info(f"Created {len(page_ranges)} batches")
        # Configure connection limits
        connector = aiohttp.TCPConnector(
            limit=self.max_workers * 2,
            limit_per_host=self.max_workers,
            ttl_dns_cache=300,
            use_dns_cache=True,
        )
        timeout = aiohttp.ClientTimeout(total=60, connect=30)
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            # Process batches with limited concurrency
            semaphore = asyncio.Semaphore(self.max_workers)

            async def process_batch(batch_pages, batch_num):
                async with semaphore:
                    return await self.scrape_pages_batch(session, batch_pages, batch_num, scrape_details)

            # Create tasks for all batches
            tasks = [
                process_batch(batch_pages, i)
                for i, batch_pages in enumerate(page_ranges, 1)
            ]
            # Process batches
            self.logger.info("Starting batch processing...")
            results = await asyncio.gather(*tasks, return_exceptions=True)
            # Log results
            successful_batches = sum(1 for r in results if isinstance(r, list))
            failed_batches = len(results) - successful_batches
            self.logger.info(f"Completed: {successful_batches} successful, {failed_batches} failed batches")
        self.stats['end_time'] = datetime.now()
        await self.save_final_results()
        self.print_final_stats()

    async def save_final_results(self):
        """Save final consolidated results: full JSON/CSV dumps, run
        statistics, and the list of failed pages (if any).

        Statistics are written even when no companies were scraped, so an
        interrupted or failed run still leaves a record.
        """
        try:
            if self.scraped_companies:
                # Save consolidated JSON
                json_file = os.path.join(self.output_dir, 'all_companies.json')
                async with aiofiles.open(json_file, 'w', encoding='utf-8') as f:
                    await f.write(json.dumps(self.scraped_companies, indent=2, ensure_ascii=False))
                # Save consolidated CSV
                csv_file = os.path.join(self.output_dir, 'all_companies.csv')
                df = pd.DataFrame(self.scraped_companies)
                df.to_csv(csv_file, index=False, encoding='utf-8')
                self.logger.info(f"Saved {len(self.scraped_companies)} companies to final files")
            # Save statistics (always — datetimes serialized as ISO strings)
            stats_file = os.path.join(self.output_dir, 'scraping_statistics.json')
            async with aiofiles.open(stats_file, 'w', encoding='utf-8') as f:
                stats_copy = self.stats.copy()
                if stats_copy['start_time']:
                    stats_copy['start_time'] = stats_copy['start_time'].isoformat()
                if stats_copy['end_time']:
                    stats_copy['end_time'] = stats_copy['end_time'].isoformat()
                await f.write(json.dumps(stats_copy, indent=2))
            # Save failed pages
            if self.failed_pages:
                failed_file = os.path.join(self.output_dir, 'failed_pages.json')
                async with aiofiles.open(failed_file, 'w', encoding='utf-8') as f:
                    await f.write(json.dumps(self.failed_pages, indent=2))
        except Exception as e:
            self.logger.error(f"Error saving final results: {e}")

    def print_final_stats(self):
        """Print final statistics.

        Rates are guarded against zero denominators so an empty or
        instantaneous run does not raise ZeroDivisionError.
        """
        duration = self.stats['end_time'] - self.stats['start_time']
        seconds = duration.total_seconds()
        attempted = self.stats['pages_processed'] + self.stats['failed_pages']
        success_rate = (self.stats['pages_processed'] / attempted * 100) if attempted else 0.0
        pages_per_second = self.stats['pages_processed'] / seconds if seconds else 0.0
        companies_per_minute = self.stats['companies_found'] / (seconds / 60) if seconds else 0.0
        print("\n" + "=" * 80)
        print("ZAUBACORP PARALLEL SCRAPING COMPLETED")
        print("=" * 80)
        print(f"Total pages processed: {self.stats['pages_processed']:,}")
        print(f"Total companies found: {self.stats['companies_found']:,}")
        print(f"Companies with details: {self.stats['companies_detailed']:,}")
        print(f"Failed pages: {self.stats['failed_pages']:,}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Duration: {duration}")
        print(f"Average speed: {pages_per_second:.2f} pages/second")
        print(f"Companies per minute: {companies_per_minute:.0f}")
        print(f"Output directory: {self.output_dir}")
        print("=" * 80)
def main():
    """Main function to run the parallel scraper.

    Configures the scraper from the module-level constants below, runs it,
    and on Ctrl-C attempts to persist whatever was collected so far.
    """
    print("ZaubaCorp Parallel Scraper")
    print("=" * 50)
    # Configuration
    MAX_WORKERS = 20  # Adjust based on your system and network
    BATCH_SIZE = 200  # Pages per batch
    START_PAGE = 1
    END_PAGE = 1000  # Set to None for all pages (90,769)
    SCRAPE_DETAILS = False  # Set to True to scrape company detail pages (much slower)
    OUTPUT_DIR = "zaubacorp_parallel_data"
    print("Configuration:")
    print(f"  Max workers: {MAX_WORKERS}")
    print(f"  Batch size: {BATCH_SIZE}")
    print(f"  Page range: {START_PAGE} to {END_PAGE or 90769}")
    print(f"  Scrape details: {SCRAPE_DETAILS}")
    print(f"  Output directory: {OUTPUT_DIR}")
    # Create scraper
    scraper = ZaubaCorpParallelScraper(
        max_workers=MAX_WORKERS,
        output_dir=OUTPUT_DIR
    )
    # Run scraper
    try:
        asyncio.run(scraper.scrape_all_companies(
            start_page=START_PAGE,
            end_page=END_PAGE,
            batch_size=BATCH_SIZE,
            scrape_details=SCRAPE_DETAILS
        ))
    except KeyboardInterrupt:
        # Best effort: flush partial results before exiting.
        print("\nScraping interrupted by user")
        asyncio.run(scraper.save_final_results())
    except Exception as e:
        print(f"Unexpected error: {e}")
        logging.error(f"Unexpected error: {e}")


if __name__ == "__main__":
    main()