first commit

2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions
--- a/example_usage.py
+++ b/example_usage.py
@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""
+Example usage of the ZaubaCorp scraper
+
+This script demonstrates different ways to use the ZaubaCorp scraper
+with various configurations and use cases.
+"""
+
+import os
+import time
+from datetime import datetime
+from zaubacorp_scraper import ZaubaCorpScraper
+from zaubacorp_scraper_enhanced import ZaubaCorpScraperEnhanced
+
+def example_basic_scraping():
+    """Example 1: Basic scraping with default settings"""
+    print("\n" + "="*60)
+    print("EXAMPLE 1: Basic Scraping")
+    print("="*60)
+
+    # Create a scraper instance with basic settings
+    scraper = ZaubaCorpScraper(
+        headless=True,  # Run without browser window
+        output_dir="example_output_basic"
+    )
+
+    try:
+        # Scrape a small number of companies for demonstration
+        scraper.scrape_companies(
+            max_companies=10,  # Limit to 10 companies
+            max_pages=2        # Check only first 2 pages
+        )
+        print("Basic scraping completed successfully!")
+
+    except Exception as e:
+        print(f"Error in basic scraping: {e}")
+
+def example_enhanced_scraping():
+    """Example 2: Enhanced scraping with custom configuration"""
+    print("\n" + "="*60)
+    print("EXAMPLE 2: Enhanced Scraping with Custom Config")
+    print("="*60)
+
+    # Custom configuration
+    custom_config = {
+        'scraping': {
+            'max_companies': 25,
+            'max_pages': 3,
+            'delay_between_requests': 1.5,  # 1.5 seconds between requests
+            'save_interval': 10  # Save every 10 companies
+        },
+        'browser': {
+            'headless': True,
+            'page_load_timeout': 45
+        },
+        'output': {
+            'output_dir': 'example_output_enhanced',
+            'save_formats': ['csv', 'json'],
+            'csv_filename': 'companies_enhanced.csv',
+            'json_filename': 'companies_enhanced.json'
+        }
+    }
+
+    # Create enhanced scraper with custom config
+    scraper = ZaubaCorpScraperEnhanced(config=custom_config)
+
+    try:
+        scraper.scrape_companies()
+        print("Enhanced scraping completed successfully!")
+
+    except Exception as e:
+        print(f"Error in enhanced scraping: {e}")
+
+def example_visual_debugging():
+    """Example 3: Visual debugging mode (browser window visible)"""
+    print("\n" + "="*60)
+    print("EXAMPLE 3: Visual Debugging Mode")
+    print("="*60)
+    print("This will open a browser window so you can see what's happening")
+
+    # Configuration for visual debugging
+    debug_config = {
+        'scraping': {
+            'max_companies': 5,  # Small number for debugging
+            'max_pages': 1,
+            'delay_between_requests': 3  # Slower for observation
+        },
+        'browser': {
+            'headless': False,  # Show browser window
+            'page_load_timeout': 30
+        },
+        'output': {
+            'output_dir': 'example_output_debug'
+        }
+    }
+
+    scraper = ZaubaCorpScraperEnhanced(config=debug_config)
+
+    try:
+        scraper.scrape_companies()
+        print("Visual debugging completed!")
+
+    except Exception as e:
+        print(f"Error in visual debugging: {e}")
+
+def example_data_analysis():
+    """Example 4: Basic data analysis of scraped results"""
+    print("\n" + "="*60)
+    print("EXAMPLE 4: Basic Data Analysis")
+    print("="*60)
+
+    try:
+        import pandas as pd
+
+        # Look for existing CSV files
+        csv_files = [
+            'example_output_basic/zaubacorp_companies.csv',
+            'example_output_enhanced/companies_enhanced.csv',
+            'example_output_debug/zaubacorp_companies.csv'
+        ]
+
+        for csv_file in csv_files:
+            if os.path.exists(csv_file):
+                print(f"\nAnalyzing data from: {csv_file}")
+
+                df = pd.read_csv(csv_file)
+
+                print(f"Total companies: {len(df)}")
+                print(f"Companies with CIN: {df['cin'].notna().sum()}")
+                print(f"Companies with email: {df['email'].notna().sum()}")
+                print(f"Unique states: {df['state'].nunique()}")
+
+                # Show sample data
+                print("\nSample companies:")
+                print(df[['company_name', 'cin', 'state']].head())
+
+                # Company status distribution
+                if 'company_status' in df.columns:
+                    print("\nCompany status distribution:")
+                    print(df['company_status'].value_counts())
+
+                break
+        else:
+            print("No CSV files found. Run one of the scraping examples first.")
+
+    except ImportError:
+        print("pandas not available for data analysis")
+    except Exception as e:
+        print(f"Error in data analysis: {e}")
+
+def example_filtered_scraping():
+    """Example 5: Scraping with custom filters and processing"""
+    print("\n" + "="*60)
+    print("EXAMPLE 5: Custom Filtered Scraping")
+    print("="*60)
+
+    class FilteredScraper(ZaubaCorpScraperEnhanced):
+        """Custom scraper that filters companies by criteria"""
+
+        def scrape_company_details(self, company_url):
+            """Override to add custom filtering"""
+            company_data = super().scrape_company_details(company_url)
+
+            if company_data:
+                # Example: Only keep companies from specific states
+                target_states = ['Maharashtra', 'Karnataka', 'Delhi', 'Tamil Nadu']
+
+                if company_data.get('state') not in target_states:
+                    self.logger.info(f"Filtered out company from {company_data.get('state')}")
+                    return None
+
+                # Example: Only keep companies with CIN
+                if not company_data.get('cin'):
+                    self.logger.info("Filtered out company without CIN")
+                    return None
+
+                self.logger.info(f"Accepted company: {company_data.get('company_name')}")
+
+            return company_data
+
+    # Configuration for filtered scraping
+    filtered_config = {
+        'scraping': {
+            'max_companies': 30,
+            'max_pages': 3,
+            'delay_between_requests': 2
+        },
+        'output': {
+            'output_dir': 'example_output_filtered',
+            'csv_filename': 'filtered_companies.csv'
+        }
+    }
+
+    scraper = FilteredScraper(config=filtered_config)
+
+    try:
+        scraper.scrape_companies()
+        print("Filtered scraping completed!")
+
+    except Exception as e:
+        print(f"Error in filtered scraping: {e}")
+
+def main():
+    """Main function to run all examples"""
+    print("ZaubaCorp Scraper Examples")
+    print("=" * 60)
+    print("This script demonstrates various ways to use the ZaubaCorp scraper.")
+    print("WARNING: This will actually scrape data from zaubacorp.com")
+    print("Make sure you comply with their terms of service.")
+
+    # Ask user which examples to run
+    examples = {
+        '1': ('Basic Scraping', example_basic_scraping),
+        '2': ('Enhanced Scraping', example_enhanced_scraping),
+        '3': ('Visual Debugging', example_visual_debugging),
+        '4': ('Data Analysis', example_data_analysis),
+        '5': ('Filtered Scraping', example_filtered_scraping),
+        'all': ('Run All Examples', None)
+    }
+
+    print("\nAvailable examples:")
+    for key, (name, _) in examples.items():
+        print(f"{key}. {name}")
+
+    choice = input("\nEnter your choice (1-5, all, or 'q' to quit): ").strip().lower()
+
+    if choice == 'q':
+        print("Goodbye!")
+        return
+
+    start_time = datetime.now()
+
+    if choice == 'all':
+        # Run all examples except visual debugging (to avoid interruption)
+        for key, (name, func) in examples.items():
+            if key not in ['3', 'all'] and func:
+                print(f"\nRunning {name}...")
+                try:
+                    func()
+                except KeyboardInterrupt:
+                    print("\nExample interrupted by user")
+                    break
+                except Exception as e:
+                    print(f"Example failed: {e}")
+
+                # Small delay between examples
+                time.sleep(2)
+
+    elif choice in examples and examples[choice][1]:
+        name, func = examples[choice]
+        print(f"\nRunning {name}...")
+        try:
+            func()
+        except KeyboardInterrupt:
+            print("\nExample interrupted by user")
+        except Exception as e:
+            print(f"Example failed: {e}")
+
+    else:
+        print("Invalid choice!")
+        return
+
+    end_time = datetime.now()
+    duration = end_time - start_time
+
+    print(f"\nAll examples completed in {duration}")
+    print("\nCheck the output directories for results:")
+    for directory in ['example_output_basic', 'example_output_enhanced',
+                     'example_output_debug', 'example_output_filtered']:
+        if os.path.exists(directory):
+            files = os.listdir(directory)
+            print(f"  {directory}/: {files}")
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\nProgram interrupted by user. Goodbye!")
+    except Exception as e:
+        print(f"Unexpected error: {e}")