#!/usr/bin/env python3
"""
Example usage of the ZaubaCorp scraper

This script demonstrates different ways to use the ZaubaCorp scraper
with various configurations and use cases.

Run it directly and pick an example from the interactive menu. Every
scraping example writes into its own ``example_output_*`` directory.
"""

import os
import time
from datetime import datetime

from zaubacorp_scraper import ZaubaCorpScraper
from zaubacorp_scraper_enhanced import ZaubaCorpScraperEnhanced

# Width of the "=====" rules printed around section titles.
_BANNER_WIDTH = 60

# Output directories used by the scraping examples below; main() lists
# their contents once the selected examples have run.
_OUTPUT_DIRS = (
    'example_output_basic',
    'example_output_enhanced',
    'example_output_debug',
    'example_output_filtered',
)

# Pause (seconds) inserted between consecutive examples in "all" mode.
_PAUSE_BETWEEN_EXAMPLES = 2


def _print_banner(title):
    """Print *title* framed by two horizontal rules, preceded by a blank line."""
    print("\n" + "=" * _BANNER_WIDTH)
    print(title)
    print("=" * _BANNER_WIDTH)


def example_basic_scraping():
    """Example 1: Basic scraping with default settings.

    Any exception raised while scraping is reported on stdout and
    swallowed so that a failing example does not abort the script.
    """
    _print_banner("EXAMPLE 1: Basic Scraping")

    # Create a scraper instance with basic settings
    scraper = ZaubaCorpScraper(
        headless=True,  # Run without browser window
        output_dir="example_output_basic",
    )

    try:
        # Scrape a small number of companies for demonstration
        scraper.scrape_companies(
            max_companies=10,  # Limit to 10 companies
            max_pages=2,       # Check only first 2 pages
        )
    except Exception as e:
        print(f"Error in basic scraping: {e}")
    else:
        print("Basic scraping completed successfully!")


def example_enhanced_scraping():
    """Example 2: Enhanced scraping with custom configuration.

    Builds a nested configuration dict (``scraping`` / ``browser`` /
    ``output`` sections) and hands it to ``ZaubaCorpScraperEnhanced``.
    Errors are reported on stdout and swallowed.
    """
    _print_banner("EXAMPLE 2: Enhanced Scraping with Custom Config")

    # Custom configuration
    custom_config = {
        'scraping': {
            'max_companies': 25,
            'max_pages': 3,
            'delay_between_requests': 1.5,  # 1.5 seconds between requests
            'save_interval': 10,            # Save every 10 companies
        },
        'browser': {
            'headless': True,
            'page_load_timeout': 45,
        },
        'output': {
            'output_dir': 'example_output_enhanced',
            'save_formats': ['csv', 'json'],
            'csv_filename': 'companies_enhanced.csv',
            'json_filename': 'companies_enhanced.json',
        },
    }

    # Create enhanced scraper with custom config
    scraper = ZaubaCorpScraperEnhanced(config=custom_config)

    try:
        scraper.scrape_companies()
    except Exception as e:
        print(f"Error in enhanced scraping: {e}")
    else:
        print("Enhanced scraping completed successfully!")


def example_visual_debugging():
    """Example 3: Visual debugging mode (browser window visible).

    Uses a tiny workload and a longer delay between requests so the
    browser can be watched while it works. Errors are reported on
    stdout and swallowed.
    """
    _print_banner("EXAMPLE 3: Visual Debugging Mode")
    print("This will open a browser window so you can see what's happening")

    # Configuration for visual debugging
    debug_config = {
        'scraping': {
            'max_companies': 5,           # Small number for debugging
            'max_pages': 1,
            'delay_between_requests': 3,  # Slower for observation
        },
        'browser': {
            'headless': False,            # Show browser window
            'page_load_timeout': 30,
        },
        'output': {
            'output_dir': 'example_output_debug',
        },
    }

    scraper = ZaubaCorpScraperEnhanced(config=debug_config)

    try:
        scraper.scrape_companies()
    except Exception as e:
        print(f"Error in visual debugging: {e}")
    else:
        print("Visual debugging completed!")


def example_data_analysis():
    """Example 4: Basic data analysis of scraped results.

    Loads the first CSV that exists among the known example outputs and
    prints a few summary statistics. Each statistic is printed only when
    the column it needs is present, so a CSV with a reduced schema still
    produces a partial report instead of aborting on the first missing
    column. Requires pandas; prints a notice and returns if it is not
    installed.
    """
    _print_banner("EXAMPLE 4: Basic Data Analysis")

    # Import separately from the analysis so that only a genuinely missing
    # pandas is reported as "not available".
    try:
        import pandas as pd
    except ImportError:
        print("pandas not available for data analysis")
        return

    # Look for existing CSV files
    csv_files = [
        'example_output_basic/zaubacorp_companies.csv',
        'example_output_enhanced/companies_enhanced.csv',
        'example_output_debug/zaubacorp_companies.csv',
    ]
    csv_file = next((path for path in csv_files if os.path.exists(path)), None)
    if csv_file is None:
        print("No CSV files found. Run one of the scraping examples first.")
        return

    try:
        print(f"\nAnalyzing data from: {csv_file}")
        df = pd.read_csv(csv_file)
        available = set(df.columns)

        print(f"Total companies: {len(df)}")
        if 'cin' in available:
            print(f"Companies with CIN: {df['cin'].notna().sum()}")
        if 'email' in available:
            print(f"Companies with email: {df['email'].notna().sum()}")
        if 'state' in available:
            print(f"Unique states: {df['state'].nunique()}")

        # Show sample data, restricted to the columns this CSV actually has
        sample_columns = [
            column
            for column in ('company_name', 'cin', 'state')
            if column in available
        ]
        if sample_columns:
            print("\nSample companies:")
            print(df[sample_columns].head())

        # Company status distribution
        if 'company_status' in available:
            print("\nCompany status distribution:")
            print(df['company_status'].value_counts())
    except Exception as e:
        print(f"Error in data analysis: {e}")


def example_filtered_scraping():
    """Example 5: Scraping with custom filters and processing.

    Demonstrates subclassing ``ZaubaCorpScraperEnhanced`` and overriding
    ``scrape_company_details`` to drop records that do not match the
    filter criteria. Errors are reported on stdout and swallowed.
    """
    _print_banner("EXAMPLE 5: Custom Filtered Scraping")

    class FilteredScraper(ZaubaCorpScraperEnhanced):
        """Custom scraper that filters companies by criteria"""

        # Example: Only keep companies from specific states
        TARGET_STATES = ('Maharashtra', 'Karnataka', 'Delhi', 'Tamil Nadu')

        def scrape_company_details(self, company_url):
            """Override to add custom filtering.

            Delegates to the parent implementation. A truthy result is
            returned only if its ``state`` is one of ``TARGET_STATES``
            and it has a non-empty ``cin``; otherwise ``None`` is
            returned. A falsy parent result is passed through unchanged.
            """
            company_data = super().scrape_company_details(company_url)
            if not company_data:
                return company_data

            state = company_data.get('state')
            if state not in self.TARGET_STATES:
                self.logger.info(f"Filtered out company from {state}")
                return None

            # Example: Only keep companies with CIN
            if not company_data.get('cin'):
                self.logger.info("Filtered out company without CIN")
                return None

            self.logger.info(
                f"Accepted company: {company_data.get('company_name')}"
            )
            return company_data

    # Configuration for filtered scraping
    filtered_config = {
        'scraping': {
            'max_companies': 30,
            'max_pages': 3,
            'delay_between_requests': 2,
        },
        'output': {
            'output_dir': 'example_output_filtered',
            'csv_filename': 'filtered_companies.csv',
        },
    }

    scraper = FilteredScraper(config=filtered_config)

    try:
        scraper.scrape_companies()
    except Exception as e:
        print(f"Error in filtered scraping: {e}")
    else:
        print("Filtered scraping completed!")


def _run_example(func):
    """Run one example callable, reporting failures on stdout.

    Returns ``False`` if the user interrupted the example with Ctrl-C and
    ``True`` otherwise (including when the example raised an ordinary
    exception, which is reported but not propagated).
    """
    try:
        func()
    except KeyboardInterrupt:
        print("\nExample interrupted by user")
        return False
    except Exception as e:
        print(f"Example failed: {e}")
    return True


def main():
    """Main function to run all examples.

    Shows the menu, reads the user's choice, runs the selected example(s)
    and finally lists the contents of any output directories that exist.
    Choosing ``all`` runs every example except visual debugging.
    """
    print("ZaubaCorp Scraper Examples")
    print("=" * _BANNER_WIDTH)
    print("This script demonstrates various ways to use the ZaubaCorp scraper.")
    print("WARNING: This will actually scrape data from zaubacorp.com")
    print("Make sure you comply with their terms of service.")

    # Ask user which examples to run
    examples = {
        '1': ('Basic Scraping', example_basic_scraping),
        '2': ('Enhanced Scraping', example_enhanced_scraping),
        '3': ('Visual Debugging', example_visual_debugging),
        '4': ('Data Analysis', example_data_analysis),
        '5': ('Filtered Scraping', example_filtered_scraping),
        'all': ('Run All Examples', None),
    }

    print("\nAvailable examples:")
    for key, (name, _) in examples.items():
        print(f"{key}. {name}")

    try:
        choice = input(
            "\nEnter your choice (1-5, all, or 'q' to quit): "
        ).strip().lower()
    except EOFError:
        # stdin is closed (e.g. non-interactive run): treat it as quitting.
        print()
        choice = 'q'

    if choice == 'q':
        print("Goodbye!")
        return

    if choice == 'all':
        # Run all examples except visual debugging (to avoid interruption)
        selected = [
            (name, func)
            for key, (name, func) in examples.items()
            if key != '3' and func
        ]
    elif choice in examples and examples[choice][1]:
        selected = [examples[choice]]
    else:
        print("Invalid choice!")
        return

    start_time = datetime.now()
    interrupted = False

    for index, (name, func) in enumerate(selected):
        if index:
            # Small delay between examples (not after the last one)
            time.sleep(_PAUSE_BETWEEN_EXAMPLES)
        print(f"\nRunning {name}...")
        if not _run_example(func):
            interrupted = True
            break

    duration = datetime.now() - start_time
    if interrupted:
        print(f"\nRun interrupted after {duration}")
    else:
        print(f"\nAll examples completed in {duration}")

    print("\nCheck the output directories for results:")
    for directory in _OUTPUT_DIRS:
        # isdir rather than exists: listdir would fail on a plain file.
        if os.path.isdir(directory):
            files = os.listdir(directory)
            print(f"  {directory}/: {files}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nProgram interrupted by user. Goodbye!")
    except Exception as e:
        print(f"Unexpected error: {e}")