first commit

This commit is contained in:
govardhan
2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions

280
example_usage.py Normal file
View File

@ -0,0 +1,280 @@
#!/usr/bin/env python3
"""
Example usage of the ZaubaCorp scraper
This script demonstrates different ways to use the ZaubaCorp scraper
with various configurations and use cases.
"""
import os
import time
from datetime import datetime
from zaubacorp_scraper import ZaubaCorpScraper
from zaubacorp_scraper_enhanced import ZaubaCorpScraperEnhanced
def example_basic_scraping():
    """Example 1: run the basic scraper with its simplest setup.

    Creates a headless ``ZaubaCorpScraper`` that writes into
    ``example_output_basic`` and requests at most 10 companies from at most
    2 pages. Any exception raised while scraping is printed rather than
    propagated.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 1: Basic Scraping")
    print(rule)

    # headless=True runs without a visible browser window.
    basic_scraper = ZaubaCorpScraper(
        headless=True,
        output_dir="example_output_basic",
    )

    # Small limits on purpose: this is a demonstration, not a full crawl.
    demo_limits = {
        "max_companies": 10,  # stop after 10 companies
        "max_pages": 2,       # look at the first 2 pages only
    }
    try:
        basic_scraper.scrape_companies(**demo_limits)
        print("Basic scraping completed successfully!")
    except Exception as exc:
        print(f"Error in basic scraping: {exc}")
def example_enhanced_scraping():
    """Example 2: run the enhanced scraper with a custom configuration.

    Assembles a configuration dict with ``scraping``, ``browser`` and
    ``output`` sections, hands it to ``ZaubaCorpScraperEnhanced`` and starts
    the scrape. Any exception raised while scraping is printed rather than
    propagated.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 2: Enhanced Scraping with Custom Config")
    print(rule)

    # Each section is built separately, then combined in the original order.
    scraping_section = {
        'max_companies': 25,
        'max_pages': 3,
        'delay_between_requests': 1.5,  # seconds to wait between requests
        'save_interval': 10,            # save after every 10 companies
    }
    browser_section = {
        'headless': True,
        'page_load_timeout': 45,
    }
    output_section = {
        'output_dir': 'example_output_enhanced',
        'save_formats': ['csv', 'json'],
        'csv_filename': 'companies_enhanced.csv',
        'json_filename': 'companies_enhanced.json',
    }
    custom_config = {
        'scraping': scraping_section,
        'browser': browser_section,
        'output': output_section,
    }

    enhanced_scraper = ZaubaCorpScraperEnhanced(config=custom_config)
    try:
        enhanced_scraper.scrape_companies()
        print("Enhanced scraping completed successfully!")
    except Exception as exc:
        print(f"Error in enhanced scraping: {exc}")
def example_visual_debugging():
    """Example 3: run the enhanced scraper with the browser window visible.

    Uses a very small workload (5 companies, 1 page) and a longer delay
    between requests so the run can be watched. Any exception raised while
    scraping is printed rather than propagated.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 3: Visual Debugging Mode")
    print(rule)
    print("This will open a browser window so you can see what's happening")

    debug_config = dict(
        scraping={
            'max_companies': 5,            # keep the run short while debugging
            'max_pages': 1,
            'delay_between_requests': 3,   # slow down so each step is observable
        },
        browser={
            'headless': False,             # show the browser window
            'page_load_timeout': 30,
        },
        output={
            'output_dir': 'example_output_debug',
        },
    )

    debug_scraper = ZaubaCorpScraperEnhanced(config=debug_config)
    try:
        debug_scraper.scrape_companies()
        print("Visual debugging completed!")
    except Exception as exc:
        print(f"Error in visual debugging: {exc}")
def example_data_analysis(csv_files=None):
    """Example 4: Basic data analysis of scraped results.

    Probes *csv_files* in order and prints summary statistics for the first
    path that exists on disk; later paths are not inspected. Statistics that
    depend on a column are printed only when that column is present, so a
    CSV lacking e.g. ``email`` still yields the remaining figures instead of
    aborting the whole analysis.

    Args:
        csv_files: Optional iterable of CSV paths to probe, in priority
            order. When omitted, the CSV paths under the basic, enhanced and
            debug example output directories are used.

    Returns:
        None. Results and errors are reported on stdout.
    """
    print("\n" + "="*60)
    print("EXAMPLE 4: Basic Data Analysis")
    print("="*60)

    # Keep the ImportError guard around the import only, so an ImportError
    # raised later (e.g. inside read_csv) is not misreported as
    # "pandas not available".
    try:
        import pandas as pd
    except ImportError:
        print("pandas not available for data analysis")
        return

    if csv_files is None:
        # Default locations to look for existing CSV files
        csv_files = [
            'example_output_basic/zaubacorp_companies.csv',
            'example_output_enhanced/companies_enhanced.csv',
            'example_output_debug/zaubacorp_companies.csv'
        ]

    try:
        for csv_file in csv_files:
            if not os.path.exists(csv_file):
                continue

            print(f"\nAnalyzing data from: {csv_file}")
            df = pd.read_csv(csv_file)
            print(f"Total companies: {len(df)}")

            # Per-column statistics are guarded the same way the status
            # distribution is: skipped when the column is absent.
            if 'cin' in df.columns:
                print(f"Companies with CIN: {df['cin'].notna().sum()}")
            if 'email' in df.columns:
                print(f"Companies with email: {df['email'].notna().sum()}")
            if 'state' in df.columns:
                print(f"Unique states: {df['state'].nunique()}")

            # Show sample data, restricted to the columns that exist
            sample_columns = [
                column for column in ('company_name', 'cin', 'state')
                if column in df.columns
            ]
            if sample_columns:
                print("\nSample companies:")
                print(df[sample_columns].head())

            # Company status distribution
            if 'company_status' in df.columns:
                print("\nCompany status distribution:")
                print(df['company_status'].value_counts())

            # Only the first existing file is analyzed.
            break
        else:
            # Loop finished without break: no candidate path existed.
            print("No CSV files found. Run one of the scraping examples first.")
    except Exception as e:
        print(f"Error in data analysis: {e}")
def example_filtered_scraping():
    """Example 5: scrape with a subclass that drops unwanted companies.

    Defines a local ``ZaubaCorpScraperEnhanced`` subclass whose
    ``scrape_company_details`` returns ``None`` for companies outside a fixed
    list of states or without a CIN, then runs it with its own configuration.
    Any exception raised while scraping is printed rather than propagated.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 5: Custom Filtered Scraping")
    print(rule)

    class FilteredScraper(ZaubaCorpScraperEnhanced):
        """Scraper variant that keeps only companies matching the criteria."""

        def scrape_company_details(self, company_url):
            """Scrape one company, then apply the state and CIN filters."""
            details = super().scrape_company_details(company_url)

            # Nothing scraped: pass the parent's (falsy) result through as-is.
            if not details:
                return details

            # Filter 1: only companies registered in one of these states.
            state = details.get('state')
            if state not in ('Maharashtra', 'Karnataka', 'Delhi', 'Tamil Nadu'):
                self.logger.info(f"Filtered out company from {state}")
                return None

            # Filter 2: only companies that have a CIN.
            if not details.get('cin'):
                self.logger.info("Filtered out company without CIN")
                return None

            self.logger.info(f"Accepted company: {details.get('company_name')}")
            return details

    filtered_config = dict(
        scraping={
            'max_companies': 30,
            'max_pages': 3,
            'delay_between_requests': 2,
        },
        output={
            'output_dir': 'example_output_filtered',
            'csv_filename': 'filtered_companies.csv',
        },
    )

    filtering_scraper = FilteredScraper(config=filtered_config)
    try:
        filtering_scraper.scrape_companies()
        print("Filtered scraping completed!")
    except Exception as exc:
        print(f"Error in filtered scraping: {exc}")
def main():
    """Interactive entry point: show the menu and run the chosen example(s).

    Reads one choice from stdin. ``q`` exits immediately, ``all`` runs every
    example except the visual-debugging one, and a single key runs just that
    example. Afterwards the elapsed time and the contents of any existing
    output directories are printed. An unrecognised choice prints a message
    and returns without the summary.
    """
    for banner_line in (
        "ZaubaCorp Scraper Examples",
        "=" * 60,
        "This script demonstrates various ways to use the ZaubaCorp scraper.",
        "WARNING: This will actually scrape data from zaubacorp.com",
        "Make sure you comply with their terms of service.",
    ):
        print(banner_line)

    # Menu key -> (label, callable). 'all' has no callable of its own.
    menu = {
        '1': ('Basic Scraping', example_basic_scraping),
        '2': ('Enhanced Scraping', example_enhanced_scraping),
        '3': ('Visual Debugging', example_visual_debugging),
        '4': ('Data Analysis', example_data_analysis),
        '5': ('Filtered Scraping', example_filtered_scraping),
        'all': ('Run All Examples', None),
    }

    print("\nAvailable examples:")
    for option, entry in menu.items():
        print(f"{option}. {entry[0]}")

    selection = input("\nEnter your choice (1-5, all, or 'q' to quit): ").strip().lower()
    if selection == 'q':
        print("Goodbye!")
        return

    started_at = datetime.now()

    if selection == 'all':
        # Visual debugging is skipped here so the batch run is not interrupted
        # by a browser window.
        for option, (label, runner) in menu.items():
            if option in ['3', 'all'] or not runner:
                continue
            print(f"\nRunning {label}...")
            try:
                runner()
            except KeyboardInterrupt:
                print("\nExample interrupted by user")
                break
            except Exception as exc:
                print(f"Example failed: {exc}")
            # Short pause before moving on to the next example.
            time.sleep(2)
    elif selection in menu and menu[selection][1]:
        label, runner = menu[selection]
        print(f"\nRunning {label}...")
        try:
            runner()
        except KeyboardInterrupt:
            print("\nExample interrupted by user")
        except Exception as exc:
            print(f"Example failed: {exc}")
    else:
        print("Invalid choice!")
        return

    elapsed = datetime.now() - started_at
    print(f"\nAll examples completed in {elapsed}")

    print("\nCheck the output directories for results:")
    output_dirs = (
        'example_output_basic',
        'example_output_enhanced',
        'example_output_debug',
        'example_output_filtered',
    )
    for directory in output_dirs:
        if not os.path.exists(directory):
            continue
        print(f" {directory}/: {os.listdir(directory)}")
if __name__ == "__main__":
    # Script entry point. This is the outermost boundary, so Ctrl-C and any
    # other exception are reported as a short message instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nProgram interrupted by user. Goodbye!")
    except Exception as unexpected:
        print(f"Unexpected error: {unexpected}")