281 lines
8.9 KiB
Python
281 lines
8.9 KiB
Python
#!/usr/bin/env python3
"""
Example usage of the ZaubaCorp scraper

This script demonstrates different ways to use the ZaubaCorp scraper
with various configurations and use cases.
"""
|
|
|
|
import os
|
|
import time
|
|
from datetime import datetime
|
|
from zaubacorp_scraper import ZaubaCorpScraper
|
|
from zaubacorp_scraper_enhanced import ZaubaCorpScraperEnhanced
|
|
|
|
def example_basic_scraping():
    """Example 1: run the basic scraper with a small, fixed workload.

    Builds a headless ``ZaubaCorpScraper`` that writes to
    ``example_output_basic`` and asks it for at most 10 companies over at
    most 2 pages. Any exception raised while scraping is printed to stdout
    instead of being re-raised.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("EXAMPLE 1: Basic Scraping")
    print(banner)

    # headless=True runs without a visible browser window.
    constructor_options = {
        "headless": True,
        "output_dir": "example_output_basic",
    }
    basic_scraper = ZaubaCorpScraper(**constructor_options)

    # Deliberately small limits: this is a demonstration run.
    run_limits = {
        "max_companies": 10,  # Limit to 10 companies
        "max_pages": 2,       # Check only first 2 pages
    }

    try:
        basic_scraper.scrape_companies(**run_limits)
        print("Basic scraping completed successfully!")
    except Exception as err:
        print(f"Error in basic scraping: {err}")
|
|
|
|
def example_enhanced_scraping():
    """Example 2: run the enhanced scraper with a hand-written config.

    The configuration dict has three sections (``scraping``, ``browser`` and
    ``output``) and is passed to ``ZaubaCorpScraperEnhanced`` through its
    ``config`` keyword. Any exception raised while scraping is printed to
    stdout instead of being re-raised.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("EXAMPLE 2: Enhanced Scraping with Custom Config")
    print(banner)

    # Each section is built separately, then assembled in the same order.
    scraping_section = {
        'max_companies': 25,
        'max_pages': 3,
        'delay_between_requests': 1.5,  # 1.5 seconds between requests
        'save_interval': 10,            # Save every 10 companies
    }
    browser_section = {
        'headless': True,
        'page_load_timeout': 45,
    }
    output_section = {
        'output_dir': 'example_output_enhanced',
        'save_formats': ['csv', 'json'],
        'csv_filename': 'companies_enhanced.csv',
        'json_filename': 'companies_enhanced.json',
    }
    custom_config = {
        'scraping': scraping_section,
        'browser': browser_section,
        'output': output_section,
    }

    enhanced_scraper = ZaubaCorpScraperEnhanced(config=custom_config)

    try:
        enhanced_scraper.scrape_companies()
        print("Enhanced scraping completed successfully!")
    except Exception as err:
        print(f"Error in enhanced scraping: {err}")
|
|
|
|
def example_visual_debugging():
    """Example 3: run the enhanced scraper with the browser window visible.

    Uses ``headless: False`` together with a small workload and a longer
    delay between requests so the run can be watched. Any exception raised
    while scraping is printed to stdout instead of being re-raised.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("EXAMPLE 3: Visual Debugging Mode")
    print(banner)
    print("This will open a browser window so you can see what's happening")

    # Configuration for visual debugging, one section at a time.
    scraping_section = {
        'max_companies': 5,           # Small number for debugging
        'max_pages': 1,
        'delay_between_requests': 3,  # Slower for observation
    }
    browser_section = {
        'headless': False,            # Show browser window
        'page_load_timeout': 30,
    }
    output_section = {
        'output_dir': 'example_output_debug',
    }
    debug_config = {
        'scraping': scraping_section,
        'browser': browser_section,
        'output': output_section,
    }

    debug_scraper = ZaubaCorpScraperEnhanced(config=debug_config)

    try:
        debug_scraper.scrape_companies()
        print("Visual debugging completed!")
    except Exception as err:
        print(f"Error in visual debugging: {err}")
|
|
|
|
def example_data_analysis():
    """Example 4: print summary statistics for the first scraped CSV found.

    Looks for the CSV files the other examples are configured to write, in a
    fixed order, and analyses only the first one that exists. For that file
    it prints the row count, per-column statistics, a five-row sample and
    the ``company_status`` distribution.

    Every column-specific statistic is printed only when the column is
    present, so a CSV that lacks e.g. ``email`` still produces the rest of
    the report instead of aborting with a ``KeyError``.

    Nothing is returned. If pandas cannot be imported, or no candidate file
    exists, a short notice is printed. Any other exception raised while
    reading or summarising the file is printed and swallowed.
    """
    print("\n" + "=" * 60)
    print("EXAMPLE 4: Basic Data Analysis")
    print("=" * 60)

    # pandas is optional for this script, so import it lazily and bail out
    # with a notice when it is missing.
    try:
        import pandas as pd
    except ImportError:
        print("pandas not available for data analysis")
        return

    # Candidate outputs, in priority order. The last entry is the file the
    # filtered example is configured to write.
    csv_files = [
        'example_output_basic/zaubacorp_companies.csv',
        'example_output_enhanced/companies_enhanced.csv',
        'example_output_debug/zaubacorp_companies.csv',
        'example_output_filtered/filtered_companies.csv',
    ]

    csv_file = next((path for path in csv_files if os.path.exists(path)), None)
    if csv_file is None:
        print("No CSV files found. Run one of the scraping examples first.")
        return

    try:
        print(f"\nAnalyzing data from: {csv_file}")

        df = pd.read_csv(csv_file)
        columns = set(df.columns)

        print(f"Total companies: {len(df)}")
        if 'cin' in columns:
            print(f"Companies with CIN: {df['cin'].notna().sum()}")
        if 'email' in columns:
            print(f"Companies with email: {df['email'].notna().sum()}")
        if 'state' in columns:
            print(f"Unique states: {df['state'].nunique()}")

        # Show sample data, restricted to the columns this file really has.
        sample_columns = [
            name for name in ('company_name', 'cin', 'state')
            if name in columns
        ]
        if sample_columns:
            print("\nSample companies:")
            print(df[sample_columns].head())

        # Company status distribution
        if 'company_status' in columns:
            print("\nCompany status distribution:")
            print(df['company_status'].value_counts())

    except Exception as e:
        # Best-effort example: report the problem rather than crash the menu.
        print(f"Error in data analysis: {e}")
|
|
|
|
def example_filtered_scraping():
    """Example 5: scrape with a subclass that drops unwanted companies.

    Defines a local ``ZaubaCorpScraperEnhanced`` subclass whose
    ``scrape_company_details`` override returns ``None`` for companies
    outside a fixed list of states or without a CIN, then runs it with a
    small configuration. Any exception raised while scraping is printed to
    stdout instead of being re-raised.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("EXAMPLE 5: Custom Filtered Scraping")
    print(banner)

    class FilteredScraper(ZaubaCorpScraperEnhanced):
        """Scraper variant that keeps only companies matching the criteria."""

        def scrape_company_details(self, company_url):
            """Delegate to the parent, then apply the state and CIN filters."""
            details = super().scrape_company_details(company_url)

            # Nothing scraped: hand the parent's falsy result back untouched.
            if not details:
                return details

            # Example: Only keep companies from specific states
            wanted_states = ('Maharashtra', 'Karnataka', 'Delhi', 'Tamil Nadu')
            state = details.get('state')
            if state not in wanted_states:
                self.logger.info(f"Filtered out company from {state}")
                return None

            # Example: Only keep companies with CIN
            if not details.get('cin'):
                self.logger.info("Filtered out company without CIN")
                return None

            self.logger.info(f"Accepted company: {details.get('company_name')}")
            return details

    # Configuration for filtered scraping
    scraping_section = {
        'max_companies': 30,
        'max_pages': 3,
        'delay_between_requests': 2,
    }
    output_section = {
        'output_dir': 'example_output_filtered',
        'csv_filename': 'filtered_companies.csv',
    }
    filtered_config = {
        'scraping': scraping_section,
        'output': output_section,
    }

    filtering_scraper = FilteredScraper(config=filtered_config)

    try:
        filtering_scraper.scrape_companies()
        print("Filtered scraping completed!")
    except Exception as err:
        print(f"Error in filtered scraping: {err}")
|
|
|
|
def main():
    """Show the example menu, run the chosen example(s), and summarise.

    Reads one choice from stdin: ``1``-``5`` runs a single example, ``all``
    runs every example except visual debugging, and ``q`` quits. Any other
    input prints "Invalid choice!" and returns. After a run, the elapsed
    time and the contents of each existing output directory are printed.
    """
    print("ZaubaCorp Scraper Examples")
    print("=" * 60)
    print("This script demonstrates various ways to use the ZaubaCorp scraper.")
    print("WARNING: This will actually scrape data from zaubacorp.com")
    print("Make sure you comply with their terms of service.")

    # Menu key -> (label, callable). 'all' has no callable of its own.
    examples = {
        '1': ('Basic Scraping', example_basic_scraping),
        '2': ('Enhanced Scraping', example_enhanced_scraping),
        '3': ('Visual Debugging', example_visual_debugging),
        '4': ('Data Analysis', example_data_analysis),
        '5': ('Filtered Scraping', example_filtered_scraping),
        'all': ('Run All Examples', None)
    }

    print("\nAvailable examples:")
    for key, entry in examples.items():
        label = entry[0]
        print(f"{key}. {label}")

    raw_choice = input("\nEnter your choice (1-5, all, or 'q' to quit): ")
    choice = raw_choice.strip().lower()

    if choice == 'q':
        print("Goodbye!")
        return

    start_time = datetime.now()
    selected = examples.get(choice)

    if choice == 'all':
        # Run all examples except visual debugging (to avoid interruption)
        for key, (name, func) in examples.items():
            if key in ('3', 'all') or not func:
                continue

            print(f"\nRunning {name}...")
            try:
                func()
            except KeyboardInterrupt:
                print("\nExample interrupted by user")
                break
            except Exception as err:
                print(f"Example failed: {err}")

            # Small delay between examples
            time.sleep(2)

    elif selected and selected[1]:
        name, func = selected
        print(f"\nRunning {name}...")
        try:
            func()
        except KeyboardInterrupt:
            print("\nExample interrupted by user")
        except Exception as err:
            print(f"Example failed: {err}")

    else:
        print("Invalid choice!")
        return

    duration = datetime.now() - start_time

    print(f"\nAll examples completed in {duration}")
    print("\nCheck the output directories for results:")
    output_directories = [
        'example_output_basic',
        'example_output_enhanced',
        'example_output_debug',
        'example_output_filtered',
    ]
    for directory in output_directories:
        if not os.path.exists(directory):
            continue
        files = os.listdir(directory)
        print(f"  {directory}/: {files}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the menu and turn any escaping exception into
    # a one-line message instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C outside of a running example (e.g. at the prompt).
        print("\nProgram interrupted by user. Goodbye!")
    except Exception as err:
        print(f"Unexpected error: {err}")
|