first commit
This commit is contained in:
280
example_usage.py
Normal file
280
example_usage.py
Normal file
@ -0,0 +1,280 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Example usage of the ZaubaCorp scraper
|
||||
|
||||
This script demonstrates different ways to use the ZaubaCorp scraper
|
||||
with various configurations and use cases.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from zaubacorp_scraper import ZaubaCorpScraper
|
||||
from zaubacorp_scraper_enhanced import ZaubaCorpScraperEnhanced
|
||||
|
||||
def example_basic_scraping():
    """Example 1: run the basic scraper with a small, fixed workload.

    Creates a ``ZaubaCorpScraper`` with ``headless=True`` writing to
    ``example_output_basic`` and calls ``scrape_companies`` limited to
    10 companies and 2 pages. Any ``Exception`` raised by the scrape call
    is printed instead of being propagated to the caller.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print("EXAMPLE 1: Basic Scraping")
    print(banner)

    # Constructor settings for the demonstration scraper.
    scraper_options = {
        "headless": True,  # Run without browser window
        "output_dir": "example_output_basic",
    }

    # Keep the run small: this is a demonstration, not a full crawl.
    run_limits = {
        "max_companies": 10,  # Limit to 10 companies
        "max_pages": 2,  # Check only first 2 pages
    }

    basic_scraper = ZaubaCorpScraper(**scraper_options)

    try:
        basic_scraper.scrape_companies(**run_limits)
        print("Basic scraping completed successfully!")
    except Exception as err:
        print(f"Error in basic scraping: {err}")
|
||||
|
||||
def example_enhanced_scraping():
    """Example 2: run the enhanced scraper from a nested configuration dict.

    Assembles a config with ``scraping``, ``browser`` and ``output``
    sections, passes it to ``ZaubaCorpScraperEnhanced(config=...)`` and calls
    ``scrape_companies()`` with no arguments. How each key is interpreted is
    up to ``ZaubaCorpScraperEnhanced`` (not visible here); the inline notes
    below restate the intent given by the example's author. Any
    ``Exception`` raised by the scrape call is printed, not propagated.
    """
    separator = "=" * 60
    print(f"\n{separator}")
    print("EXAMPLE 2: Enhanced Scraping with Custom Config")
    print(separator)

    # Each section is built on its own, then combined in the original
    # key order ('scraping', 'browser', 'output').
    scraping_section = {
        'max_companies': 25,
        'max_pages': 3,
        'delay_between_requests': 1.5,  # 1.5 seconds between requests
        'save_interval': 10,  # Save every 10 companies
    }
    browser_section = {
        'headless': True,
        'page_load_timeout': 45,
    }
    output_section = {
        'output_dir': 'example_output_enhanced',
        'save_formats': ['csv', 'json'],
        'csv_filename': 'companies_enhanced.csv',
        'json_filename': 'companies_enhanced.json',
    }
    custom_config = {
        'scraping': scraping_section,
        'browser': browser_section,
        'output': output_section,
    }

    # Create enhanced scraper with custom config
    enhanced_scraper = ZaubaCorpScraperEnhanced(config=custom_config)

    try:
        enhanced_scraper.scrape_companies()
        print("Enhanced scraping completed successfully!")
    except Exception as err:
        print(f"Error in enhanced scraping: {err}")
|
||||
|
||||
def example_visual_debugging():
    """Example 3: run the enhanced scraper with a visible browser window.

    Uses a deliberately tiny workload (5 companies, 1 page) and
    ``'headless': False`` so the run can be watched. Results go to
    ``example_output_debug``. Any ``Exception`` raised by the scrape call is
    printed rather than propagated.
    """
    divider = "=" * 60
    print(f"\n{divider}")
    print("EXAMPLE 3: Visual Debugging Mode")
    print(divider)
    print("This will open a browser window so you can see what's happening")

    # Configuration for visual debugging, assembled section by section in
    # the original key order.
    scraping_section = {
        'max_companies': 5,  # Small number for debugging
        'max_pages': 1,
        'delay_between_requests': 3,  # Slower for observation
    }
    browser_section = {
        'headless': False,  # Show browser window
        'page_load_timeout': 30,
    }
    output_section = {
        'output_dir': 'example_output_debug',
    }
    debug_config = {
        'scraping': scraping_section,
        'browser': browser_section,
        'output': output_section,
    }

    debug_scraper = ZaubaCorpScraperEnhanced(config=debug_config)

    try:
        debug_scraper.scrape_companies()
        print("Visual debugging completed!")
    except Exception as err:
        print(f"Error in visual debugging: {err}")
|
||||
|
||||
def example_data_analysis():
    """Example 4: Basic data analysis of scraped results.

    Checks a fixed list of CSV paths (relative to the current working
    directory), loads the first one that exists with pandas and prints
    summary statistics: row count, non-null counts for ``cin`` and
    ``email``, number of distinct ``state`` values, a small sample and,
    when present, the ``company_status`` distribution.

    Columns that are absent from the CSV are reported as ``n/a`` instead of
    aborting the whole analysis with a ``KeyError``. Output for a CSV that
    has every expected column is unchanged.

    Nothing is returned; all results are printed. A missing pandas
    installation and any other ``Exception`` are reported on stdout rather
    than raised.
    """
    print("\n" + "="*60)
    print("EXAMPLE 4: Basic Data Analysis")
    print("="*60)

    try:
        # Imported lazily so the scraping examples work without pandas.
        import pandas as pd

        # Look for existing CSV files
        csv_files = [
            'example_output_basic/zaubacorp_companies.csv',
            'example_output_enhanced/companies_enhanced.csv',
            'example_output_debug/zaubacorp_companies.csv'
        ]

        # Only the first file that exists is analysed.
        csv_file = next(
            (path for path in csv_files if os.path.exists(path)), None
        )
        if csv_file is None:
            print("No CSV files found. Run one of the scraping examples first.")
            return

        print(f"\nAnalyzing data from: {csv_file}")

        df = pd.read_csv(csv_file)

        print(f"Total companies: {len(df)}")

        # Non-null counts; guard each column because the CSV schema depends
        # on what the scraper managed to extract.
        for label, column in (("CIN", 'cin'), ("email", 'email')):
            if column in df.columns:
                print(f"Companies with {label}: {df[column].notna().sum()}")
            else:
                print(f"Companies with {label}: n/a (no '{column}' column)")

        if 'state' in df.columns:
            print(f"Unique states: {df['state'].nunique()}")
        else:
            print("Unique states: n/a (no 'state' column)")

        # Show sample data, restricted to whichever preferred columns exist.
        print("\nSample companies:")
        sample_columns = [
            column
            for column in ['company_name', 'cin', 'state']
            if column in df.columns
        ]
        if sample_columns:
            print(df[sample_columns].head())
        else:
            # None of the preferred columns exist; show the raw head instead.
            print(df.head())

        # Company status distribution
        if 'company_status' in df.columns:
            print("\nCompany status distribution:")
            print(df['company_status'].value_counts())

    except ImportError:
        print("pandas not available for data analysis")
    except Exception as e:
        print(f"Error in data analysis: {e}")
|
||||
|
||||
def example_filtered_scraping():
    """Example 5: scrape with a subclass that rejects unwanted companies.

    Defines a local ``FilteredScraper`` subclass of
    ``ZaubaCorpScraperEnhanced`` whose ``scrape_company_details`` override
    returns ``None`` for companies outside a fixed list of states or without
    a CIN, then runs it with a small config writing to
    ``example_output_filtered``. Any ``Exception`` raised by the scrape call
    is printed rather than propagated.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("EXAMPLE 5: Custom Filtered Scraping")
    print(rule)

    class FilteredScraper(ZaubaCorpScraperEnhanced):
        """Custom scraper that filters companies by criteria"""

        def scrape_company_details(self, company_url):
            """Return the parent's result, or ``None`` if it is filtered out.

            NOTE(review): presumably the base class skips companies for
            which this method returns ``None`` — confirm against
            ``ZaubaCorpScraperEnhanced``.
            """
            details = super().scrape_company_details(company_url)

            # Nothing scraped (falsy result): hand it back unchanged.
            if not details:
                return details

            # Example: Only keep companies from specific states
            target_states = ['Maharashtra', 'Karnataka', 'Delhi', 'Tamil Nadu']
            state = details.get('state')
            if state not in target_states:
                self.logger.info(f"Filtered out company from {state}")
                return None

            # Example: Only keep companies with CIN
            if not details.get('cin'):
                self.logger.info("Filtered out company without CIN")
                return None

            self.logger.info(f"Accepted company: {details.get('company_name')}")
            return details

    # Configuration for filtered scraping, built per section in the
    # original key order.
    scraping_section = {
        'max_companies': 30,
        'max_pages': 3,
        'delay_between_requests': 2,
    }
    output_section = {
        'output_dir': 'example_output_filtered',
        'csv_filename': 'filtered_companies.csv',
    }
    filtered_config = {
        'scraping': scraping_section,
        'output': output_section,
    }

    filtering_scraper = FilteredScraper(config=filtered_config)

    try:
        filtering_scraper.scrape_companies()
        print("Filtered scraping completed!")
    except Exception as err:
        print(f"Error in filtered scraping: {err}")
|
||||
|
||||
def main():
    """Interactive entry point: let the user pick which example(s) to run.

    Prints a menu, reads one choice from stdin and dispatches:

    * ``q`` quits immediately.
    * ``all`` runs every example except ``3`` (visual debugging, which opens
      a browser window), pausing 2 seconds after each one.
    * ``1``-``5`` runs the single matching example.
    * anything else prints "Invalid choice!" and returns.

    If stdin is closed or exhausted (e.g. the script is run
    non-interactively) the resulting ``EOFError`` is treated as a request to
    quit instead of surfacing as an unexpected error.

    After a run, the elapsed time and the contents of any existing output
    directories are printed. Returns ``None``.
    """
    print("ZaubaCorp Scraper Examples")
    print("=" * 60)
    print("This script demonstrates various ways to use the ZaubaCorp scraper.")
    print("WARNING: This will actually scrape data from zaubacorp.com")
    print("Make sure you comply with their terms of service.")

    # Menu key -> (display name, callable). 'all' has no callable of its own;
    # it is handled explicitly below.
    examples = {
        '1': ('Basic Scraping', example_basic_scraping),
        '2': ('Enhanced Scraping', example_enhanced_scraping),
        '3': ('Visual Debugging', example_visual_debugging),
        '4': ('Data Analysis', example_data_analysis),
        '5': ('Filtered Scraping', example_filtered_scraping),
        'all': ('Run All Examples', None)
    }

    print("\nAvailable examples:")
    for key, (name, _) in examples.items():
        print(f"{key}. {name}")

    try:
        choice = input("\nEnter your choice (1-5, all, or 'q' to quit): ").strip().lower()
    except EOFError:
        # No interactive stdin available: behave as if the user quit.
        print("\nNo input available. Goodbye!")
        return

    if choice == 'q':
        print("Goodbye!")
        return

    start_time = datetime.now()

    if choice == 'all':
        # Run all examples except visual debugging (to avoid interruption)
        for key, (name, func) in examples.items():
            if key not in ['3', 'all'] and func:
                print(f"\nRunning {name}...")
                try:
                    func()
                except KeyboardInterrupt:
                    # Ctrl+C stops the remaining examples but still prints
                    # the summary below.
                    print("\nExample interrupted by user")
                    break
                except Exception as e:
                    print(f"Example failed: {e}")

                # Small delay between examples
                time.sleep(2)

    elif choice in examples and examples[choice][1]:
        name, func = examples[choice]
        print(f"\nRunning {name}...")
        try:
            func()
        except KeyboardInterrupt:
            print("\nExample interrupted by user")
        except Exception as e:
            print(f"Example failed: {e}")

    else:
        print("Invalid choice!")
        return

    end_time = datetime.now()
    duration = end_time - start_time

    print(f"\nAll examples completed in {duration}")
    print("\nCheck the output directories for results:")
    for directory in ['example_output_basic', 'example_output_enhanced',
                      'example_output_debug', 'example_output_filtered']:
        if os.path.exists(directory):
            files = os.listdir(directory)
            print(f" {directory}/: {files}")
|
||||
|
||||
# Script entry point: run the interactive menu only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C anywhere outside an individual example ends the program.
        print("\nProgram interrupted by user. Goodbye!")
    except Exception as unexpected:
        # Last-resort boundary: report the problem instead of a traceback.
        print(f"Unexpected error: {unexpected}")
|
||||
Reference in New Issue
Block a user