# Configuration file for ZaubaCorp Scraper
#
# Every setting is a plain module-level dict so callers can import the
# constant they need and read values by key.

# Browser settings
BROWSER_CONFIG = {
    "headless": True,  # Set to False to see browser window
    "window_size": "1920,1080",  # "width,height" as a single string
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "page_load_timeout": 30,
    "implicit_wait": 10,
}

# Scraping limits
SCRAPING_LIMITS = {
    "max_companies": 100,  # Set to None for unlimited
    "max_pages": 5,  # Set to None for all pages
    "delay_between_requests": 1,  # Seconds to wait between requests
    "save_interval": 50,  # Save data every N companies
}

# Output settings
OUTPUT_CONFIG = {
    "output_dir": "zaubacorp_data",
    "save_formats": ["csv", "json"],  # Available: csv, json
    "csv_filename": "zaubacorp_companies.csv",
    "json_filename": "zaubacorp_companies.json",
}

# URLs
URLS = {
    "base_url": "https://www.zaubacorp.com",
    "companies_list_url": "https://www.zaubacorp.com/companies-list",
}

# Selectors for different elements on the page.
# Each entry is a list of alternative CSS selectors for the same element.
SELECTORS = {
    "pagination": [
        '.pagination a',
        '.pager a',
        '.page-link',
        'a[href*="page="]',
        'a[href*="companies-list"]',
    ],
    "company_links": [
        'a[href*="/company/"]',
        'a[href*="company-detail"]',
        '.company-name a',
        '.company-link',
        'a[title*="company"]',
    ],
    "company_name": [
        'h1',
        '.company-name',
        '.main-heading',
        'title',
    ],
}

# Fields to extract from company pages.
# NOTE: 'directors' holds a mutable list. If this dict is used as a
# per-company template, copy it with copy.deepcopy(); a shallow
# dict.copy() would leave every record sharing the same list object.
COMPANY_FIELDS = {
    'url': '',
    'company_name': '',
    'cin': '',
    'registration_number': '',
    'company_category': '',
    'company_sub_category': '',
    'class_of_company': '',
    'roc': '',
    'registration_date': '',
    'company_status': '',
    'authorized_capital': '',
    'paid_up_capital': '',
    'activity_code': '',
    'email': '',
    'address': '',
    'state': '',
    'pincode': '',
    'country': '',
    'directors': [],
    'last_updated': '',
}

# Retry settings
RETRY_CONFIG = {
    "max_retries": 3,
    "retry_delay": 2,  # Seconds to wait before retry
    "timeout": 30,  # Seconds to wait for page load
}

# Logging settings
LOGGING_CONFIG = {
    "log_level": "INFO",
    "log_file": "zaubacorp_scraper.log",
    "console_output": True,
}