first commit

This commit is contained in:
govardhan
2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions

94
config.py Normal file
View File

@ -0,0 +1,94 @@
# Configuration file for ZaubaCorp Scraper
# Browser settings used when launching the scraping browser.
BROWSER_CONFIG = {
    # Flip to False to watch the browser window while the scraper runs.
    "headless": True,
    "window_size": "1920,1080",
    # One long UA string, split here only for line length.
    "user_agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # NOTE(review): both waits are presumably in seconds -- confirm
    # against the code that applies them to the driver.
    "page_load_timeout": 30,
    "implicit_wait": 10,
}
# Scraping limits: caps on how much is scraped and how fast.
SCRAPING_LIMITS = {
    # Upper bound on companies; None means unlimited.
    "max_companies": 100,
    # Upper bound on listing pages; None means all pages.
    "max_pages": 5,
    # Pause between consecutive requests, in seconds.
    "delay_between_requests": 1,
    # Data is saved after every N companies.
    "save_interval": 50,
}
# Output settings: destination directory, formats and file names.
OUTPUT_CONFIG = {
    "output_dir": "zaubacorp_data",
    # Formats to write; the available values are "csv" and "json".
    "save_formats": ["csv", "json"],
    "csv_filename": "zaubacorp_companies.csv",
    "json_filename": "zaubacorp_companies.json",
}
# URLs: site root and the companies listing page.
URLS = {
    "base_url": "https://www.zaubacorp.com",
    # Spelled out in full; it shares its prefix with base_url.
    "companies_list_url": "https://www.zaubacorp.com/companies-list",
}
# Selectors for different elements on the page.
# Each key maps to a list of alternative selector strings.
# NOTE(review): presumably tried in list order until one matches --
# confirm against the scraper code.
SELECTORS = {
    # Links used to move between listing pages.
    "pagination": [
        ".pagination a",
        ".pager a",
        ".page-link",
        'a[href*="page="]',
        'a[href*="companies-list"]',
    ],
    # Links that point at individual company pages.
    "company_links": [
        'a[href*="/company/"]',
        'a[href*="company-detail"]',
        ".company-name a",
        ".company-link",
        'a[title*="company"]',
    ],
    # Elements that may hold the company name.
    "company_name": [
        "h1",
        ".company-name",
        ".main-heading",
        "title",
    ],
}
# Fields to extract from company pages, with their empty default values.
# NOTE: "directors" is a mutable list. A shallow copy of this template
# (dict.copy(), dict(...), {**COMPANY_FIELDS}) shares that one list, so
# use copy.deepcopy() or replace "directors" when building a record.
COMPANY_FIELDS = {
    # Identity
    "url": "",
    "company_name": "",
    "cin": "",
    "registration_number": "",
    # Classification and registration
    "company_category": "",
    "company_sub_category": "",
    "class_of_company": "",
    "roc": "",
    "registration_date": "",
    "company_status": "",
    # Capital and activity
    "authorized_capital": "",
    "paid_up_capital": "",
    "activity_code": "",
    # Contact and location
    "email": "",
    "address": "",
    "state": "",
    "pincode": "",
    "country": "",
    # People and bookkeeping
    "directors": [],
    "last_updated": "",
}
# Retry settings for failed page loads.
RETRY_CONFIG = {
    "max_retries": 3,
    # Pause before the next attempt, in seconds.
    "retry_delay": 2,
    # How long to wait for a page to load, in seconds.
    "timeout": 30,
}
# Logging settings: level name, log file name and console toggle.
LOGGING_CONFIG = {
    "log_level": "INFO",
    "log_file": "zaubacorp_scraper.log",
    "console_output": True,
}