# Configuration file for ZaubaCorp Scraper

# Browser settings
BROWSER_CONFIG = {
    # Options for the browser session. How each key is consumed is not
    # visible in this file; see the scraper module that imports it.
    "headless": True,  # Set to False to see browser window
    "window_size": "1920,1080",  # presumably "width,height" — confirm against the driver setup
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "page_load_timeout": 30,  # presumably seconds — confirm against the driver setup
    "implicit_wait": 10,  # presumably seconds — confirm against the driver setup
}

# Scraping limits
SCRAPING_LIMITS = {
    # Caps and pacing for a scraping run.
    "max_companies": 100,  # Set to None for unlimited
    "max_pages": 5,  # Set to None for all pages
    "delay_between_requests": 1,  # Seconds to wait between requests
    "save_interval": 50,  # Save data every N companies
}

# Output settings
OUTPUT_CONFIG = {
    # Where and in which formats scraped data is written.
    "output_dir": "zaubacorp_data",
    "save_formats": ["csv", "json"],  # Available: csv, json
    "csv_filename": "zaubacorp_companies.csv",
    "json_filename": "zaubacorp_companies.json",
}

# URLs
URLS = {
    # Site root and the companies listing page under it.
    "base_url": "https://www.zaubacorp.com",
    "companies_list_url": "https://www.zaubacorp.com/companies-list",
}

# Selectors for different elements on the page
SELECTORS = {
    # Each key maps to a list of candidate selector strings for one kind of
    # page element. NOTE(review): presumably these are CSS selectors tried in
    # list order until one matches — confirm against the scraper code.
    "pagination": [
        '.pagination a',
        '.pager a',
        '.page-link',
        'a[href*="page="]',
        'a[href*="companies-list"]',
    ],
    "company_links": [
        'a[href*="/company/"]',
        'a[href*="company-detail"]',
        '.company-name a',
        '.company-link',
        'a[title*="company"]',
    ],
    "company_name": [
        'h1',
        '.company-name',
        '.main-heading',
        'title',
    ],
}

# Fields to extract from company pages
COMPANY_FIELDS = {
    # Field names with their empty default values: '' for every field except
    # 'directors', which defaults to an empty list.
    # NOTE(review): 'directors' is a mutable list. A shallow copy
    # (COMPANY_FIELDS.copy() or dict(COMPANY_FIELDS)) shares that one list
    # across every record; use copy.deepcopy() when building a record from
    # this template.
    'url': '',
    'company_name': '',
    'cin': '',
    'registration_number': '',
    'company_category': '',
    'company_sub_category': '',
    'class_of_company': '',
    'roc': '',
    'registration_date': '',
    'company_status': '',
    'authorized_capital': '',
    'paid_up_capital': '',
    'activity_code': '',
    'email': '',
    'address': '',
    'state': '',
    'pincode': '',
    'country': '',
    'directors': [],
    'last_updated': '',
}

# Retry settings
RETRY_CONFIG = {
    # Retry policy for failed page loads.
    "max_retries": 3,
    "retry_delay": 2,  # Seconds to wait before retry
    "timeout": 30,  # Seconds to wait for page load
}

# Logging settings
LOGGING_CONFIG = {
    # Logging options: level name, log file name, and a console-output flag.
    "log_level": "INFO",
    "log_file": "zaubacorp_scraper.log",
    "console_output": True,
}