first commit
This commit is contained in:
94
config.py
Normal file
94
config.py
Normal file
@ -0,0 +1,94 @@
|
||||
# Configuration file for ZaubaCorp Scraper
|
||||
|
||||
# Browser settings
# Options describing the browser session used for scraping.
# NOTE(review): the two numeric values are presumably seconds — confirm
# against the code that consumes this dict.
BROWSER_CONFIG = {
    "headless": True,  # Set to False to see browser window
    "window_size": "1920,1080",  # presumably "width,height" in pixels
    "user_agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "page_load_timeout": 30,
    "implicit_wait": 10,
}
|
||||
|
||||
# Scraping limits
# Caps on how much is scraped per run, plus pacing and checkpoint cadence.
SCRAPING_LIMITS = {
    "max_companies": 100,  # Set to None for unlimited
    "max_pages": 5,  # Set to None for all pages
    "delay_between_requests": 1,  # Seconds to wait between requests
    "save_interval": 50,  # Save data every N companies
}
|
||||
|
||||
# Output settings
# Where scraped data is written and in which formats.
OUTPUT_CONFIG = {
    "output_dir": "zaubacorp_data",
    "save_formats": ["csv", "json"],  # Available: csv, json
    "csv_filename": "zaubacorp_companies.csv",
    "json_filename": "zaubacorp_companies.json",
}
|
||||
|
||||
# URLs
# Entry points on the target site. Both values are full absolute URLs;
# "companies_list_url" lives under "base_url".
URLS = {
    "base_url": "https://www.zaubacorp.com",
    "companies_list_url": "https://www.zaubacorp.com/companies-list",
}
|
||||
|
||||
# Selectors for different elements on the page
# Each key maps to a list of candidate CSS selectors for one kind of element.
# NOTE(review): presumably the candidates are tried in list order as
# fallbacks — confirm against the code that consumes this dict.
SELECTORS = {
    "pagination": [
        '.pagination a',
        '.pager a',
        '.page-link',
        'a[href*="page="]',
        'a[href*="companies-list"]',
    ],
    "company_links": [
        'a[href*="/company/"]',
        'a[href*="company-detail"]',
        '.company-name a',
        '.company-link',
        'a[title*="company"]',
    ],
    "company_name": [
        'h1',
        '.company-name',
        '.main-heading',
        'title',
    ],
}
|
||||
|
||||
# Fields to extract from company pages
# Template of one company record: every field name with its empty default.
# NOTE(review): 'directors' defaults to a list object. A shallow copy
# (dict(COMPANY_FIELDS) / COMPANY_FIELDS.copy()) shares that same list
# between records; use copy.deepcopy(COMPANY_FIELDS) when creating a record.
COMPANY_FIELDS = {
    'url': '',
    'company_name': '',
    'cin': '',
    'registration_number': '',
    'company_category': '',
    'company_sub_category': '',
    'class_of_company': '',
    'roc': '',
    'registration_date': '',
    'company_status': '',
    'authorized_capital': '',
    'paid_up_capital': '',
    'activity_code': '',
    'email': '',
    'address': '',
    'state': '',
    'pincode': '',
    'country': '',
    'directors': [],
    'last_updated': '',
}
|
||||
|
||||
# Retry settings
# How many times a failed operation is retried and how long to wait.
# NOTE(review): "timeout" carries the same value (30) as
# BROWSER_CONFIG["page_load_timeout"]; confirm which one consumers read.
RETRY_CONFIG = {
    "max_retries": 3,
    "retry_delay": 2,  # Seconds to wait before retry
    "timeout": 30,  # Seconds to wait for page load
}
|
||||
|
||||
# Logging settings
# Log verbosity, log file name, and whether to also log to the console.
# NOTE(review): "log_level" is presumably a standard `logging` level name —
# confirm against the code that configures logging.
LOGGING_CONFIG = {
    "log_level": "INFO",
    "log_file": "zaubacorp_scraper.log",
    "console_output": True,
}
|
||||
Reference in New Issue
Block a user