first commit

2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions
--- a/config.py
+++ b/config.py
@@ -0,0 +1,94 @@
+# Configuration file for ZaubaCorp Scraper
+
+# Browser settings (display mode, window size, user agent, wait limits)
+BROWSER_CONFIG = {
+    "headless": True,  # Set to False to see browser window
+    "window_size": "1920,1080",  # Single comma-separated string, not a tuple
+    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "page_load_timeout": 30,  # Presumably seconds, like RETRY_CONFIG["timeout"] - TODO confirm
+    "implicit_wait": 10  # Presumably seconds - TODO confirm against the driver setup
+}
+
+# Scraping limits (caps, pacing between requests, and save frequency)
+SCRAPING_LIMITS = {
+    "max_companies": 100,  # Cap on companies; set to None for unlimited
+    "max_pages": 5,        # Cap on pages; set to None for all pages
+    "delay_between_requests": 1,  # Seconds to wait between requests
+    "save_interval": 50    # Save data every N companies
+}
+
+# Output settings (destination directory, formats, and per-format filenames)
+OUTPUT_CONFIG = {
+    "output_dir": "zaubacorp_data",  # Relative path; presumably resolved against the working directory - confirm
+    "save_formats": ["csv", "json"],  # Available: csv, json
+    "csv_filename": "zaubacorp_companies.csv",  # Bare filename; presumably written inside output_dir - confirm
+    "json_filename": "zaubacorp_companies.json"  # Bare filename; presumably written inside output_dir - confirm
+}
+
+# URLs (site root and the company listing page)
+URLS = {
+    "base_url": "https://www.zaubacorp.com",  # No trailing slash
+    "companies_list_url": "https://www.zaubacorp.com/companies-list"  # Repeats base_url in full rather than deriving from it
+}
+
+# CSS selectors for different elements on the page; each key maps to a list of alternatives
+SELECTORS = {
+    "pagination": [  # NOTE(review): list order may matter if the consumer takes the first match - confirm
+        '.pagination a',
+        '.pager a',
+        '.page-link',
+        'a[href*="page="]',
+        'a[href*="companies-list"]'  # Substring match on href; NOTE(review): broad, may also match non-pagination links
+    ],
+    "company_links": [
+        'a[href*="/company/"]',
+        'a[href*="company-detail"]',
+        '.company-name a',
+        '.company-link',
+        'a[title*="company"]'  # NOTE(review): substring match is likely case-sensitive, so "Company" would not match - confirm
+    ],
+    "company_name": [
+        'h1',
+        '.company-name',
+        '.main-heading',
+        'title'  # Type selector for the document <title> element, listed last
+    ]
+}
+
+# Fields to extract from company pages, each paired with an empty default value
+COMPANY_FIELDS = {
+    'url': '',
+    'company_name': '',
+    'cin': '',
+    'registration_number': '',
+    'company_category': '',
+    'company_sub_category': '',
+    'class_of_company': '',
+    'roc': '',
+    'registration_date': '',
+    'company_status': '',
+    'authorized_capital': '',
+    'paid_up_capital': '',
+    'activity_code': '',
+    'email': '',
+    'address': '',
+    'state': '',
+    'pincode': '',
+    'country': '',
+    'directors': [],  # Only mutable default; NOTE(review): a shallow copy of this dict shares this list - confirm callers deep-copy
+    'last_updated': ''
+}
+
+# Retry settings (attempt count, pause between attempts, page-load wait)
+RETRY_CONFIG = {
+    "max_retries": 3,  # NOTE(review): unclear whether this counts total attempts or retries after the first - confirm
+    "retry_delay": 2,  # Seconds to wait before retry
+    "timeout": 30      # Seconds to wait for page load; same value as BROWSER_CONFIG["page_load_timeout"]
+}
+
+# Logging settings (level, log file, console toggle)
+LOGGING_CONFIG = {
+    "log_level": "INFO",  # Presumably a logging level name - confirm how it is consumed
+    "log_file": "zaubacorp_scraper.log",  # Relative path; presumably resolved against the working directory - confirm
+    "console_output": True  # Presumably also logs to the console when True - confirm
+}