first commit

This commit is contained in:
govardhan
2025-08-18 23:16:46 +05:30
commit 7e4f3da26d
28 changed files with 69351 additions and 0 deletions

388
zaubacorp_scraper.py Normal file
View File

@@ -0,0 +1,388 @@
import csv
import json
import os
import re
import time
from urllib.parse import urljoin, urlparse

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
class ZaubaCorpScraper:
    """Scrape company listings and profile pages from zaubacorp.com.

    Pages are rendered with a Selenium-driven Chrome session (so any
    client-side content is visible) and parsed with BeautifulSoup.  Scraped
    records accumulate in ``self.scraped_companies`` and are exported to
    CSV/JSON by :meth:`save_data`.
    """

    # CIN layout: letter, 5 digits, 2 letters, 4 digits, 3 letters, 6 digits
    # (e.g. U12345MH2010PTC123456).  Compiled once here instead of importing
    # `re` and recompiling inside the per-page scraping loop.
    _CIN_RE = re.compile(r'[A-Z]\d{5}[A-Z]{2}\d{4}[A-Z]{3}\d{6}')

    def __init__(self, headless=True, output_dir="zaubacorp_data"):
        """Prepare Chrome options and the output directory.

        Args:
            headless: run Chrome without a visible window when True.
            output_dir: directory for CSV/JSON exports (created if missing).
        """
        self.base_url = "https://www.zaubacorp.com"
        self.companies_list_url = "https://www.zaubacorp.com/companies-list"
        self.output_dir = output_dir
        self.driver = None              # set by initialize_driver()
        self.scraped_companies = []     # list of per-company dicts
        self.visited_urls = set()       # profile URLs already scraped (dedupe)

        os.makedirs(self.output_dir, exist_ok=True)

        self.chrome_options = Options()
        if headless:
            self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--window-size=1920,1080")
        # Desktop UA so the site serves the full (non-mobile) markup.
        self.chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        # For macOS with Brave Browser (uncomment if using Brave)
        # self.chrome_options.binary_location = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"

    def initialize_driver(self):
        """Start Chrome (driver binary resolved by webdriver-manager).

        Raises:
            Exception: re-raised after logging if the driver cannot start.
        """
        try:
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=self.chrome_options)
            self.driver.implicitly_wait(10)
            print("WebDriver initialized successfully")
        except Exception as e:
            print(f"Error initializing WebDriver: {e}")
            raise

    def close_driver(self):
        """Quit the WebDriver if it is running; safe to call repeatedly."""
        if self.driver:
            self.driver.quit()
            # Reset so a second call (e.g. KeyboardInterrupt cleanup in main()
            # after scrape_companies()'s `finally` already closed the driver)
            # does not quit an already-terminated session.
            self.driver = None
            print("WebDriver closed")

    def get_companies_list_pages(self):
        """Discover paginated companies-list URLs.

        Returns:
            list[str]: page URLs to crawl; falls back to the main listing
            page alone when no pagination markup is recognised or on error.
        """
        try:
            print(f"Loading companies list page: {self.companies_list_url}")
            self.driver.get(self.companies_list_url)
            time.sleep(3)  # give client-side rendering a moment to settle

            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            pagination_links = []
            # Common pagination markup patterns; the first selector that
            # matches anything wins.
            pagination_selectors = [
                '.pagination a',
                '.pager a',
                '.page-link',
                'a[href*="page="]',
                'a[href*="companies-list"]',
            ]
            for selector in pagination_selectors:
                links = soup.select(selector)
                if links:
                    for link in links:
                        href = link.get('href')
                        if href and ('page=' in href or 'companies-list' in href):
                            full_url = urljoin(self.base_url, href)
                            if full_url not in pagination_links:
                                pagination_links.append(full_url)
                    break

            if not pagination_links:
                pagination_links = [self.companies_list_url]
            print(f"Found {len(pagination_links)} pages to scrape")
            return pagination_links
        except Exception as e:
            print(f"Error getting companies list pages: {e}")
            return [self.companies_list_url]

    def extract_company_links(self, page_url):
        """Collect company profile URLs from one companies-list page.

        Returns:
            list[str]: absolute profile URLs ([] on error).
        """
        try:
            print(f"Extracting company links from: {page_url}")
            self.driver.get(page_url)
            time.sleep(2)

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            company_links = []
            # Markup patterns observed for profile links; first selector that
            # matches anything wins.
            link_selectors = [
                'a[href*="/company/"]',
                'a[href*="company-detail"]',
                '.company-name a',
                '.company-link',
                'a[title*="company"]',
            ]
            for selector in link_selectors:
                links = soup.select(selector)
                if links:
                    for link in links:
                        href = link.get('href')
                        if href:
                            full_url = urljoin(self.base_url, href)
                            if full_url not in company_links:
                                company_links.append(full_url)
                    break

            # Fallback: any anchor whose href merely mentions company/profile.
            if not company_links:
                for link in soup.find_all('a', href=True):
                    href = link.get('href')
                    if href and ('company' in href.lower() or 'profile' in href.lower()):
                        full_url = urljoin(self.base_url, href)
                        if full_url not in company_links:
                            company_links.append(full_url)

            print(f"Found {len(company_links)} company links on this page")
            return company_links
        except Exception as e:
            print(f"Error extracting company links from {page_url}: {e}")
            return []

    def scrape_company_details(self, company_url):
        """Scrape one company profile page into a flat dict.

        Returns:
            dict | None: populated record, or None when the page could not
            be fetched/parsed (the error is printed, not raised).
        """
        try:
            print(f"Scraping company details: {company_url}")
            self.driver.get(company_url)
            time.sleep(2)

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            company_data = {
                'url': company_url,
                'company_name': '',
                'cin': '',
                'registration_number': '',
                'company_category': '',
                'company_sub_category': '',
                'class_of_company': '',
                'roc': '',
                'registration_date': '',
                'company_status': '',
                'authorized_capital': '',
                'paid_up_capital': '',
                'activity_code': '',
                'email': '',
                'address': '',
                'state': '',
                'pincode': '',
                'country': '',
                'directors': [],
                'last_updated': '',
            }

            self._extract_company_name(soup, company_data)
            self._extract_cin(soup, company_data)
            self._extract_table_fields(soup, company_data)
            self._extract_directors(soup, company_data)
            return company_data
        except Exception as e:
            print(f"Error scraping company details from {company_url}: {e}")
            return None

    def _extract_company_name(self, soup, company_data):
        """Fill 'company_name' from the first non-empty heading-like element."""
        for selector in ('h1', '.company-name', '.main-heading', 'title'):
            element = soup.select_one(selector)
            if element and element.text.strip():
                company_data['company_name'] = element.text.strip()
                break

    def _extract_cin(self, soup, company_data):
        """Fill 'cin' by finding a CIN-like label and matching _CIN_RE nearby."""
        for pattern in ('CIN', 'Corporate Identification Number', 'Company ID'):
            # `string=` is the current bs4 spelling of the deprecated `text=`.
            element = soup.find(string=lambda x: x and pattern in x)
            if element and element.parent:
                cin_match = self._CIN_RE.search(element.parent.get_text())
                if cin_match:
                    company_data['cin'] = cin_match.group()
                    break

    def _extract_table_fields(self, soup, company_data):
        """Fill registration/capital/contact fields from key-value table rows."""
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 2:
                    continue
                key = cells[0].get_text().strip().lower()
                value = cells[1].get_text().strip()
                # First matching rule wins; order matters (e.g. the
                # 'registration number' test must precede 'category').
                if 'registration' in key and 'number' in key:
                    company_data['registration_number'] = value
                elif 'category' in key:
                    company_data['company_category'] = value
                elif 'class' in key:
                    company_data['class_of_company'] = value
                elif 'roc' in key:
                    company_data['roc'] = value
                elif 'registration date' in key or 'incorporation' in key:
                    company_data['registration_date'] = value
                elif 'status' in key:
                    company_data['company_status'] = value
                elif 'authorized capital' in key:
                    company_data['authorized_capital'] = value
                elif 'paid' in key and 'capital' in key:
                    company_data['paid_up_capital'] = value
                elif 'activity' in key or 'business' in key:
                    company_data['activity_code'] = value
                elif 'email' in key:
                    company_data['email'] = value
                elif 'address' in key:
                    company_data['address'] = value
                elif 'state' in key:
                    company_data['state'] = value
                elif 'pincode' in key or 'pin' in key:
                    company_data['pincode'] = value
                elif 'country' in key:
                    company_data['country'] = value

    def _extract_directors(self, soup, company_data):
        """Fill 'directors' with link texts near the first 'director' label."""
        marker = soup.find(string=lambda x: x and 'director' in x.lower())
        if marker and marker.parent:
            for link in marker.parent.find_all('a'):
                director_name = link.get_text().strip()
                if director_name and len(director_name) > 2:
                    company_data['directors'].append(director_name)

    def save_data(self, format='csv'):
        """Export scraped companies to both CSV and JSON in the output dir.

        The original branching always ended up writing both files whatever
        `format` was, so the duplicated writer code is collapsed; `format`
        is kept only for backward compatibility with existing callers.
        """
        if not self.scraped_companies:
            print("No data to save")
            return

        # CSV cells must be scalars, so flatten the directors list.
        companies_for_export = []
        for company in self.scraped_companies:
            company_copy = company.copy()
            company_copy['directors'] = '; '.join(company['directors']) if company['directors'] else ''
            companies_for_export.append(company_copy)

        csv_file = os.path.join(self.output_dir, 'zaubacorp_companies.csv')
        pd.DataFrame(companies_for_export).to_csv(csv_file, index=False, encoding='utf-8')
        print(f"Data saved to {csv_file}")

        json_file = os.path.join(self.output_dir, 'zaubacorp_companies.json')
        with open(json_file, 'w', encoding='utf-8') as f:
            # JSON keeps the directors list intact, so dump the raw records.
            json.dump(self.scraped_companies, f, indent=2, ensure_ascii=False)
        print(f"Data saved to {json_file}")

    def scrape_companies(self, max_companies=None, max_pages=None):
        """Crawl list pages, scrape each company profile, and save results.

        Args:
            max_companies: stop after this many profiles (None = unlimited).
            max_pages: crawl at most this many listing pages (None = all).
        """
        try:
            self.initialize_driver()
            list_pages = self.get_companies_list_pages()
            if max_pages:
                list_pages = list_pages[:max_pages]

            total_scraped = 0
            for i, page_url in enumerate(list_pages, 1):
                if max_companies and total_scraped >= max_companies:
                    break
                print(f"\n--- Processing page {i}/{len(list_pages)} ---")
                company_links = self.extract_company_links(page_url)

                for j, company_url in enumerate(company_links, 1):
                    if max_companies and total_scraped >= max_companies:
                        break
                    if company_url in self.visited_urls:
                        continue  # links can repeat across listing pages
                    self.visited_urls.add(company_url)

                    print(f"Processing company {j}/{len(company_links)} on page {i}")
                    company_data = self.scrape_company_details(company_url)
                    if company_data:
                        self.scraped_companies.append(company_data)
                        total_scraped += 1
                        print(f"Successfully scraped: {company_data.get('company_name', 'Unknown')}")

                    time.sleep(1)  # polite delay between requests

                    # Checkpoint every 50 companies so a crash loses little.
                    if total_scraped > 0 and total_scraped % 50 == 0:
                        self.save_data()
                        print(f"Saved {total_scraped} companies so far...")

            print(f"\n--- Scraping completed! ---")
            print(f"Total companies scraped: {len(self.scraped_companies)}")
            self.save_data()
        except Exception as e:
            print(f"Error during scraping: {e}")
        finally:
            self.close_driver()
def main():
    """Configure and run the ZaubaCorp scraper from the command line."""
    print("ZaubaCorp Companies Scraper")
    print("=" * 40)

    # Runtime configuration — tweak these before launching.
    headless = True        # False shows the live browser window
    company_limit = 100    # None scrapes without a cap
    page_limit = 5         # None crawls every listing page
    data_dir = "zaubacorp_data"

    scraper = ZaubaCorpScraper(headless=headless, output_dir=data_dir)
    try:
        scraper.scrape_companies(max_companies=company_limit, max_pages=page_limit)
    except KeyboardInterrupt:
        # Ctrl-C: persist whatever was collected before shutting down.
        print("\nScraping interrupted by user")
        scraper.save_data()
        scraper.close_driver()
    except Exception as exc:
        print(f"Unexpected error: {exc}")
        scraper.close_driver()


if __name__ == "__main__":
    main()