# 3engines_doc/extract_kpme_hospitals_brave.py

import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

def _extract_table_rows(table):
    """Return one six-field record (a list of strings) per data row of *table*."""
    records = []
    # The first <tr> is the header row, so it is skipped.
    for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
        cols = row.find_elements(By.TAG_NAME, 'td')
        if len(cols) <= 5:
            # Fewer than six cells: not a complete data row.
            continue

        # The certificate number is normally the text of a link inside the
        # sixth cell; fall back to the cell's own text when no link is present
        # instead of aborting the whole scrape on a single odd row.
        anchors = cols[5].find_elements(By.TAG_NAME, 'a')
        certificate_number = anchors[0].text if anchors else cols[5].text

        records.append([
            cols[0].text,  # System of Medicine
            cols[1].text,  # Category
            cols[2].text,  # Establishment Name
            cols[3].text,  # Address
            cols[4].text,  # Certificate Validity
            certificate_number,
        ])
    return records


def scrape_hospital_data(url, output_path='kpme_hospitals.csv', timeout=10):
    """Scrape the paginated hospital table at *url* and save it as a CSV file.

    Opens *url* in a Chrome session (driver installed through
    ``ChromeDriverManager``), reads every data row of the table with ID
    ``ContentPlaceHolder1_gvw_list``, and follows the last link of the first
    ``pagination`` element for as long as its text contains ``Next``. The
    collected rows are written to *output_path* without an index column and a
    confirmation message is printed.

    Args:
        url: Address of the page containing the table.
        output_path: Destination CSV file. Defaults to ``kpme_hospitals.csv``.
        timeout: Maximum number of seconds to wait for the table to appear,
            and for the page to change after clicking ``Next``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the table does not
            appear within *timeout* seconds.
    """
    table_locator = (By.ID, 'ContentPlaceHolder1_gvw_list')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    all_data = []
    try:
        driver.get(url)
        wait = WebDriverWait(driver, timeout)

        while True:
            # Explicit wait: returns as soon as the table is in the DOM rather
            # than sleeping for a fixed interval.
            table = wait.until(EC.presence_of_element_located(table_locator))
            all_data.extend(_extract_table_rows(table))

            pagination = driver.find_elements(By.CLASS_NAME, 'pagination')
            if not pagination:
                break

            # The last link of the pager is assumed to be 'Next'.
            links = pagination[0].find_elements(By.TAG_NAME, 'a')
            if not links or 'Next' not in links[-1].text:
                break
            links[-1].click()

            # Wait until the table just scraped is detached from the DOM, so
            # the next iteration reads the new page and not the old one again.
            try:
                wait.until(EC.staleness_of(table))
            except TimeoutException:
                # The click did not change the page; stop here instead of
                # re-scraping the same rows forever.
                print("Page did not change after clicking 'Next'; stopping pagination.")
                break
    finally:
        # Always release the browser, even when scraping fails part-way.
        driver.quit()

    # Convert the data to a DataFrame and save to CSV
    df = pd.DataFrame(all_data, columns=[
        'System of Medicine',
        'Category',
        'Establishment Name',
        'Address',
        'Certificate Validity',
        'Certificate Number',
    ])
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")

# URL of the KPME portal
url = 'https://kpme.karnataka.gov.in/AllapplicationList.aspx'

# Run the scrape only when this file is executed as a script, so that merely
# importing the module does not launch a browser session.
if __name__ == "__main__":
    scrape_hospital_data(url)