before changing links
This commit is contained in:
76
extract_kpme_hospitals_brave.py
Normal file
76
extract_kpme_hospitals_brave.py
Normal file
@ -0,0 +1,76 @@
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
import pandas as pd
|
||||
|
||||
def scrape_hospital_data(url):
    """Scrape the hospital listing table from the KPME portal.

    Walks every page of the paginated results table at *url*, collecting
    one record per establishment, then writes the combined data to
    'kpme_hospitals.csv' in the current directory.

    Args:
        url: Address of the KPME application-list page to scrape.

    Side effects:
        Launches a Chrome browser, writes 'kpme_hospitals.csv', prints a
        confirmation message.
    """
    # Initialize the WebDriver (webdriver_manager downloads a matching
    # chromedriver binary on first use).
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # One [medicine, category, name, address, validity, cert_no] list per row.
    all_data = []

    try:
        driver.get(url)

        while True:
            # Fixed wait for the table to (re)load after navigation.
            # NOTE(review): a WebDriverWait on the table element would be
            # more robust than a hard-coded sleep — confirm page timing.
            time.sleep(2)

            table = driver.find_element(By.ID, 'ContentPlaceHolder1_gvw_list')
            rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # Skip the header row.

            for row in rows:
                cols = row.find_elements(By.TAG_NAME, 'td')
                if len(cols) <= 5:
                    # Pager/footer rows lack the data columns — skip them.
                    continue

                # The certificate number is normally rendered as a link;
                # fall back to the raw cell text when no anchor is present
                # instead of crashing with NoSuchElementException.
                anchors = cols[5].find_elements(By.TAG_NAME, 'a')
                certificate_number = anchors[0].text if anchors else cols[5].text

                all_data.append([
                    cols[0].text,          # System of medicine
                    cols[1].text,          # Category
                    cols[2].text,          # Establishment name
                    cols[3].text,          # Address
                    cols[4].text,          # Certificate validity
                    certificate_number,
                ])

            # Advance to the next page, if any.
            pagination = driver.find_elements(By.CLASS_NAME, 'pagination')
            if not pagination:
                break
            next_page_link = pagination[0].find_elements(By.TAG_NAME, 'a')[-1]  # Assume the last link is 'Next'.
            if 'Next' not in next_page_link.text:
                break
            next_page_link.click()
    finally:
        # Always release the browser, even when scraping raised — the
        # original leaked the Chrome process on any exception.
        driver.quit()

    # Convert the data to a DataFrame and save to CSV.
    df = pd.DataFrame(all_data, columns=[
        'System of Medicine',
        'Category',
        'Establishment Name',
        'Address',
        'Certificate Validity',
        'Certificate Number'
    ])
    df.to_csv('kpme_hospitals.csv', index=False)
    print("Data saved to kpme_hospitals.csv")
|
||||
|
||||
# URL of the KPME portal
url = 'https://kpme.karnataka.gov.in/AllapplicationList.aspx'

if __name__ == "__main__":
    # Guard the scrape behind the script entry point so importing this
    # module does not launch a browser as a side effect.
    scrape_hospital_data(url)
|
||||
Reference in New Issue
Block a user