Merge pull request #55 from KavyaMalviya56/main
Refactoring and Improvements for GeeksforGeeks Course Scraper
anupammaurya6767 authored Oct 31, 2023
2 parents 3e28971 + 453ff21 commit 24530d8
Showing 1 changed file with 57 additions and 48 deletions.
GFG/scraper.py: 105 changes (57 additions & 48 deletions)
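In short: GFG/scraper.py is reorganized from a module-level script into a CourseScraper class. The constructor owns the SQLite connection and cursor; table creation, row insertion, course-card parsing, and the Selenium-driven scrape become methods; and the entry point now passes the database name and target URL in explicitly instead of relying on globals.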
@@ -1,67 +1,76 @@
-from bs4 import BeautifulSoup
 import sqlite3
+from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 
-conn = sqlite3.connect("Courses.db")
+class CourseScraper:
+    def __init__(self, db_name):
+        # Initialize SQLite database connection
+        self.conn = sqlite3.connect(db_name)
+        self.cur = self.conn.cursor()
+        self.create_courses_table()
 
-cur = conn.cursor()
-cur.execute('''
-    CREATE TABLE IF NOT EXISTS COURSES (
+    def create_courses_table(self):
+        # Create the 'COURSES' table if it doesn't exist
+        self.cur.execute('''
+            CREATE TABLE IF NOT EXISTS COURSES (
         NAME text,
         RATING text,
         INTERESTED text,
         PRICE text
     )
-''')
-conn.commit()
+        ''')
+        self.conn.commit()
 
-def insert_data(data):
-    try:
-        cur.executemany("INSERT INTO COURSES VALUES (?, ?, ?, ?)", data)
-        conn.commit()
-        print("Data successfully inserted.")
-    except sqlite3.Error as e:
-        print(f"An error occurred: {e}")
-def get_course_data(course_section):
-    courses = []
-    for item in course_section.find_all("a", class_="ui card courseListingPage_courseCardContainer__lLZiS"):
-        course_name = item.find("h4", class_="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading").text
-        course_rating = item.find("span", class_="urw-din")
-        course_rating = "Information not available" if course_rating is None else course_rating.text
-        course_interested = item.find("div", class_="courseListingPage_descriptionText__zN_K1 sofia-pro g-opacity-50 g-mb-0 grid_with__meta").text.split(" ")[0]
-        course_price = item.find("p", class_="sofia-pro g-mb-0 courseListingPage_batchFee__0NlbJ")
-        course_price = "0" if course_price is None else course_price.text
+    def insert_data(self, data):
+        try:
+            self.cur.executemany("INSERT INTO COURSES VALUES (?, ?, ?, ?)", data)
+            self.conn.commit()
+            print("Data successfully inserted.")
+        except sqlite3.Error as e:
+            print(f"An error occurred: {e}")
 
-        courses.append((
-            course_name,
-            course_rating,
-            course_interested,
-            course_price
-        ))
+    def get_course_data(self, course_section):
+        courses = []
+        for item in course_section.find_all("a", class_="ui card courseListingPage_courseCardContainer__lLZiS"):
+            course_name = item.find("h4", class_="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading").text
+            course_rating = item.find("span", class_="urw-din")
+            course_rating = "Information not available" if course_rating is None else course_rating.text
+            course_interested = item.find("div", class_="courseListingPage_descriptionText__zN_K1 sofia-pro g-opacity-50 g-mb-0 grid_with__meta").text.split(" ")[0]
+            course_price = item.find("p", class_="sofia-pro g-mb-0 courseListingPage_batchFee__0NlbJ")
+            course_price = "0" if course_price is None else course_price.text
 
-    return courses
+            courses.append((
+                course_name,
+                course_rating,
+                course_interested,
+                course_price
+            ))
 
-def scrape_geeksforgeeks():
-    url = "https://practice.geeksforgeeks.org/courses?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses"
+        return courses
 
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=service)
-    driver.get(url)
-    html = driver.page_source
-    soup = BeautifulSoup(html, "html.parser")
+    def scrape_geeksforgeeks(self, url):
+        # Initialize ChromeDriver and scrape the course data
+        service = webdriver.chrome.service.Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service)
+        driver.get(url)
+        html = driver.page_source
+        soup = BeautifulSoup(html, "html.parser")
 
-    popular_courses = soup.find("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_toggleCourseCards__pWBVA")
-    other_courses = soup.find_all("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_courseCardsGrid__VYBzZ")
-    all_courses_data = get_course_data(popular_courses)
-    for course in other_courses:
-        course_data = get_course_data(course)
-        all_courses_data.extend(course_data)
-    driver.quit()
-    return all_courses_data
+        popular_courses = soup.find("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_toggleCourseCards__pWBVA")
+        other_courses = soup.find_all("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_courseCardsGrid__VYBzZ")
+        all_courses_data = self.get_course_data(popular_courses)
+        for course in other_courses:
+            course_data = self.get_course_data(course)
+            all_courses_data.extend(course_data)
+        driver.quit()
+        return all_courses_data
 
 if __name__ == '__main__':
-    data = scrape_geeksforgeeks()
+    db_name = "Courses.db"
+    url = "https://practice.geeksforgeeks.org/courses?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses"
+
+    scraper = CourseScraper(db_name)
+    data = scraper.scrape_geeksforgeeks(url)
     print("Data to be inserted:", data)
-    insert_data(data)
+    scraper.insert_data(data)
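
A possible follow-up, not part of this commit: nothing in the refactored code guarantees cleanup on failure. driver.quit() runs only when the scrape succeeds, and the SQLite connection opened in __init__ is never closed. Below is a minimal sketch of a safer entry point, assuming CourseScraper can be imported from GFG/scraper.py; it reuses only names that appear in the diff, and the try/finally plus the conn.close() call are suggestions rather than committed code.

# Sketch only: the committed __main__ flow with cleanup added. CourseScraper,
# scrape_geeksforgeeks, and insert_data come from this diff; the try/finally
# and conn.close() are hypothetical additions.
from scraper import CourseScraper  # assumes this is run from inside GFG/

if __name__ == '__main__':
    scraper = CourseScraper("Courses.db")
    try:
        url = ("https://practice.geeksforgeeks.org/courses"
               "?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses")
        data = scraper.scrape_geeksforgeeks(url)
        print("Data to be inserted:", data)
        scraper.insert_data(data)
    finally:
        scraper.conn.close()  # the class exposes no close() helper yet

The same try/finally pattern inside scrape_geeksforgeeks, with driver.quit() moved into the finally block, would cover the browser half.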

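One further hedged note: the class strings matched in get_course_data, such as courseListingPage_courseCardContainer__lLZiS, look like hashed CSS-module names, so the __lLZiS-style suffix is likely to rotate whenever the GeeksforGeeks frontend is rebuilt. BeautifulSoup's class_ argument accepts a callable that is tested against each individual class value, which makes it possible to match on the stable prefix instead. The helper below is an illustrative alternative, not something the commit uses; find_course_cards is a hypothetical name, and soup is the BeautifulSoup object already built in scrape_geeksforgeeks.

def find_course_cards(soup):
    # bs4 calls the class_ callable once per class value, so this matches any
    # <a> whose class list contains the stable prefix and ignores the hash.
    return soup.find_all(
        "a",
        class_=lambda c: c is not None
        and c.startswith("courseListingPage_courseCardContainer"),
    )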