Merge pull request #55 from KavyaMalviya56/main
Refactoring and Improvements for GeeksforGeeks Course Scraper
anupammaurya6767 authored Oct 31, 2023
2 parents 3e28971 + 453ff21 commit 24530d8
Showing 1 changed file with 57 additions and 48 deletions.
GFG/scraper.py: 105 changes (57 additions & 48 deletions)
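In short: GFG/scraper.py is reorganized from a module-level script into a CourseScraper class. The constructor owns the SQLite connection and cursor; table creation, row insertion, course-card parsing, and the Selenium-driven scrape become methods; and the entry point now passes the database name and target URL in explicitly instead of relying on globals.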
@@ -1,67 +1,76 @@
-from bs4 import BeautifulSoup
 import sqlite3
+from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 
-conn = sqlite3.connect("Courses.db")
+class CourseScraper:
+    def __init__(self, db_name):
+        # Initialize SQLite database connection
+        self.conn = sqlite3.connect(db_name)
+        self.cur = self.conn.cursor()
+        self.create_courses_table()
 
-cur = conn.cursor()
-cur.execute('''
-    CREATE TABLE IF NOT EXISTS COURSES (
+    def create_courses_table(self):
+        # Create the 'COURSES' table if it doesn't exist
+        self.cur.execute('''
+            CREATE TABLE IF NOT EXISTS COURSES (
         NAME text,
         RATING text,
         INTERESTED text,
         PRICE text
     )
-''')
-conn.commit()
+        ''')
+        self.conn.commit()
 
-def insert_data(data):
-    try:
-        cur.executemany("INSERT INTO COURSES VALUES (?, ?, ?, ?)", data)
-        conn.commit()
-        print("Data successfully inserted.")
-    except sqlite3.Error as e:
-        print(f"An error occurred: {e}")
-def get_course_data(course_section):
-    courses = []
-    for item in course_section.find_all("a", class_="ui card courseListingPage_courseCardContainer__lLZiS"):
-        course_name = item.find("h4", class_="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading").text
-        course_rating = item.find("span", class_="urw-din")
-        course_rating = "Information not available" if course_rating is None else course_rating.text
-        course_interested = item.find("div", class_="courseListingPage_descriptionText__zN_K1 sofia-pro g-opacity-50 g-mb-0 grid_with__meta").text.split(" ")[0]
-        course_price = item.find("p", class_="sofia-pro g-mb-0 courseListingPage_batchFee__0NlbJ")
-        course_price = "0" if course_price is None else course_price.text
+    def insert_data(self, data):
+        try:
+            self.cur.executemany("INSERT INTO COURSES VALUES (?, ?, ?, ?)", data)
+            self.conn.commit()
+            print("Data successfully inserted.")
+        except sqlite3.Error as e:
+            print(f"An error occurred: {e}")
 
-        courses.append((
-            course_name,
-            course_rating,
-            course_interested,
-            course_price
-        ))
+    def get_course_data(self, course_section):
+        courses = []
+        for item in course_section.find_all("a", class_="ui card courseListingPage_courseCardContainer__lLZiS"):
+            course_name = item.find("h4", class_="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading").text
+            course_rating = item.find("span", class_="urw-din")
+            course_rating = "Information not available" if course_rating is None else course_rating.text
+            course_interested = item.find("div", class_="courseListingPage_descriptionText__zN_K1 sofia-pro g-opacity-50 g-mb-0 grid_with__meta").text.split(" ")[0]
+            course_price = item.find("p", class_="sofia-pro g-mb-0 courseListingPage_batchFee__0NlbJ")
+            course_price = "0" if course_price is None else course_price.text
 
-    return courses
+            courses.append((
+                course_name,
+                course_rating,
+                course_interested,
+                course_price
+            ))
 
-def scrape_geeksforgeeks():
-    url = "https://practice.geeksforgeeks.org/courses?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses"
+        return courses
 
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=service)
-    driver.get(url)
-    html = driver.page_source
-    soup = BeautifulSoup(html, "html.parser")
+    def scrape_geeksforgeeks(self, url):
+        # Initialize ChromeDriver and scrape the course data
+        service = webdriver.chrome.service.Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service)
+        driver.get(url)
+        html = driver.page_source
+        soup = BeautifulSoup(html, "html.parser")
 
-    popular_courses = soup.find("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_toggleCourseCards__pWBVA")
-    other_courses = soup.find_all("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_courseCardsGrid__VYBzZ")
-    all_courses_data = get_course_data(popular_courses)
-    for course in other_courses:
-        course_data = get_course_data(course)
-        all_courses_data.extend(course_data)
-    driver.quit()
-    return all_courses_data
+        popular_courses = soup.find("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_toggleCourseCards__pWBVA")
+        other_courses = soup.find_all("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_courseCardsGrid__VYBzZ")
+        all_courses_data = self.get_course_data(popular_courses)
+        for course in other_courses:
+            course_data = self.get_course_data(course)
+            all_courses_data.extend(course_data)
+        driver.quit()
+        return all_courses_data
 
 if __name__ == '__main__':
-    data = scrape_geeksforgeeks()
+    db_name = "Courses.db"
+    url = "https://practice.geeksforgeeks.org/courses?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses"
+
+    scraper = CourseScraper(db_name)
+    data = scraper.scrape_geeksforgeeks(url)
     print("Data to be inserted:", data)
-    insert_data(data)
+    scraper.insert_data(data)
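
A possible follow-up, not part of this commit: nothing in the refactored code guarantees cleanup on failure. driver.quit() runs only when the scrape succeeds, and the SQLite connection opened in __init__ is never closed. Below is a minimal sketch of a safer entry point, assuming CourseScraper can be imported from GFG/scraper.py; it reuses only names that appear in the diff, and the try/finally plus the conn.close() call are suggestions rather than committed code.

# Sketch only: the committed __main__ flow with cleanup added. CourseScraper,
# scrape_geeksforgeeks, and insert_data come from this diff; the try/finally
# and conn.close() are hypothetical additions.
from scraper import CourseScraper  # assumes this is run from inside GFG/

if __name__ == '__main__':
    scraper = CourseScraper("Courses.db")
    try:
        url = ("https://practice.geeksforgeeks.org/courses"
               "?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses")
        data = scraper.scrape_geeksforgeeks(url)
        print("Data to be inserted:", data)
        scraper.insert_data(data)
    finally:
        scraper.conn.close()  # the class exposes no close() helper yet

The same try/finally pattern inside scrape_geeksforgeeks, with driver.quit() moved into the finally block, would cover the browser half.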

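One further hedged note: the class strings matched in get_course_data, such as courseListingPage_courseCardContainer__lLZiS, look like hashed CSS-module names, so the __lLZiS-style suffix is likely to rotate whenever the GeeksforGeeks frontend is rebuilt. BeautifulSoup's class_ argument accepts a callable that is tested against each individual class value, which makes it possible to match on the stable prefix instead. The helper below is an illustrative alternative, not something the commit uses; find_course_cards is a hypothetical name, and soup is the BeautifulSoup object already built in scrape_geeksforgeeks.

def find_course_cards(soup):
    # bs4 calls the class_ callable once per class value, so this matches any
    # <a> whose class list contains the stable prefix and ignores the hash.
    return soup.find_all(
        "a",
        class_=lambda c: c is not None
        and c.startswith("courseListingPage_courseCardContainer"),
    )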