Refactoring and Improvements for GeeksforGeeks Course Scraper #55

Merged · 1 commit · Oct 31, 2023
Changes from all commits
GFG/scraper.py: 105 changes (57 additions, 48 deletions)
@@ -1,67 +1,76 @@
-from bs4 import BeautifulSoup
 import sqlite3
+from bs4 import BeautifulSoup
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 
-conn = sqlite3.connect("Courses.db")
+class CourseScraper:
+    def __init__(self, db_name):
+        # Initialize SQLite database connection
+        self.conn = sqlite3.connect(db_name)
+        self.cur = self.conn.cursor()
+        self.create_courses_table()
 
-cur = conn.cursor()
-cur.execute('''
-CREATE TABLE IF NOT EXISTS COURSES (
+    def create_courses_table(self):
+        # Create the 'COURSES' table if it doesn't exist
+        self.cur.execute('''
+        CREATE TABLE IF NOT EXISTS COURSES (
             NAME text,
             RATING text,
             INTERESTED text,
             PRICE text
         )
-''')
-conn.commit()
+        ''')
+        self.conn.commit()
 
-def insert_data(data):
-    try:
-        cur.executemany("INSERT INTO COURSES VALUES (?, ?, ?, ?)", data)
-        conn.commit()
-        print("Data successfully inserted.")
-    except sqlite3.Error as e:
-        print(f"An error occurred: {e}")
+    def insert_data(self, data):
+        try:
+            self.cur.executemany("INSERT INTO COURSES VALUES (?, ?, ?, ?)", data)
+            self.conn.commit()
+            print("Data successfully inserted.")
+        except sqlite3.Error as e:
+            print(f"An error occurred: {e}")
 
-def get_course_data(course_section):
-    courses = []
-    for item in course_section.find_all("a", class_="ui card courseListingPage_courseCardContainer__lLZiS"):
-        course_name = item.find("h4", class_="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading").text
-        course_rating = item.find("span", class_="urw-din")
-        course_rating = "Information not available" if course_rating is None else course_rating.text
-        course_interested = item.find("div", class_="courseListingPage_descriptionText__zN_K1 sofia-pro g-opacity-50 g-mb-0 grid_with__meta").text.split(" ")[0]
-        course_price = item.find("p", class_="sofia-pro g-mb-0 courseListingPage_batchFee__0NlbJ")
-        course_price = "0" if course_price is None else course_price.text
+    def get_course_data(self, course_section):
+        courses = []
+        for item in course_section.find_all("a", class_="ui card courseListingPage_courseCardContainer__lLZiS"):
+            course_name = item.find("h4", class_="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading").text
+            course_rating = item.find("span", class_="urw-din")
+            course_rating = "Information not available" if course_rating is None else course_rating.text
+            course_interested = item.find("div", class_="courseListingPage_descriptionText__zN_K1 sofia-pro g-opacity-50 g-mb-0 grid_with__meta").text.split(" ")[0]
+            course_price = item.find("p", class_="sofia-pro g-mb-0 courseListingPage_batchFee__0NlbJ")
+            course_price = "0" if course_price is None else course_price.text
 
-        courses.append((
-            course_name,
-            course_rating,
-            course_interested,
-            course_price
-        ))
+            courses.append((
+                course_name,
+                course_rating,
+                course_interested,
+                course_price
+            ))
 
-    return courses
+        return courses
 
-def scrape_geeksforgeeks():
-    url = "https://practice.geeksforgeeks.org/courses?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses"
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=service)
-    driver.get(url)
-    html = driver.page_source
-    soup = BeautifulSoup(html, "html.parser")
+    def scrape_geeksforgeeks(self, url):
+        # Initialize ChromeDriver and scrape the course data
+        service = webdriver.chrome.service.Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service)
+        driver.get(url)
+        html = driver.page_source
+        soup = BeautifulSoup(html, "html.parser")
 
-    popular_courses = soup.find("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_toggleCourseCards__pWBVA")
-    other_courses = soup.find_all("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_courseCardsGrid__VYBzZ")
-    all_courses_data = get_course_data(popular_courses)
-    for course in other_courses:
-        course_data = get_course_data(course)
-        all_courses_data.extend(course_data)
-    driver.quit()
-    return all_courses_data
+        popular_courses = soup.find("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_toggleCourseCards__pWBVA")
+        other_courses = soup.find_all("div", class_="ui cards courseListingPage_cardLayout__multW courseListingPage_courseCardsGrid__VYBzZ")
+        all_courses_data = self.get_course_data(popular_courses)
+        for course in other_courses:
+            course_data = self.get_course_data(course)
+            all_courses_data.extend(course_data)
+        driver.quit()
+        return all_courses_data
 
 if __name__ == '__main__':
-    data = scrape_geeksforgeeks()
+    db_name = "Courses.db"
+    url = "https://practice.geeksforgeeks.org/courses?utm_source=geeksforgeeks&utm_medium=main_header&utm_campaign=courses"
+
+    scraper = CourseScraper(db_name)
+    data = scraper.scrape_geeksforgeeks(url)
     print("Data to be inserted:", data)
-    insert_data(data)
+    scraper.insert_data(data)
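
A quick way to sanity-check a run of the refactored scraper is to read the rows back out of Courses.db. A minimal sketch, assuming the script above has completed once; the SELECT query and the row limit are illustrative, not part of this PR:

import sqlite3

# Illustrative check, not part of this PR: read back what
# CourseScraper.insert_data() wrote into the COURSES table.
conn = sqlite3.connect("Courses.db")
cur = conn.cursor()
cur.execute("SELECT NAME, RATING, INTERESTED, PRICE FROM COURSES LIMIT 5")
for name, rating, interested, price in cur.fetchall():
    print(f"{name}: rating={rating}, interested={interested}, price={price}")
conn.close()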
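
One design note on the new scrape_geeksforgeeks: it drops the explicit Service import and reaches the class through webdriver.chrome.service.Service, which resolves only because Selenium imports that submodule internally, so the explicit import is sturdier. The setup also opens a visible browser window. Below is a sketch of the same driver construction with the import restored and Chrome run headless; the --headless=new flag is a standard Chrome option, not something this PR adds:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Illustrative variant of the driver setup in scrape_geeksforgeeks():
# explicit Service import plus headless mode, so the scraper can run
# on a server or in CI without opening a browser window.
options = Options()
options.add_argument("--headless=new")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

Everything else in scrape_geeksforgeeks would stay the same; only the driver construction changes.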