Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse decoupled csv files to sql #50

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
f3f18db
Initial commit for new scraper. Scrapy project creation
miguelpduarte Oct 17, 2021
7b9a05e
Add initial version of course and faculties spider
miguelpduarte Oct 17, 2021
a6d0f7e
Add README to new scraper module
miguelpduarte Oct 19, 2021
74e190c
Update course spider
miguelpduarte Oct 23, 2021
9df3ad5
Add item definitions for scraped items. Remove autogenerated pipeline…
miguelpduarte Oct 23, 2021
ce7b954
Update new scraper README
miguelpduarte Oct 23, 2021
a2bbdd1
Use settings to define a CSV feed to export scraped data
miguelpduarte Oct 23, 2021
5b1e56c
Update courses and faculties spiders to use defined Items. Update cou…
miguelpduarte Oct 23, 2021
379e359
Add: boilerplate to parse csv.
Jumaruba Nov 13, 2021
7d8d058
Add: creating faculties table.
Jumaruba Nov 13, 2021
b60eab6
Add: parsing faculties and created makefile.
Jumaruba Nov 13, 2021
44cdbb1
Modify: creating mysql tables for courses and faculty
Jumaruba Dec 1, 2021
0b56f21
Add: faculties generating insert
Jumaruba Dec 2, 2021
367049b
Add: configparser and main class
Jumaruba Dec 2, 2021
8ea0283
Add: Creating association course_faculty
Jumaruba Dec 2, 2021
e82a333
Fix: table names and some cols
Jumaruba Dec 2, 2021
8efeb05
Restore: previous db_creation
Jumaruba Dec 2, 2021
b00bc6d
Remove: deleted mysql and phpmyadmin files, since will be used in ano…
Jumaruba Dec 26, 2021
e812869
Update: add order in which each file will be added to the db
Jumaruba Dec 26, 2021
e590960
Update: requirements
Jumaruba Dec 26, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Python environment
env_scrapper/**
csv_to_sql/sql_parser/**

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -114,3 +118,5 @@ node_modules/
# Intellij
.idea



2 changes: 2 additions & 0 deletions csv_to_sql/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Csv parser

9 changes: 9 additions & 0 deletions csv_to_sql/configparser.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[course]
csv = courses
faculties_col = faculties

[faculty]
csv = faculties

[course_faculty]

10 changes: 10 additions & 0 deletions csv_to_sql/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
astunparse==1.6.3
Jinja2==3.0.3
MarkupSafe==2.0.1
numpy==1.21.4
pandas==1.3.4
pdoc==8.0.1
Pygments==2.10.0
python-dateutil==2.8.2
pytz==2021.3
six==1.16.0
Empty file added csv_to_sql/src/__init__.py
Empty file.
26 changes: 26 additions & 0 deletions csv_to_sql/src/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

from .faculty import Faculty
from .course import Course
from .course_faculty import Course_Faculty

import configparser as cp
import os

def rename_file(order: int, filename: str, path: str = None):
    """Prefix a generated SQL file with the position at which it must be run.

    Renames ``<path>/<filename>.sql`` to ``<path>/<order>_<filename>.sql``.

    Args:
        order: Position of the file in the database load order.
        filename: Base name of the SQL file, without the ``.sql`` extension.
        path: Directory holding the SQL files. Defaults to the ``data/sql``
            directory one level above this module, which is where
            ``Parser.get_output_filepath`` writes, so the function no longer
            depends on the current working directory.

    Raises:
        FileNotFoundError: If ``<path>/<filename>.sql`` does not exist.
    """
    if path is None:
        path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "..", "data", "sql"
        )
    old_name = os.path.join(path, f"{filename}.sql")
    new_name = os.path.join(path, f"{order}_{filename}.sql")
    # os.replace overwrites a numbered file left behind by a previous run on
    # every platform; os.rename raises on Windows when the target exists.
    os.replace(old_name, new_name)

# Order in which the generated files must be added to the database.
# NOTE(review): course_faculty presumably comes last because it refers to
# rows of the other two tables; confirm against the schema.
order = ["faculty", "course", "course_faculty"]

# Resolve the configuration file relative to this module (it lives one level
# above the package) so the script does not depend on the working directory.
config_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "configparser.ini"
)
config = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed; fail here instead of with a KeyError on
# the first missing section.
if not config.read(config_path):
    raise FileNotFoundError(f"Configuration file not found: {config_path}")

# Generate one SQL file per table. parse() returns nothing, so the parser
# instances are not kept.
for parser_cls in (Faculty, Course, Course_Faculty):
    parser_cls(config).parse()

# Prefix every generated file with its 1-based position in the load order.
for position, filename in enumerate(order, start=1):
    rename_file(position, filename)
30 changes: 30 additions & 0 deletions csv_to_sql/src/course.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from .parser import Parser
import configparser as cp

class Course(Parser):
    """Generates the INSERT statements for the ``course`` table."""

    def __init__(self, config: cp.ConfigParser):
        """Open the CSV named by ``config['course']['csv']`` for parsing.

        Args:
            config: Parsed configuration; the ``course`` section must define
                ``csv`` and ``faculties_col``.
        """
        self.config = config
        super().__init__("course", config['course']['csv'])

    def parse(self):
        """Write one INSERT statement per CSV row to the output SQL file.

        The column named by ``config['course']['faculties_col']`` is dropped
        from the header and from every row. Row ids are the 0-based positions
        of the rows in the CSV. The output file is closed when this method
        returns or raises.

        Raises:
            StopIteration: If the CSV has no header row.
            ValueError: If the configured faculties column is not in the
                header.
        """
        try:
            cols_list = next(self.f_reader)

            # Locate the faculties column, then drop it from the header.
            faculties_col_name = self.config['course']['faculties_col']
            faculties_index = cols_list.index(faculties_col_name)
            del cols_list[faculties_index]
            cols = self.get_cols(cols_list)

            # csv.reader yields [] for blank lines. Skip them before
            # numbering so a stray blank line neither crashes the `del`
            # below nor shifts the ids away from the row positions pandas
            # (which skips blank lines by default) gives Course_Faculty.
            data_rows = (row for row in self.f_reader if row)
            for course_id, row in enumerate(data_rows):
                del row[faculties_index]  # Remove faculties position.
                values = self.get_values(course_id, row)
                self.f_sql.write(self.sql_get_insert(cols, values))
        finally:
            # Flush and release the output deterministically: the caller
            # renames the file right after parse() returns.
            self.f_sql.close()





33 changes: 33 additions & 0 deletions csv_to_sql/src/course_faculty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import configparser as cp
from .parser import Parser
import pandas as pd
from ast import literal_eval

class Course_Faculty(Parser):
    """Generates the INSERT statements for the course/faculty association."""

    def __init__(self, config: cp.ConfigParser):
        """Load the faculty and course CSVs named in ``config``.

        Args:
            config: Parsed configuration; reads ``config['faculty']['csv']``,
                ``config['course']['csv']`` and, in ``parse``,
                ``config['course']['faculties_col']``.
        """
        self.config = config
        super().__init__("course_faculty", None)
        self.df_faculty = pd.read_csv(self.get_input_filepath(config['faculty']['csv']), index_col=False)
        self.df_course = pd.read_csv(self.get_input_filepath(config['course']['csv']), index_col=False)

        # Map each acronym to its row position once, instead of scanning the
        # whole DataFrame for every lookup. With index_col=False the index is
        # the default 0..n-1 range, so position equals index label.
        # setdefault keeps the first occurrence of a duplicated acronym.
        self._faculty_ids = {}
        for faculty_id, acronym in enumerate(self.df_faculty['acronym']):
            self._faculty_ids.setdefault(acronym, faculty_id)

    def get_faculty_id(self, faculty_acronym: str):
        """Return the 0-based row position of ``faculty_acronym``.

        Raises:
            IndexError: If the acronym is not present in the faculty CSV.
        """
        try:
            return self._faculty_ids[faculty_acronym]
        except KeyError:
            # IndexError is what the positional lookup used to raise; keep
            # the type but say which acronym is missing.
            raise IndexError(
                f"Unknown faculty acronym: {faculty_acronym!r}"
            ) from None

    def parse(self):
        """Write one INSERT per (course, faculty) pair to the output file.

        Course ids are the 0-based row positions in the course CSV. The
        output file is closed when this method returns or raises.
        """
        cols = self.get_cols(["course_id", "faculty_id"], with_id=False)
        faculties_col_name = self.config['course']['faculties_col']

        try:
            # For each course, get the ids of the faculties it belongs to.
            for course_id, faculties in enumerate(self.df_course[faculties_col_name]):
                # The cell is parsed as a Python literal.
                # NOTE(review): presumably a list of acronym strings such as
                # "['feup', 'fcup']"; confirm against the scraper output.
                faculties_acronyms = literal_eval(faculties)
                # Resolve every acronym before writing, so an unknown one
                # does not leave a partially written course behind.
                faculties_ids = [self.get_faculty_id(a) for a in faculties_acronyms]
                # Create one row in the association table per faculty id.
                for faculty_id in faculties_ids:
                    values = self.get_values(None, [course_id, faculty_id], with_id=False)
                    self.f_sql.write(self.sql_get_insert(cols, values))
        finally:
            # Flush and release the output deterministically: the caller
            # renames the file right after parse() returns.
            self.f_sql.close()

15 changes: 15 additions & 0 deletions csv_to_sql/src/faculty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .parser import Parser
import configparser as cp

class Faculty(Parser):
    """Generates the INSERT statements for the ``faculty`` table."""

    def __init__(self, config: cp.ConfigParser):
        """Open the CSV named by ``config['faculty']['csv']`` for parsing.

        Args:
            config: Parsed configuration; the ``faculty`` section must define
                ``csv``.
        """
        self.config = config
        super().__init__("faculty", config['faculty']['csv'])

    def parse(self):
        """Write one INSERT statement per CSV row to the output SQL file.

        The first CSV row supplies the column names. Row ids are the 0-based
        positions of the remaining rows. The output file is closed when this
        method returns or raises.

        Raises:
            StopIteration: If the CSV has no header row.
        """
        try:
            cols_list = next(self.f_reader)
            cols = self.get_cols(cols_list)
            # csv.reader yields [] for blank lines. Skip them before
            # numbering so a stray blank line neither emits an INSERT with
            # no values nor shifts the ids away from the row positions
            # pandas (which skips blank lines by default) uses in
            # Course_Faculty.
            data_rows = (row for row in self.f_reader if row)
            for faculty_id, row in enumerate(data_rows):
                values = self.get_values(faculty_id, row)
                self.f_sql.write(self.sql_get_insert(cols, values))
        finally:
            # Flush and release the output deterministically: the caller
            # renames the file right after parse() returns.
            self.f_sql.close()
50 changes: 50 additions & 0 deletions csv_to_sql/src/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from abc import abstractclassmethod
import os
import csv

class Parser:
    """Base class that turns a CSV file into a file of SQL INSERT statements.

    Input is read from ``../data/raw/<csv_name>.csv`` and output is written
    to ``../data/sql/<table_name>.sql``, both relative to this module.
    Subclasses implement ``parse()``. Instances can be used as context
    managers to close their files deterministically.
    """

    def __init__(self, table_name: str, csv_name: str = None):
        """Open the output SQL file and, if given, the input CSV.

        Args:
            table_name: Target table; also the base name of the output file.
            csv_name: Base name of the input CSV, or ``None`` when the
                subclass reads its input some other way.
        """
        self.current_path = os.path.dirname(os.path.abspath(__file__))
        self.table_name = table_name
        self.csv_name = csv_name
        self.f_csv = None
        self.f_reader = None

        # Open the input first so a missing CSV does not truncate an
        # existing output file.
        if csv_name is not None:
            # UTF-8 matches the encoding used for the output; newline="" is
            # what the csv module requires to handle embedded newlines.
            self.f_csv = open(
                self.get_input_filepath(), "r", encoding="utf-8", newline=""
            )
            self.f_reader = csv.reader(self.f_csv)

        try:
            self.f_sql = open(self.get_output_filepath(), "w", encoding="utf-8")
        except OSError:
            if self.f_csv is not None:
                self.f_csv.close()
            raise

    def close(self):
        """Close the input CSV (if any) and the output SQL file."""
        for handle in (getattr(self, "f_csv", None), getattr(self, "f_sql", None)):
            if handle is not None:
                handle.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def add_brackets_vals(self, x: str):
        """Return ``x`` as a single-quoted SQL string literal.

        Single quotes are doubled and backslashes are escaped, so values
        such as ``Master's`` do not terminate the literal early. MySQL (the
        backtick identifier quoting used by this class is MySQL syntax)
        treats the backslash as an escape character in its default SQL mode.
        Non-string values are converted with ``str()``.
        """
        escaped = str(x).replace("\\", "\\\\").replace("'", "''")
        return f"'{escaped}'"

    def add_brackets_cols(self, x: str):
        """Return ``x`` as a backtick-quoted identifier."""
        escaped = str(x).replace("`", "``")
        return f"`{escaped}`"

    def sql_get_insert(self, cols: str, values: str):
        """Build one INSERT statement.

        Args:
            cols: Comma-separated, already quoted column names.
            values: Comma-separated, already quoted values.
        """
        return f"INSERT INTO {self.table_name} ({cols}) VALUES ({values}); \n"

    def get_input_filepath(self, csv_name=None):
        """Return the path of a raw CSV; defaults to this parser's own."""
        if csv_name is None:
            csv_name = self.csv_name
        return f"{self.current_path}/../data/raw/{csv_name}.csv"

    def get_output_filepath(self):
        """Return the path of the SQL file generated for this table."""
        return f"{self.current_path}/../data/sql/{self.table_name}.sql"

    def get_cols(self, cols_list, with_id=True):
        """Return the quoted column list, optionally prefixed with ``id``."""
        names = ['id'] + cols_list if with_id else cols_list
        return ','.join(map(self.add_brackets_cols, names))

    def get_values(self, id_, row, with_id=True):
        """Return the quoted values of ``row``.

        When ``with_id`` is true, ``id_`` is prepended unquoted.
        """
        quoted = [self.add_brackets_vals(value) for value in row]
        if with_id:
            quoted.insert(0, str(id_))
        return ','.join(quoted)

    def parse(self):
        """Generate the SQL file; must be implemented by subclasses."""
        raise NotImplementedError

    # NOTE(review): subclasses implement parse(), not parser(), and this
    # class does not use ABCMeta, so the decorator enforces nothing. Kept
    # unchanged for backward compatibility.
    @abstractclassmethod
    def parser(self):
        pass
2 changes: 0 additions & 2 deletions mysql/Dockerfile

This file was deleted.

184 changes: 0 additions & 184 deletions mysql/db_creation.sql

This file was deleted.

1 change: 0 additions & 1 deletion phpmyadmin/Dockerfile

This file was deleted.

1 change: 1 addition & 0 deletions scrape_to_csv/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
output
Loading