Skip to content

Commit

Permalink
feat: scrape teacher sigarra url
Browse files Browse the repository at this point in the history
  • Loading branch information
tomaspalma committed Sep 14, 2024
1 parent f55a788 commit f5dcbf3
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 29 deletions.
1 change: 1 addition & 0 deletions src/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ num_course_units=6500
num_course_metadata=10000
num_classes=12000
num_slots=32000
num_professor_link=3500
num_slot_professor=55000
num_slot_class=32000
num_professors=3500
Expand Down
13 changes: 11 additions & 2 deletions src/scrapper/database/dbs/create_db_sqlite3.sql
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,24 @@ CREATE TABLE `slot_class` (
PRIMARY KEY (`slot_id`, `class_id`)
);

--
-- Table structure for table `professor_link`
-- Holds one link string per id; `slot_professor`.`professor_link_id`
-- references `id` in this table.
--
CREATE TABLE `professor_link` (
`id` INTEGER PRIMARY KEY,
`link` varchar(256)
);

-- --------------------------------------------------------

--
-- Table structure for table `class_professor`
-- Table structure for table `slot_professor`
--

CREATE TABLE `slot_professor` (
`slot_id` INTEGER NOT NULL,
`professor_id` INTEGER NOT NULL,
`professor_link_id` INTEGER NOT NULL, -- required; must match a row in `professor_link`
FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
-- At most one row per (slot, professor) pair.
PRIMARY KEY (`slot_id`, `professor_id`)
);

Expand All @@ -133,7 +141,8 @@ CREATE TABLE `slot_professor` (
CREATE TABLE `professor` (
`id` INTEGER PRIMARY KEY,
`professor_acronym` varchar(16),
`professor_name` varchar(100)
`professor_name` varchar(100),
`professor_url` varchar(128)
);

-- --------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/scrapper/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
print("Saving scrapper info...")
db = Database()
db.insert('info', {'date': datetime.now()})
db.connection.close()
db.connection.close()
7 changes: 5 additions & 2 deletions src/scrapper/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,18 +64,21 @@ class Slot(scrapy.Item):
professor_id = scrapy.Field()
last_updated = scrapy.Field()


class SlotClass(scrapy.Item):
    """Scraped item pairing a slot id with a class id."""

    slot_id = scrapy.Field()
    class_id = scrapy.Field()

class ProfessorLink(scrapy.Item):
    """Scraped item carrying an ``id`` and a ``link`` field.

    The field names match the columns of the ``professor_link`` table.
    """

    id = scrapy.Field()
    link = scrapy.Field()

class SlotProfessor(scrapy.Item):
    """Scraped item tying a slot to a professor and to a professor link.

    The field names match the columns of the ``slot_professor`` table.
    """

    slot_id = scrapy.Field()
    professor_id = scrapy.Field()
    professor_link_id = scrapy.Field()

class Professor(scrapy.Item):
    """Scraped item describing a professor: id, acronym, name and URL.

    The field names match the columns of the ``professor`` table.
    """

    id = scrapy.Field()
    professor_acronym = scrapy.Field()
    professor_name = scrapy.Field()
    professor_url = scrapy.Field()
49 changes: 30 additions & 19 deletions src/scrapper/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tqdm import tqdm


class MySQLPipeline():
class DBPipeline():
def __init__(self):
self.open_config()
self.db = Database()
Expand Down Expand Up @@ -79,9 +79,9 @@ def process_pbar(self):
# -------------------------------------------------------------------------


class FacultyPipeline(MySQLPipeline):
class FacultyPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_faculties'])
self.table_name = 'faculty'

Expand All @@ -91,9 +91,9 @@ def process_item(self, item, spider):
return item


class CoursePipeline(MySQLPipeline):
class CoursePipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_courses'])
self.table_name = 'course'

Expand All @@ -103,9 +103,9 @@ def process_item(self, item, spider):
return item


class CourseUnitPipeline(MySQLPipeline):
class CourseUnitPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_course_units'])
self.table_name = 'course_unit'

Expand All @@ -115,9 +115,9 @@ def process_item(self, item, spider):
return item


class CourseMetadataPipeline(MySQLPipeline):
class CourseMetadataPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(
self.config['statistics']['num_course_metadata'])
self.table_name = 'course_metadata'
Expand All @@ -128,9 +128,9 @@ def process_item(self, item, spider):
return item


class ClassPipeline(MySQLPipeline):
class ClassPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_classes'])
self.table_name = 'class'

Expand All @@ -140,9 +140,9 @@ def process_item(self, item, spider):
return item


class SlotPipeline(MySQLPipeline):
class SlotPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_slots'])
self.table_name = 'slot'

Expand All @@ -152,9 +152,9 @@ def process_item(self, item, spider):
return item


class SlotClassPipeline(MySQLPipeline):
class SlotClassPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_slot_class'])
self.table_name = 'slot_class'

Expand All @@ -164,9 +164,9 @@ def process_item(self, item, spider):
return item


class SlotProfessorPipeline(MySQLPipeline):
class SlotProfessorPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(
self.config['statistics']['num_slot_professor'])
self.table_name = 'slot_professor'
Expand All @@ -176,10 +176,21 @@ def process_item(self, item, spider):
super().process_item(item, spider)
return item

class ProfessorLinkPipeline(DBPipeline):
    """Pipeline for ``ProfessorLink`` items, targeting the ``professor_link`` table."""

    def __init__(self):
        """Run the base initialiser, then set the expected count and table name."""
        DBPipeline.__init__(self)
        statistics = self.config['statistics']
        self.expected_num = int(statistics['num_professor_link'])
        self.table_name = 'professor_link'

    def process_item(self, item, spider):
        """Pass ``ProfessorLink`` items to the parent ``process_item``.

        Items of any other type are left untouched. The item is returned
        in every case.
        """
        if not isinstance(item, items.ProfessorLink):
            return item
        super().process_item(item, spider)
        return item

class ProfessorsPipeline(MySQLPipeline):
class ProfessorsPipeline(DBPipeline):
def __init__(self):
MySQLPipeline.__init__(self)
DBPipeline.__init__(self)
self.expected_num = int(self.config['statistics']['num_professors'])
self.table_name = 'professor'

Expand Down
1 change: 1 addition & 0 deletions src/scrapper/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
'scrapper.pipelines.ClassPipeline': 0,
'scrapper.pipelines.SlotPipeline': 0,
'scrapper.pipelines.SlotProfessorPipeline': 0,
'scrapper.pipelines.ProfessorLinkPipeline': 0,
'scrapper.pipelines.ProfessorsPipeline': 0,
'scrapper.pipelines.SlotClassPipeline': 0
}
Expand Down
32 changes: 29 additions & 3 deletions src/scrapper/spiders/slot_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from scrapper.settings import CONFIG, PASSWORD, USERNAME

from ..database.Database import Database
from ..items import Slot, Class, SlotProfessor, Professor, SlotClass
from ..items import Slot, Class, SlotProfessor, Professor, SlotClass, ProfessorLink


def get_class_id(course_unit_id, class_name):
Expand Down Expand Up @@ -102,6 +102,23 @@ def check_login_response(self, response):
response.status), flush=True)
self.log('Login Failed. HTTP Error {}'.format(response.status))

def professor_link_exists(self, id: int) -> bool:
    """Return whether a ``professor_link`` row with the given id exists.

    Opens its own ``Database`` connection for the lookup and closes it
    before returning, including when the query raises.

    Args:
        id: Primary key to look up. It is coerced with ``int()`` before
            being placed in the SQL text, so a non-numeric value raises
            ``TypeError``/``ValueError`` instead of reaching the database.

    Returns:
        True if the query returned a row, False otherwise.

    NOTE(review): the visible caller in ``extractSchedule`` yields a new
    ``ProfessorLink`` only when this returns True, which looks inverted
    -- confirm the intended condition there.
    """
    # Coerce up front: the value is interpolated into the statement, and
    # the placeholder style of the project's Database cursor is not
    # visible here, so int() is what keeps arbitrary text out of the SQL.
    link_id = int(id)
    sql = """
        SELECT id
        FROM professor_link
        WHERE id = {}
    """.format(link_id)

    db = Database()
    try:
        db.cursor.execute(sql)
        return db.cursor.fetchone() is not None
    finally:
        # Always release the connection, even if execute() fails.
        db.connection.close()

def classUnitRequests(self):
db = Database()
sql = """
Expand Down Expand Up @@ -180,7 +197,8 @@ def extractSchedule(self, response):
yield Professor(
id=sigarra_id,
professor_acronym=teacher["acronym"],
professor_name=name
professor_name=name,
professor_url=teacher["sigarra_url"]
)

for current_class in schedule["classes"]:
Expand All @@ -205,10 +223,18 @@ def extractSchedule(self, response):
for teacher in schedule["persons"]:
(sigarra_id, name) = self.get_professor_info(
teacher)

professor_link_id = schedule["id"]
if self.professor_link_exists(professor_link_id):
yield ProfessorLink(
id=schedule["id"],
link=teacher["sigarra_url"]
)

yield SlotProfessor(
slot_id=schedule["id"],
professor_id=sigarra_id
professor_id=sigarra_id,
professor_link_id=schedule["id"]
)

for current_class in schedule["classes"]:
Expand Down
1 change: 1 addition & 0 deletions src/scripts/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def dump(self):
self.dump_table("class", con, f)
self.dump_table("slot", con, f)
self.dump_table("slot_class", con, f)
self.dump_table("professor_link", con, f)
self.dump_table("slot_professor", con, f)
f.close()

Expand Down
10 changes: 9 additions & 1 deletion src/scripts/dump/schema/create_db_sqlite3.sql
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ CREATE TABLE `slot_class` (
PRIMARY KEY (`slot_id`, `class_id`)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

--
-- Table structure for table `professor_link`
-- Holds one link string per id; `slot_professor`.`professor_link_id`
-- references `id` in this table.
--
CREATE TABLE `professor_link` (
`id` INTEGER PRIMARY KEY,
`link` varchar(256)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

-- --------------------------------------------------------

--
Expand All @@ -128,8 +133,10 @@ CREATE TABLE `slot_class` (
CREATE TABLE `slot_professor` (
`slot_id` INTEGER NOT NULL,
`professor_id` INTEGER NOT NULL,
`professor_link_id` INTEGER NOT NULL, -- required; must match a row in `professor_link`
FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
-- At most one row per (slot, professor) pair.
PRIMARY KEY (`slot_id`, `professor_id`)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

Expand All @@ -142,7 +149,8 @@ CREATE TABLE `slot_professor` (
CREATE TABLE `professor` (
`id` INTEGER PRIMARY KEY,
`professor_acronym` varchar(16),
`professor_name` varchar(100)
`professor_name` varchar(100),
`professor_url` varchar(128)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

-- --------------------------------------------------------
Expand Down
10 changes: 9 additions & 1 deletion src/scripts/dump/schema/schema_mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,13 @@ CREATE TABLE `slot_class` (
CREATE TABLE `professor` (
`id` INTEGER PRIMARY KEY,
`professor_acronym` varchar(16),
`professor_name` varchar(100)
`professor_name` varchar(100),
`professor_url` varchar(128)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

--
-- Table structure for table `professor_link`
-- Holds one link string per id; `slot_professor`.`professor_link_id`
-- references `id` in this table.
--
CREATE TABLE `professor_link` (
`id` INTEGER PRIMARY KEY,
`link` varchar(256)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

-- --------------------------------------------------------
Expand All @@ -144,8 +150,10 @@ CREATE TABLE `professor` (
CREATE TABLE `slot_professor` (
`slot_id` INTEGER NOT NULL,
`professor_id` INTEGER NOT NULL,
`professor_link_id` INTEGER NOT NULL, -- required; must match a row in `professor_link`
FOREIGN KEY (`slot_id`) REFERENCES `slot` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_id`) REFERENCES `professor` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`professor_link_id`) REFERENCES `professor_link` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
-- At most one row per (slot, professor) pair.
PRIMARY KEY (`slot_id`, `professor_id`)
) ENGINE=InnoDB CHARSET = utf8 COLLATE = utf8_general_ci;

Expand Down

0 comments on commit f5dcbf3

Please sign in to comment.