From 8d1bdaa8159ba992a217d5095baebb053ffb50cd Mon Sep 17 00:00:00 2001 From: dehatanes Date: Sun, 13 Mar 2022 13:46:59 -0300 Subject: [PATCH] [AC Spider][Feat] pdf parser --- covid19br/parsers/acre.py | 98 ++++++++++++++++++++++++++++ covid19br/parsers/extractor_utils.py | 13 ++++ covid19br/parsers/tocantins.py | 9 +-- covid19br/spiders/spider_ac.py | 53 ++++++++++++++- 4 files changed, 164 insertions(+), 9 deletions(-) create mode 100644 covid19br/parsers/acre.py diff --git a/covid19br/parsers/acre.py b/covid19br/parsers/acre.py new file mode 100644 index 00000000..25c273fc --- /dev/null +++ b/covid19br/parsers/acre.py @@ -0,0 +1,98 @@ +import re +from rows.plugins import pdf + +from covid19br.common.constants import State +from covid19br.common.data_normalization_utils import NormalizationUtils +from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number + +REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$") +CITY_NAME_TABLE_COLUMN = 0 +CONFIRMED_CASES_TABLE_COLUMN = 2 +DEATH_CASES_TABLE_COLUMN = 4 + + +def parse_int(value): + return int(value.replace(".", "")) + + +class AcreBulletinExtractor: + state = State.AC + + def __init__(self, filename): + self.doc = pdf.PyMuPDFBackend(filename) + + @property + def date(self): + first_page_objects = next( + self.doc.text_objects( + starts_after=re.compile("BOLETIM(.+)"), + ends_before=re.compile("SITUAÇÃO ATUAL(.+)"), + ) + ) + date_obj, *_ = match_object_from_regexp(REGEXP_DATE, first_page_objects) or [ + None + ] + if date_obj: + return NormalizationUtils.extract_in_full_date(" ".join(date_obj)) + + @property + def official_total(self): + first_page_objects = next( + self.doc.text_objects( + starts_after=re.compile("SITUAÇÃO ATUAL(.+)"), + ends_before=re.compile("DISTRIBUIÇÃO DOS CASOS(.+)"), + ) + ) + + # Unfortunately the text labels are images, which makes it difficult for us to get the numbers based on them. + # So we are going to infer which values we need based on their position 
(sometimes there are "ghost objects" + # in the page, but they are on the far left and won't interfere with this logic). + remaining_number_objs = [ + obj for obj in first_page_objects if is_only_number(obj.text) + ] + # We start by sorting the objects by x position and dropping the 2 rightmost ones (the little numbers on the bulletin) + ordered_by_x_axis = sorted(remaining_number_objs, key=lambda obj: obj.x0) + remaining_number_objs = ordered_by_x_axis[:-2] + # Of the 3 rightmost numbers left, the death cases figure is the one closest to the bottom + *_, death_cases_obj = sorted( + remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0) + ) + remaining_number_objs = remaining_number_objs[:-3] + # Of the next 3 rightmost numbers (the middle column), the confirmed cases figure is the one in the middle + _, confirmed_cases_obj, _ = sorted( + remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0) + ) + + return {"confirmados": confirmed_cases_obj.text, "mortes": death_cases_obj.text} + + @property + def data(self): + table_page_number = self._get_table_page_number() + if not table_page_number: + return None + page_objs = next(self.doc.text_objects( + starts_after=re.compile(".+DISTRIBUIÇÃO DOS CASOS CONFIRMADOS.+"), + ends_before=re.compile("Fonte:.+"), + page_numbers=(table_page_number,), + )) + + # Drop the header row: keep only objects that start past the city column header (y0 > header y1) + city_column_header = next(obj for obj in page_objs if "munic" in obj.text.lower()) + table_objs = [obj for obj in page_objs if obj.y0 > city_column_header.y1] + + lines = pdf.group_objects("y", table_objs, check_group=pdf.object_contains_center) + for line in lines: + city = line[CITY_NAME_TABLE_COLUMN].text.strip() + deaths = line[DEATH_CASES_TABLE_COLUMN].text.strip() + confirmed = line[CONFIRMED_CASES_TABLE_COLUMN].text.strip() + yield { + "municipio": city, + "confirmados": confirmed, + "mortes": deaths, + } + + def _get_table_page_number(self): + for page_number, page_objs in enumerate(self.doc.text_objects(), start=1): + for obj in page_objs: + if "TABELA" 
in obj.text and "DISTRIBUIÇÃO DOS CASOS CONFIRMADOS" in obj.text: + return page_number diff --git a/covid19br/parsers/extractor_utils.py b/covid19br/parsers/extractor_utils.py index e69de29b..dda38c36 100644 --- a/covid19br/parsers/extractor_utils.py +++ b/covid19br/parsers/extractor_utils.py @@ -0,0 +1,13 @@ +import re + + +def match_object_from_regexp(regexp, objects): + """Return regexp.findall() for the first object whose text matches, else None.""" + for obj in objects: + result = regexp.findall(obj.text) + if result: + return result + + +def is_only_number(value): + return re.compile("^([0-9.,]+)$").findall(value.strip()) diff --git a/covid19br/parsers/tocantins.py b/covid19br/parsers/tocantins.py index 52292d15..532af549 100644 --- a/covid19br/parsers/tocantins.py +++ b/covid19br/parsers/tocantins.py @@ -8,19 +8,12 @@ from covid19br.common.constants import State from covid19br.common.data_normalization_utils import NormalizationUtils from covid19br.common.demographic_utils import DemographicUtils +from covid19br.parsers.extractor_utils import match_object_from_regexp REGEXP_DAY_MONTH = re.compile("([0-9]+) de (.+)$") REGEXP_YEAR = re.compile("^de ([0-9]{4})$") -def match_object_from_regexp(regexp, objects): - """Return the matching result for""" - for obj in objects: - result = regexp.findall(obj.text) - if result: - return result - - def parse_int(value): return int(value.replace(".", "")) diff --git a/covid19br/spiders/spider_ac.py b/covid19br/spiders/spider_ac.py index 9df1e31b..26199019 100644 --- a/covid19br/spiders/spider_ac.py +++ b/covid19br/spiders/spider_ac.py @@ -80,7 +80,58 @@ def parse_news_bulletin(self, response, date): ) def parse_pdf_bulletin(self, response, date): - print(f"Let's que let's {response.request.url}") + source = response.request.url + with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as tmp: + tmp.write(response.body) + + extractor = AcreBulletinExtractor(tmp.name) + + pdf_date = extractor.date + if pdf_date and pdf_date != date: + self.logger.warning( + f"PDF 
date does not match for pdf {source}. Aborting extraction." + ) + return + + pdf_official_total = extractor.official_total + if pdf_official_total: + bulletin = StateTotalBulletinModel( + date=date, + state=self.state, + confirmed_cases=pdf_official_total["confirmados"], + deaths=pdf_official_total["mortes"], + source=response.request.url + " | Painel na primeira pag. do pdf.", + ) + self.add_new_bulletin_to_report(bulletin, date) + + pdf_data = list(extractor.data) + if not pdf_data: + if "parcial" not in source.lower(): + self.logger.error( + f"Couldn't extract data from pdf that is not parcial. Pdf source: {source}." + ) + return + + for row in pdf_data: + if row["municipio"].lower() == "total": + bulletin = StateTotalBulletinModel( + date=date, + state=self.state, + confirmed_cases=row["confirmados"], + deaths=row["mortes"], + source=response.request.url + + " | Tabela com dados dos municípios do pdf.", + ) + else: + bulletin = CountyBulletinModel( + date=date, + state=self.state, + city=row["municipio"], + confirmed_cases=row["confirmados"], + deaths=row["mortes"], + source=response.request.url, + ) + self.add_new_bulletin_to_report(bulletin, date) def _extract_cases_and_deaths_from_news(self, response, date): body_text = " ".join(