[AC Spider][Feat] pdf parser
dehatanes committed Mar 13, 2022
1 parent 418e0ea commit 8d1bdaa
Showing 4 changed files with 164 additions and 9 deletions.
98 changes: 98 additions & 0 deletions covid19br/parsers/acre.py
@@ -0,0 +1,98 @@
import re
from rows.plugins import pdf

from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number

REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$")
CITY_NAME_TABLE_COLUMN = 0
CONFIRMED_CASES_TABLE_COLUMN = 2
DEATH_CASES_TABLE_COLUMN = 4


def parse_int(value):
return int(value.replace(".", ""))


class AcreBulletinExtractor:
state = State.AC

def __init__(self, filename):
self.doc = pdf.PyMuPDFBackend(filename)

@property
def date(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("BOLETIM(.+)"),
ends_before=re.compile("SITUAÇÃO ATUAL(.+)"),
)
)
date_obj, *_ = match_object_from_regexp(REGEXP_DATE, first_page_objects) or [
None
]
if date_obj:
return NormalizationUtils.extract_in_full_date(" ".join(date_obj))

@property
def official_total(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("SITUAÇÃO ATUAL(.+)"),
ends_before=re.compile("DISTRIBUIÇÃO DOS CASOS(.+)"),
)
)

        # Unfortunately, the text labels are images, which makes it difficult to get the numbers based on them.
        # So we infer which values we need based on their positions (sometimes there are "ghost objects"
        # on the page, but they are on the far left and won't interfere with this logic).
remaining_number_objs = [
obj for obj in first_page_objects if is_only_number(obj.text)
]
        # We start by ordering the objects and dropping the last 2 on the right (the small numbers on the bulletin)
ordered_by_x_axis = sorted(remaining_number_objs, key=lambda obj: obj.x0)
remaining_number_objs = ordered_by_x_axis[:-2]
        # Of the 3 numbers on the far right, the death count is the one closest to the bottom
*_, death_cases_obj = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)
remaining_number_objs = remaining_number_objs[:-3]
        # Of the 3 rightmost remaining numbers (the middle column), the confirmed cases count is the one in the middle
_, confirmed_cases_obj, _ = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)

return {"confirmados": confirmed_cases_obj.text, "mortes": death_cases_obj.text}

@property
def data(self):
table_page_number = self._get_table_page_number()
if not table_page_number:
return None
page_objs = next(self.doc.text_objects(
starts_after=re.compile(".+DISTRIBUIÇÃO DOS CASOS CONFIRMADOS.+"),
ends_before=re.compile("Fonte:.+"),
page_numbers=(table_page_number,),
))

# remove headers
city_column_header = next(obj for obj in page_objs if "munic" in obj.text.lower())
table_objs = [obj for obj in page_objs if obj.y0 > city_column_header.y1]

lines = pdf.group_objects("y", table_objs, check_group=pdf.object_contains_center)
for line in lines:
city = line[CITY_NAME_TABLE_COLUMN].text.strip()
deaths = line[DEATH_CASES_TABLE_COLUMN].text.strip()
confirmed = line[CONFIRMED_CASES_TABLE_COLUMN].text.strip()
yield {
"municipio": city,
"confirmados": confirmed,
"mortes": deaths,
}

def _get_table_page_number(self):
for page_number, page_objs in enumerate(self.doc.text_objects(), start=1):
for obj in page_objs:
if "TABELA" in obj.text and "DISTRIBUIÇÃO DOS CASOS CONFIRMADOS" in obj.text:
return page_number
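
For reference, a minimal usage sketch of the extractor added above, assuming a locally downloaded bulletin (the file name below is hypothetical):

from covid19br.parsers.acre import AcreBulletinExtractor

extractor = AcreBulletinExtractor("boletim_ac.pdf")  # hypothetical local PDF path
print(extractor.date)             # date parsed from the "<dia> de <mês> de <ano>" heading
print(extractor.official_total)   # {"confirmados": "...", "mortes": "..."} from the first-page panel
for row in extractor.data or []:  # one dict per municipality; None if the table page is missing
    print(row["municipio"], row["confirmados"], row["mortes"])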
13 changes: 13 additions & 0 deletions covid19br/parsers/extractor_utils.py
@@ -0,0 +1,13 @@
import re


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def is_only_number(value):
return re.compile("^([0-9.,]+)$").findall(value.strip())
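
A quick sketch of how these helpers behave; SimpleNamespace is used only as a stand-in for a PDF text object with a .text attribute and is not part of the codebase:

import re
from types import SimpleNamespace
from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number

objects = [SimpleNamespace(text="RIO BRANCO"), SimpleNamespace(text="10 de março de 2022")]
# Returns the findall result of the first object whose text matches the regexp
print(match_object_from_regexp(re.compile("([0-9]+) de (.+) de ([0-9]{4})$"), objects))
# -> [('10', 'março', '2022')]
print(bool(is_only_number("1.234")))  # True
print(bool(is_only_number("Total")))  # False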
9 changes: 1 addition & 8 deletions covid19br/parsers/tocantins.py
@@ -8,19 +8,12 @@
from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.common.demographic_utils import DemographicUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp

REGEXP_DAY_MONTH = re.compile("([0-9]+) de (.+)$")
REGEXP_YEAR = re.compile("^de ([0-9]{4})$")


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def parse_int(value):
return int(value.replace(".", ""))

53 changes: 52 additions & 1 deletion covid19br/spiders/spider_ac.py
@@ -80,7 +80,58 @@ def parse_news_bulletin(self, response, date):
)

def parse_pdf_bulletin(self, response, date):
print(f"Let's que let's {response.request.url}")
source = response.request.url
with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as tmp:
tmp.write(response.body)

extractor = AcreBulletinExtractor(tmp.name)

pdf_date = extractor.date
if pdf_date and pdf_date != date:
self.logger.warning(
f"PDF date does not match for pdf {source}. Aborting extraction."
)
return

pdf_official_total = extractor.official_total
if pdf_official_total:
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=pdf_official_total["confirmados"],
deaths=pdf_official_total["mortes"],
source=response.request.url + " | Painel na primeira pag. do pdf.",
)
self.add_new_bulletin_to_report(bulletin, date)

pdf_data = list(extractor.data)
if not pdf_data:
if "parcial" not in source.lower():
self.logger.error(
f"Couldn't extract data from pdf that is not parcial. Pdf source: {source}."
)
return

for row in pdf_data:
if row["municipio"].lower() == "total":
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url
+ " | Tabela com dados dos municípios do pdf.",
)
else:
bulletin = CountyBulletinModel(
date=date,
state=self.state,
city=row["municipio"],
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url,
)
self.add_new_bulletin_to_report(bulletin, date)

def _extract_cases_and_deaths_from_news(self, response, date):
body_text = " ".join(
