[AC Spider][Feat] pdf parser
dehatanes committed Mar 13, 2022
1 parent 418e0ea commit 8d1bdaa
Showing 4 changed files with 164 additions and 9 deletions.
98 changes: 98 additions & 0 deletions covid19br/parsers/acre.py
@@ -0,0 +1,98 @@
import re
from rows.plugins import pdf

from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number

REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$")
CITY_NAME_TABLE_COLUMN = 0
CONFIRMED_CASES_TABLE_COLUMN = 2
DEATH_CASES_TABLE_COLUMN = 4


def parse_int(value):
return int(value.replace(".", ""))


class AcreBulletinExtractor:
state = State.AC

def __init__(self, filename):
self.doc = pdf.PyMuPDFBackend(filename)

@property
def date(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("BOLETIM(.+)"),
ends_before=re.compile("SITUAÇÃO ATUAL(.+)"),
)
)
date_obj, *_ = match_object_from_regexp(REGEXP_DATE, first_page_objects) or [
None
]
if date_obj:
return NormalizationUtils.extract_in_full_date(" ".join(date_obj))

@property
def official_total(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("SITUAÇÃO ATUAL(.+)"),
ends_before=re.compile("DISTRIBUIÇÃO DOS CASOS(.+)"),
)
)

        # Unfortunately, the text labels are images, which makes it difficult to get the numbers based on them.
        # So we infer which values we need based on their positions (sometimes there are "ghost objects"
        # on the page, but they are on the far left and won't interfere with this logic).
remaining_number_objs = [
obj for obj in first_page_objects if is_only_number(obj.text)
]
        # We start by ordering the objects and dropping the last 2 on the right (the small numbers on the bulletin)
ordered_by_x_axis = sorted(remaining_number_objs, key=lambda obj: obj.x0)
remaining_number_objs = ordered_by_x_axis[:-2]
        # Of the 3 numbers on the far right, the death count is the one closest to the bottom
*_, death_cases_obj = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)
remaining_number_objs = remaining_number_objs[:-3]
        # Of the 3 rightmost remaining numbers (the middle column), the confirmed cases count is the one in the middle
_, confirmed_cases_obj, _ = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)

return {"confirmados": confirmed_cases_obj.text, "mortes": death_cases_obj.text}

@property
def data(self):
table_page_number = self._get_table_page_number()
if not table_page_number:
return None
page_objs = next(self.doc.text_objects(
starts_after=re.compile(".+DISTRIBUIÇÃO DOS CASOS CONFIRMADOS.+"),
ends_before=re.compile("Fonte:.+"),
page_numbers=(table_page_number,),
))

# remove headers
city_column_header = next(obj for obj in page_objs if "munic" in obj.text.lower())
table_objs = [obj for obj in page_objs if obj.y0 > city_column_header.y1]

lines = pdf.group_objects("y", table_objs, check_group=pdf.object_contains_center)
for line in lines:
city = line[CITY_NAME_TABLE_COLUMN].text.strip()
deaths = line[DEATH_CASES_TABLE_COLUMN].text.strip()
confirmed = line[CONFIRMED_CASES_TABLE_COLUMN].text.strip()
yield {
"municipio": city,
"confirmados": confirmed,
"mortes": deaths,
}

def _get_table_page_number(self):
for page_number, page_objs in enumerate(self.doc.text_objects(), start=1):
for obj in page_objs:
if "TABELA" in obj.text and "DISTRIBUIÇÃO DOS CASOS CONFIRMADOS" in obj.text:
return page_number
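
For reference, a minimal usage sketch of the extractor added above, assuming a locally downloaded bulletin (the file name below is hypothetical):

from covid19br.parsers.acre import AcreBulletinExtractor

extractor = AcreBulletinExtractor("boletim_ac.pdf")  # hypothetical local PDF path
print(extractor.date)             # date parsed from the "<dia> de <mês> de <ano>" heading
print(extractor.official_total)   # {"confirmados": "...", "mortes": "..."} from the first-page panel
for row in extractor.data or []:  # one dict per municipality; None if the table page is missing
    print(row["municipio"], row["confirmados"], row["mortes"])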
13 changes: 13 additions & 0 deletions covid19br/parsers/extractor_utils.py
@@ -0,0 +1,13 @@
import re


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def is_only_number(value):
return re.compile("^([0-9.,]+)$").findall(value.strip())
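
A quick sketch of how these helpers behave; SimpleNamespace is used only as a stand-in for a PDF text object with a .text attribute and is not part of the codebase:

import re
from types import SimpleNamespace
from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number

objects = [SimpleNamespace(text="RIO BRANCO"), SimpleNamespace(text="10 de março de 2022")]
# Returns the findall result of the first object whose text matches the regexp
print(match_object_from_regexp(re.compile("([0-9]+) de (.+) de ([0-9]{4})$"), objects))
# -> [('10', 'março', '2022')]
print(bool(is_only_number("1.234")))  # True
print(bool(is_only_number("Total")))  # False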
9 changes: 1 addition & 8 deletions covid19br/parsers/tocantins.py
@@ -8,19 +8,12 @@
from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.common.demographic_utils import DemographicUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp

REGEXP_DAY_MONTH = re.compile("([0-9]+) de (.+)$")
REGEXP_YEAR = re.compile("^de ([0-9]{4})$")


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def parse_int(value):
return int(value.replace(".", ""))

53 changes: 52 additions & 1 deletion covid19br/spiders/spider_ac.py
@@ -80,7 +80,58 @@ def parse_news_bulletin(self, response, date):
)

def parse_pdf_bulletin(self, response, date):
print(f"Let's que let's {response.request.url}")
source = response.request.url
with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as tmp:
tmp.write(response.body)

extractor = AcreBulletinExtractor(tmp.name)

pdf_date = extractor.date
if pdf_date and pdf_date != date:
self.logger.warning(
f"PDF date does not match for pdf {source}. Aborting extraction."
)
return

pdf_official_total = extractor.official_total
if pdf_official_total:
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=pdf_official_total["confirmados"],
deaths=pdf_official_total["mortes"],
source=response.request.url + " | Painel na primeira pag. do pdf.",
)
self.add_new_bulletin_to_report(bulletin, date)

pdf_data = list(extractor.data)
if not pdf_data:
if "parcial" not in source.lower():
self.logger.error(
f"Couldn't extract data from pdf that is not parcial. Pdf source: {source}."
)
return

for row in pdf_data:
if row["municipio"].lower() == "total":
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url
+ " | Tabela com dados dos municípios do pdf.",
)
else:
bulletin = CountyBulletinModel(
date=date,
state=self.state,
city=row["municipio"],
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url,
)
self.add_new_bulletin_to_report(bulletin, date)

def _extract_cases_and_deaths_from_news(self, response, date):
body_text = " ".join(
