Skip to content

Commit

Permalink
Merge pull request #423 from flairNLP/reqork-extraction-filter-for-bo…
Browse files Browse the repository at this point in the history
…olean-values

Rework `ExtractionFilter` to adapt to boolean values
  • Loading branch information
MaxDall authored Apr 20, 2024
2 parents 2c369dd + 1461bae commit f9816f6
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 19 deletions.
2 changes: 1 addition & 1 deletion scripts/generate_parser_test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional
return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None)

crawler = Crawler(enum)
return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None)
return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=RequiresAll()), None)


def parse_arguments() -> Namespace:
Expand Down
12 changes: 3 additions & 9 deletions scripts/publisher_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,12 @@
import sys
import traceback
from enum import EnumMeta
from typing import Any, Dict, List, Optional, cast
from typing import List, Optional, cast

from fundus import Crawler, NewsMap, PublisherCollection, RSSFeed
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article


class RequiresAllSkipBoolean:
"""Custom filter skipping boolean values"""

def __call__(self, extraction: Dict[str, Any]) -> bool:
return not all([bool(value) for value in extraction.values() if not isinstance(value, bool)])
from fundus.scraping.filter import RequiresAll


def main() -> None:
Expand All @@ -44,7 +38,7 @@ def main() -> None:

crawler: Crawler = Crawler(publisher, restrict_sources_to=[NewsMap, RSSFeed])
complete_article: Optional[Article] = next(
crawler.crawl(max_articles=1, only_complete=RequiresAllSkipBoolean(), error_handling="catch"), None
crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None
)

if complete_article is None:
Expand Down
43 changes: 34 additions & 9 deletions src/fundus/scraping/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import Any, Callable, Dict, Protocol
from typing import Any, Callable, Dict, Protocol, cast

from typing_extensions import ParamSpec

Expand Down Expand Up @@ -119,33 +119,58 @@ def __bool__(self) -> bool:
return bool(self.missing_attributes)


def _guarded_bool(value: Any):
if isinstance(value, bool):
return True
else:
return bool(value)


class Requires:
def __init__(self, *required_attributes: str) -> None:
def __init__(self, *required_attributes: str, eval_booleans: bool = True) -> None:
"""Class to filter extractions based on attribute values
If a required_attribute is not present in the extracted data, this filter won't
be passed.
If a required_attribute is not present in the extracted data or evaluates to bool() -> False,
this filter won't be passed. By default, required boolean attributes are evaluated with bool().
I.e.,
Requires("free_access")({"free_access": False}) -> will be filtered out
You can alter this behaviour by setting `eval_booleans=False`
I.e.,
Requires("free_access", eval_booleans=False)({"free_access": False}) -> will pass
Args:
*required_attributes: Attributes required to evaluate to True in order to
pass the filter. If no attributes are given, all attributes will be evaluated:
pass the filter. If no attributes are given, all attributes will be evaluated
eval_booleans: If True the boolean values will also be evaluated with bool(<value>).
If False, all boolean values evaluate to True. Defaults to True.
"""
self.required_attributes = set(required_attributes)
# somehow mypy does not recognize bool as callable :(
self._eval: Callable[[Any], bool] = bool if eval_booleans else _guarded_bool # type: ignore[assignment]

def __call__(self, extraction: Dict[str, Any]) -> FilterResultWithMissingAttributes:
missing_attributes = [
attribute
for attribute in self.required_attributes or extraction.keys()
if not bool(value := extraction.get(attribute)) or isinstance(value, Exception)
if not self._eval(value := extraction.get(attribute)) or isinstance(value, Exception)
]
return FilterResultWithMissingAttributes(*missing_attributes)


class RequiresAll(Requires):
def __init__(self):
def __init__(self, eval_booleans: bool = False) -> None:
"""Name wrap for Requires()
This is for readability only. It requires all attributes of the extraction to evaluate to True.
This is for readability only. By default, it requires all non-boolean attributes of the extraction
to evaluate to True. Set `eval_booleans=True` to alter this behaviour.
See class:Requires docstring for more information.
Args:
eval_booleans: See Requires docstring for more information. Defaults to False.
"""
super().__init__()
super().__init__(eval_booleans=eval_booleans)
41 changes: 41 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from fundus import Requires
from fundus.scraping.filter import RequiresAll


# Tests for the extraction filters Requires and RequiresAll.
# The result of a filter call is truthy when attributes are missing,
# so `assert not <filter>(extraction)` means the extraction passed.
class TestExtractionFilter:
# Requires evaluates booleans with bool() unless eval_booleans=False is given.
def test_requires(self):
extraction = {"a": "Some Stuff", "b": [], "c": True}

# "a" is a non-empty string -> passes
assert not Requires("a")(extraction)

# "b" is an empty list -> reported as missing
assert (result := Requires("a", "b")(extraction))

assert result.missing_attributes == ("b",)

# "c" is True -> passes
assert not Requires("c")(extraction)

extraction = {"a": "Some Stuff", "b": [], "c": False}

# with "c" set to False, both "b" and "c" are reported as missing
assert (result := Requires("a", "b", "c")(extraction))

# sorted on both sides: the order of missing_attributes is not relied upon
assert sorted(result.missing_attributes) == sorted(("b", "c"))

# with eval_booleans=False a False value passes
assert not Requires("c", eval_booleans=False)(extraction)

# RequiresAll is called without eval_booleans first, then with eval_booleans=True.
def test_requires_all(self):
extraction = {"a": "Some Stuff", "b": [], "c": False}

# only the empty list "b" is reported; the False value of "c" is not
assert (result := RequiresAll()(extraction))
assert result.missing_attributes == ("b",)

extraction = {"a": "Some Stuff", "c": False}
assert not RequiresAll()(extraction)

# test eval_booleans=True
extraction = {"a": "Some Stuff", "b": [], "c": False}

# now the False value of "c" is reported as missing as well
assert (result := RequiresAll(eval_booleans=True)(extraction))
assert sorted(result.missing_attributes) == sorted(("b", "c"))

extraction = {"a": "Some Stuff", "c": True}
assert not RequiresAll(eval_booleans=True)(extraction)

0 comments on commit f9816f6

Please sign in to comment.