Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework ExtractionFilter to adapt to boolean values #423

Merged
merged 7 commits into from
Apr 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/generate_parser_test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional
return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None)

crawler = Crawler(enum)
return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None)
return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=RequiresAll()), None)


def parse_arguments() -> Namespace:
Expand Down
12 changes: 3 additions & 9 deletions scripts/publisher_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,12 @@
import sys
import traceback
from enum import EnumMeta
from typing import Any, Dict, List, Optional, cast
from typing import List, Optional, cast

from fundus import Crawler, NewsMap, PublisherCollection, RSSFeed
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article


class RequiresAllSkipBoolean:
    """Custom filter skipping boolean values.

    Boolean values in the extraction are ignored entirely; every other value
    is checked for truthiness.
    """

    def __call__(self, extraction: Dict[str, Any]) -> bool:
        """Report whether any non-boolean value of ``extraction`` is falsy.

        Args:
            extraction: Mapping of attribute names to extracted values.

        Returns:
            True if at least one non-boolean value is falsy, False otherwise
            (including for an empty extraction or one holding only booleans).
            NOTE(review): presumably a True result means "filter this
            extraction out", matching the other extraction filters -- confirm
            against the caller.
        """
        # A generator lets any() stop at the first falsy value instead of
        # materialising a full list first.
        return any(not value for value in extraction.values() if not isinstance(value, bool))
from fundus.scraping.filter import RequiresAll


def main() -> None:
Expand All @@ -44,7 +38,7 @@ def main() -> None:

crawler: Crawler = Crawler(publisher, restrict_sources_to=[NewsMap, RSSFeed])
complete_article: Optional[Article] = next(
crawler.crawl(max_articles=1, only_complete=RequiresAllSkipBoolean(), error_handling="catch"), None
crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None
)

if complete_article is None:
Expand Down
43 changes: 34 additions & 9 deletions src/fundus/scraping/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import Any, Callable, Dict, Protocol
from typing import Any, Callable, Dict, Protocol, cast

from typing_extensions import ParamSpec

Expand Down Expand Up @@ -119,33 +119,58 @@ def __bool__(self) -> bool:
return bool(self.missing_attributes)


def _guarded_bool(value: Any):
if isinstance(value, bool):
return True
else:
return bool(value)


class Requires:
def __init__(self, *required_attributes: str) -> None:
def __init__(self, *required_attributes: str, eval_booleans: bool = True) -> None:
"""Class to filter extractions based on attribute values

If a required_attribute is not present in the extracted data, this filter won't
be passed.
If a required_attribute is not present in the extracted data or evaluates to bool() -> False,
this filter won't be passed. By default, required boolean attributes are evaluated with bool().

I.e.,

Requires("free_access")({"free_access": False}) -> will be filtered out

You can alter this behaviour by setting `eval_booleans=False`

I.e.,

Requires("free_access", eval_booleans=False)({"free_access": False}) -> will pass

Args:
*required_attributes: Attributes required to evaluate to True in order to
pass the filter. If no attributes are given, all attributes will be evaluated:
pass the filter. If no attributes are given, all attributes will be evaluated
eval_booleans: If True the boolean values will also be evaluated with bool(<value>).
If False, all boolean values evaluate to True. Defaults to True.
"""
self.required_attributes = set(required_attributes)
# somehow mypy does not recognize bool as callable :(
self._eval: Callable[[Any], bool] = bool if eval_booleans else _guarded_bool # type: ignore[assignment]

def __call__(self, extraction: Dict[str, Any]) -> FilterResultWithMissingAttributes:
missing_attributes = [
attribute
for attribute in self.required_attributes or extraction.keys()
if not bool(value := extraction.get(attribute)) or isinstance(value, Exception)
if not self._eval(value := extraction.get(attribute)) or isinstance(value, Exception)
]
return FilterResultWithMissingAttributes(*missing_attributes)


class RequiresAll(Requires):
def __init__(self):
def __init__(self, eval_booleans: bool = False) -> None:
"""Name wrap for Requires()

This is for readability only. It requires all attributes of the extraction to evaluate to True.
This is for readability only. By default, it requires all non-boolean attributes of the extraction
to evaluate to True. Set `eval_booleans=True` to alter this behaviour.
See class:Requires docstring for more information.

Args:
eval_booleans: See Requires docstring for more information. Defaults to False.
"""
super().__init__()
super().__init__(eval_booleans=eval_booleans)
41 changes: 41 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from fundus import Requires
from fundus.scraping.filter import RequiresAll


class TestExtractionFilter:
    """Checks for the `Requires` and `RequiresAll` extraction filters.

    A falsy filter result means the extraction passed; a truthy result exposes
    the names of the failing attributes via `missing_attributes`.
    """

    def test_requires(self):
        # "a" is truthy, "b" is an empty list, "c" is the boolean True
        sample = {"a": "Some Stuff", "b": [], "c": True}

        assert not Requires("a")(sample)

        outcome = Requires("a", "b")(sample)
        assert outcome
        assert outcome.missing_attributes == ("b",)

        assert not Requires("c")(sample)

        # same data, but the boolean attribute is now False
        sample = {"a": "Some Stuff", "b": [], "c": False}

        outcome = Requires("a", "b", "c")(sample)
        assert outcome
        assert sorted(outcome.missing_attributes) == ["b", "c"]

        # with eval_booleans=False a False boolean is no longer reported as missing
        assert not Requires("c", eval_booleans=False)(sample)

    def test_requires_all(self):
        # default behaviour: boolean values are not evaluated
        sample = {"a": "Some Stuff", "b": [], "c": False}

        outcome = RequiresAll()(sample)
        assert outcome
        assert outcome.missing_attributes == ("b",)

        sample = {"a": "Some Stuff", "c": False}
        assert not RequiresAll()(sample)

        # test eval_booleans=True
        sample = {"a": "Some Stuff", "b": [], "c": False}

        outcome = RequiresAll(eval_booleans=True)(sample)
        assert outcome
        assert sorted(outcome.missing_attributes) == ["b", "c"]

        sample = {"a": "Some Stuff", "c": True}
        assert not RequiresAll(eval_booleans=True)(sample)