diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index 6f131e82f..727c71536 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -23,7 +23,7 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None) crawler = Crawler(enum) - return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) + return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=RequiresAll()), None) def parse_arguments() -> Namespace: diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py index ae0ff9f81..6cc561fe3 100644 --- a/scripts/publisher_coverage.py +++ b/scripts/publisher_coverage.py @@ -7,18 +7,12 @@ import sys import traceback from enum import EnumMeta -from typing import Any, Dict, List, Optional, cast +from typing import List, Optional, cast from fundus import Crawler, NewsMap, PublisherCollection, RSSFeed from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article - - -class RequiresAllSkipBoolean: - """Custom filter skipping boolean values""" - - def __call__(self, extraction: Dict[str, Any]) -> bool: - return not all([bool(value) for value in extraction.values() if not isinstance(value, bool)]) +from fundus.scraping.filter import RequiresAll def main() -> None: @@ -44,7 +38,7 @@ def main() -> None: crawler: Crawler = Crawler(publisher, restrict_sources_to=[NewsMap, RSSFeed]) complete_article: Optional[Article] = next( - crawler.crawl(max_articles=1, only_complete=RequiresAllSkipBoolean(), error_handling="catch"), None + crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None ) if complete_article is None: diff --git a/src/fundus/scraping/filter.py b/src/fundus/scraping/filter.py index ef81382b8..75b1cb98f 100644 --- 
a/src/fundus/scraping/filter.py +++ b/src/fundus/scraping/filter.py @@ -1,5 +1,5 @@ import re -from typing import Any, Callable, Dict, Protocol +from typing import Any, Callable, Dict, Protocol, cast from typing_extensions import ParamSpec @@ -119,33 +119,58 @@ def __bool__(self) -> bool: return bool(self.missing_attributes) +def _guarded_bool(value: Any): + if isinstance(value, bool): + return True + else: + return bool(value) + + class Requires: - def __init__(self, *required_attributes: str) -> None: + def __init__(self, *required_attributes: str, eval_booleans: bool = True) -> None: """Class to filter extractions based on attribute values - If a required_attribute is not present in the extracted data, this filter won't - be passed. + If a required_attribute is not present in the extracted data or evaluates to bool() -> False, + this filter won't be passed. By default, required boolean attributes are evaluated with bool(). + + E.g., + + Requires("free_access")({"free_access": False}) -> will be filtered out + + You can alter this behaviour by setting `eval_booleans=False` + + E.g., + + Requires("free_access", eval_booleans=False)({"free_access": False}) -> will pass Args: *required_attributes: Attributes required to evaluate to True in order to - pass the filter. If no attributes are given, all attributes will be evaluated: + pass the filter. If no attributes are given, all attributes will be evaluated + eval_booleans: If True the boolean values will also be evaluated with bool(). + If False, all boolean values evaluate to True. Defaults to True. 
""" self.required_attributes = set(required_attributes) + # somehow mypy does not recognize bool as callable :( + self._eval: Callable[[Any], bool] = bool if eval_booleans else _guarded_bool # type: ignore[assignment] def __call__(self, extraction: Dict[str, Any]) -> FilterResultWithMissingAttributes: missing_attributes = [ attribute for attribute in self.required_attributes or extraction.keys() - if not bool(value := extraction.get(attribute)) or isinstance(value, Exception) + if not self._eval(value := extraction.get(attribute)) or isinstance(value, Exception) ] return FilterResultWithMissingAttributes(*missing_attributes) class RequiresAll(Requires): - def __init__(self): + def __init__(self, eval_booleans: bool = False) -> None: """Name wrap for Requires() - This is for readability only. It requires all attributes of the extraction to evaluate to True. + This is for readability only. By default, it requires all non-boolean attributes of the extraction + to evaluate to True. Set `eval_booleans=True` to alter this behaviour. See class:Requires docstring for more information. + + Args: + eval_booleans: See Requires docstring for more information. Defaults to False. 
""" - super().__init__() + super().__init__(eval_booleans=eval_booleans) diff --git a/tests/test_filter.py b/tests/test_filter.py new file mode 100644 index 000000000..a9e727993 --- /dev/null +++ b/tests/test_filter.py @@ -0,0 +1,41 @@ +from fundus import Requires +from fundus.scraping.filter import RequiresAll + + +class TestExtractionFilter: + def test_requires(self): + extraction = {"a": "Some Stuff", "b": [], "c": True} + + assert not Requires("a")(extraction) + + assert (result := Requires("a", "b")(extraction)) + + assert result.missing_attributes == ("b",) + + assert not Requires("c")(extraction) + + extraction = {"a": "Some Stuff", "b": [], "c": False} + + assert (result := Requires("a", "b", "c")(extraction)) + + assert sorted(result.missing_attributes) == sorted(("b", "c")) + + assert not Requires("c", eval_booleans=False)(extraction) + + def test_requires_all(self): + extraction = {"a": "Some Stuff", "b": [], "c": False} + + assert (result := RequiresAll()(extraction)) + assert result.missing_attributes == ("b",) + + extraction = {"a": "Some Stuff", "c": False} + assert not RequiresAll()(extraction) + + # test eval_booleans=True + extraction = {"a": "Some Stuff", "b": [], "c": False} + + assert (result := RequiresAll(eval_booleans=True)(extraction)) + assert sorted(result.missing_attributes) == sorted(("b", "c")) + + extraction = {"a": "Some Stuff", "c": True} + assert not RequiresAll(eval_booleans=True)(extraction)