From bab01b3a28d807abcddf216138ea37b009c244ea Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Mon, 21 Oct 2024 19:03:47 +0200 Subject: [PATCH 1/4] handle author dict bug --- src/fundus/publishers/de/braunschweiger_zeitung.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index 44864bf0..ff436bd8 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -1,6 +1,6 @@ import datetime import re -from typing import List, Optional, Pattern +from typing import List, Optional, Pattern, Union, Dict, Any from lxml.etree import XPath @@ -57,8 +57,11 @@ def topics(self) -> List[str]: @attribute def authors(self) -> List[str]: authors = [] - for author in self.precomputed.ld.bf_search("author", default=[]): - name_string = author.get("name") + author_selection: Union[List[Dict[str, Any]], Dict[str, Any]] + if isinstance(author_selection := self.precomputed.ld.bf_search("author", default=[]), dict): + author_selection = [author_selection] + for author in author_selection: + name_string: str = author.get("name", "") authors.extend(re.split(r"und|,", name_string)) return apply_substitution_pattern_over_list( [author.strip() for author in authors], self._author_substitution_pattern From cb6ec3ab6b48ba219d05701cd8d1eccb5f4afa03 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Mon, 21 Oct 2024 19:18:55 +0200 Subject: [PATCH 2/4] extend substitution pattern --- src/fundus/publishers/de/braunschweiger_zeitung.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index ff436bd8..4fdad556 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -1,6 +1,6 @@ import datetime import re -from typing import List, Optional, Pattern, Union, Dict, Any +from typing import Any, Dict, List, Optional, Pattern, Union from lxml.etree import XPath @@ -15,7 +15,7 @@ class BSZParser(ParserProxy): class V1(BaseParser): - _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe") + _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe|.*dpa(:|-infocom).*|^red$") _paragraph_selector = XPath( "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'] or contains(@class, 'font-sans'))]" ) From bfc26d3a39cb5074bc3844980982f6f41dea7244 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Tue, 22 Oct 2024 18:07:14 +0200 Subject: [PATCH 3/4] use author utility function --- src/fundus/publishers/de/braunschweiger_zeitung.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index 4fdad556..7f66f372 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -8,6 +8,7 @@ from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, + generic_author_parsing, generic_date_parsing, generic_topic_parsing, ) @@ -56,16 +57,7 @@ def topics(self) -> List[str]: @attribute def authors(self) -> List[str]: - authors = [] - author_selection: Union[List[Dict[str, Any]], Dict[str, Any]] - if isinstance(author_selection := self.precomputed.ld.bf_search("author", default=[]), dict): - author_selection = [author_selection] - for author in author_selection: - name_string: str = author.get("name", "") - authors.extend(re.split(r"und|,", name_string)) - return apply_substitution_pattern_over_list( - [author.strip() for author in authors], self._author_substitution_pattern - ) + return generic_author_parsing(self.precomputed.ld.bf_search("author", default=[])) @attribute def publishing_date(self) -> Optional[datetime.datetime]: From 9a997d146cdbac1ddabd0c61168198d2bcb0e5cb Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 22 Oct 2024 20:09:21 +0200 Subject: [PATCH 4/4] add substitution pattern --- src/fundus/publishers/de/braunschweiger_zeitung.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index 7f66f372..7d148b8f 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -1,6 +1,6 @@ import datetime import re -from typing import Any, Dict, List, Optional, Pattern, Union +from typing import List, Optional, Pattern from lxml.etree import XPath @@ -57,7 +57,9 @@ def topics(self) -> List[str]: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.bf_search("author", default=[])) + return apply_substitution_pattern_over_list( + generic_author_parsing(self.precomputed.ld.bf_search("author")), self._author_substitution_pattern + ) @attribute def publishing_date(self) -> Optional[datetime.datetime]: