Skip to content

Commit

Permalink
Merge pull request #641 from flairNLP/handle-bsz-bug
Browse files (browse the repository at this point in the history)
Handle author dict Bug
  • Loading branch information
MaxDall authored Oct 22, 2024
2 parents 05cc97d + 9a997d1 commit 346aae8
Showing 1 changed file with 3 additions and 6 deletions.
9 changes: 3 additions & 6 deletions src/fundus/publishers/de/braunschweiger_zeitung.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
+ generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class BSZParser(ParserProxy):
class V1(BaseParser):
- _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe")
+ _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe|.*dpa(:|-infocom).*|^red$")
_paragraph_selector = XPath(
"//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'] or contains(@class, 'font-sans'))]"
)
Expand Down Expand Up @@ -56,12 +57,8 @@ def topics(self) -> List[str]:

@attribute
def authors(self) -> List[str]:
- authors = []
- for author in self.precomputed.ld.bf_search("author", default=[]):
- name_string = author.get("name")
- authors.extend(re.split(r"und|,", name_string))
return apply_substitution_pattern_over_list(
- [author.strip() for author in authors], self._author_substitution_pattern
+ generic_author_parsing(self.precomputed.ld.bf_search("author")), self._author_substitution_pattern
)

@attribute
Expand Down

0 comments on commit 346aae8

Please sign in to comment.