Skip to content

Commit

Permalink
filter topics
Browse files: browse the repository at this point in the history
  • Loading branch information
MaxDall committed Jan 20, 2025
1 parent 57676a3 commit bcc79b8
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/fundus/publishers/jp/mainichi_shimbun.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
Expand All @@ -20,6 +21,8 @@ class MainichiShimbunParser(ParserProxy):
class V1(BaseParser):
_paragraph_selector = CSSSelector("#articledetail-body > p")

_topic_bloat_pattern = re.compile("速報")

@attribute
def body(self) -> Optional[ArticleBody]:
return extract_article_body_with_selector(
Expand All @@ -43,7 +46,10 @@ def authors(self) -> List[str]:

@attribute
def topics(self) -> List[str]:
return generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=[",", "・"])
return apply_substitution_pattern_over_list(
generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=[",", "・"]),
self._topic_bloat_pattern,
)

@attribute
def images(self) -> List[Image]:
Expand Down
1 change: 0 additions & 1 deletion tests/resources/parser/test_data/jp/MainichiShimbun.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
"title": "イスラエルとハマスの停戦「最も合意に近い」 最終案に双方同意か",
"topics": [
"国際",
"速報",
"中東",
"緊迫する中東情勢",
"松岡大地",
Expand Down

0 comments on commit bcc79b8

Please sign in to comment.