Skip to content

Commit

Permalink
filter topics and apply author regex for image extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxDall committed Jan 14, 2025
1 parent d1bd008 commit c408544
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 7 deletions.
9 changes: 8 additions & 1 deletion src/fundus/publishers/jp/asahi_shimbun.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
Expand All @@ -19,6 +21,8 @@ class V1(BaseParser):
# CSS selector for <p> elements that are direct children of div.nfyQp;
# passed to the extraction helpers as the paragraph selector.
_paragraph_selector = CSSSelector("div.nfyQp > p")
# CSS selector for <h2> elements that are direct children of div.nfyQp.
_subtitle_selector = CSSSelector("div.nfyQp > h2")

# Keywords treated as bloat when building `topics`: "Asahi Shimbun Digital",
# "Asahi Shimbun", "news", "newspaper" and "other / topics".
# The longer alternatives are listed first on purpose: regex alternation is
# tried left to right, so "朝日新聞デジタル" must precede "朝日新聞", which in turn
# must precede "新聞", otherwise only the shorter prefix/suffix would match.
# NOTE(review): the pattern is unanchored, so it can also match inside a longer
# keyword — confirm against apply_substitution_pattern_over_list that this is intended.
topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題")

@attribute
def body(self) -> Optional[ArticleBody]:
return extract_article_body_with_selector(
Expand All @@ -42,12 +46,15 @@ def authors(self) -> List[str]:

@attribute
def topics(self) -> List[str]:
    """Return the article's topics with publisher boilerplate filtered out.

    The raw topics are parsed from the ``keywords`` entry of the precomputed
    meta data by ``generic_topic_parsing``. ``topic_bloat_pattern`` is then
    applied to the parsed list through ``apply_substitution_pattern_over_list``.

    Returns:
        The filtered list of topic strings.
    """
    raw_topics = generic_topic_parsing(self.precomputed.meta.get("keywords"))
    # Return only the filtered list: keywords matched by ``topic_bloat_pattern``
    # (publisher name, "news", ...) are not wanted in the result, so the
    # unfiltered list must never be returned directly.
    return apply_substitution_pattern_over_list(raw_topics, self.topic_bloat_pattern)

@attribute
def images(self) -> List[Image]:
    """Extract the article's images using ``image_extraction``.

    The helper receives the precomputed document, the class-level paragraph
    selector and a regular expression used as ``author_selector``.

    Returns:
        The list of ``Image`` objects produced by ``image_extraction``.
    """
    # Captures, as the named group ``credits``, the text between the last
    # "、" separator and the word "撮影" ("photographed"), e.g. the
    # photographer's name in a caption ending in "、<name>撮影".
    credit_pattern = re.compile(r"、(?P<credits>[^、]*?)撮影")
    document = self.precomputed.doc
    return image_extraction(
        doc=document,
        paragraph_selector=self._paragraph_selector,
        author_selector=credit_pattern,
        relative_urls=True,
    )
8 changes: 2 additions & 6 deletions tests/resources/parser/test_data/jp/AsahiShimbun.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,11 @@
}
],
"publishing_date": "2025-01-13 21:37:00+09:00",
"title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁:朝日新聞デジタル",
"title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁",
"topics": [
"朝日新聞デジタル",
"社会",
"災害・気象",
"宮崎県",
"朝日新聞",
"ニュース",
"新聞"
"宮崎県"
]
}
}

0 comments on commit c408544

Please sign in to comment.