Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MainichiShimbun #685

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>MainichiShimbun</code>
</td>
<td>
<div>Mainichi Shimbun</div>
</td>
<td>
<a href="https://mainichi.jp/">
<span>mainichi.jp</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TheJapanNews</code>
Expand Down
13 changes: 11 additions & 2 deletions src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,11 +391,20 @@ def generic_text_extraction_with_css(doc, selector: XPath) -> Optional[str]:
return strip_nodes_to_text(nodes)


def generic_topic_parsing(keywords: Optional[Union[str, List[str]]], delimiter: str = ",") -> List[str]:
def generic_topic_parsing(
keywords: Optional[Union[str, List[str]]], delimiter: Union[str, List[str]] = ","
) -> List[str]:
if isinstance(delimiter, str):
delimiter = [delimiter]

if not keywords:
topics = []
elif isinstance(keywords, str):
topics = [cleaned for keyword in keywords.split(delimiter) if (cleaned := keyword.strip())]
topics = [
cleaned
for keyword in re.split(pattern=f"[{''.join(delimiter)}]", string=keywords)
if (cleaned := keyword.strip())
]
elif isinstance(keywords, list) and all(isinstance(s, str) for s in keywords):
topics = keywords
else:
Expand Down
12 changes: 11 additions & 1 deletion src/fundus/publishers/jp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
from fundus.publishers.jp.mainichi_shimbun import MainichiShimbunParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.tokyo_chunichi_shimbun import TokyoChunichiShimbunParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
from fundus.scraping.url import NewsMap, Sitemap
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap


class JP(metaclass=PublisherGroup):
Expand Down Expand Up @@ -51,3 +52,12 @@ class JP(metaclass=PublisherGroup):
parser=TokyoChunichiShimbunParser,
sources=[NewsMap("https://www.chunichi.co.jp/sitemap.xml")],
)

MainichiShimbun = Publisher(
name="Mainichi Shimbun",
domain="https://mainichi.jp/",
parser=MainichiShimbunParser,
sources=[
RSSFeed("https://mainichi.jp/rss/etc/mainichi-flash.rss"),
],
)
66 changes: 66 additions & 0 deletions src/fundus/publishers/jp/mainichi_shimbun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
normalize_whitespace,
)


class MainichiShimbunParser(ParserProxy):
    """Parser proxy for articles published by Mainichi Shimbun (mainichi.jp)."""

    class V1(BaseParser):
        """Attribute extraction rules for Mainichi Shimbun article pages."""

        # Paragraphs and sub-headlines are selected as direct children of the
        # element with id "articledetail-body".
        _paragraph_selector = CSSSelector("#articledetail-body > p")
        _subheadline_selector = CSSSelector("#articledetail-body > h2")

        # "速報" ("breaking news") is handed to apply_substitution_pattern_over_list
        # in `topics` to clean up the keyword list.
        _topic_bloat_pattern = re.compile("速報")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the paragraph and sub-headline selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the whitespace-normalized ``title`` meta value, or ``None`` if absent."""
            raw_title = self.precomputed.meta.get("title")
            if raw_title is None:
                return None
            return normalize_whitespace(raw_title)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``datePublished`` entry found in the page's linked data."""
            published = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(published)

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors from the ``cXenseParse:author`` meta value."""
            raw_authors = self.precomputed.meta.get("cXenseParse:author")
            return generic_author_parsing(raw_authors)

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``keywords`` meta value into a list of topics.

            Keywords are split on both "," and "・"; the result is then passed
            through ``apply_substitution_pattern_over_list`` together with the
            "速報" pattern.
            """
            raw_keywords = self.precomputed.meta.get("keywords")
            split_topics = generic_topic_parsing(raw_keywords, delimiter=[",", "・"])
            return apply_substitution_pattern_over_list(split_topics, self._topic_bloat_pattern)

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images located below the ``#main`` element.

            ``<img>`` elements inside a ``<figure>`` are selected unless they sit
            within an ``<a>`` whose class contains ``articledetail-image-scale``.
            """
            # Matches the trailing credit: either the text after the last "、" or,
            # when the text ends in "撮影", the text after "=".
            # https://regex101.com/r/awU0Rq/1
            credit_pattern = re.compile(r"(、|=(?=.*?撮影$))(?P<credits>[^、]*?)(撮影)?\s*$")
            figure_images = XPath("//figure//img[not(ancestor::a[contains(@class,'articledetail-image-scale')])]")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=figure_images,
                upper_boundary_selector=CSSSelector("#main"),
                author_selector=credit_pattern,
                relative_urls=True,
            )
61 changes: 61 additions & 0 deletions tests/resources/parser/test_data/jp/MainichiShimbun.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"V1": {
"authors": [
"松岡大地"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"パレスチナ自治区ガザ地区のイスラム組織ハマスとイスラエルの停戦交渉を仲介しているカタールの外務省報道官は14日、「これまでで最も合意に近づいている」と語った。対立する双方ともに停戦合意の最終草案に同意しているとみられる。早期停戦を求めるトランプ次期米大統領の就任が20日に迫る中、停戦実現の期待が高まっている。",
"米ニュースサイト「アクシオス」によると、イスラエルと仲介国は最終草案に合意。ハマスも14日の声明で、指導部が交渉内容に満足していると明らかにし、カタールでの今回の交渉で「明確で包括的な合意」がまとまることへの期待を示した。",
"アクシオスが報じた合意案によると、「第1段階」で42日間の停戦を実施。イスラエル軍はガザとエジプトの境界などから徐々に撤退し、ハマスは女性や子どもなど33人の人質を解放する。第1段階の停戦中に、恒久的停戦やイスラエル軍の完全撤退を含む「第2段階」に向けた協議を始めるという。",
"停戦に向けた交渉が大詰めを迎える中、バイデン米大統領は13日、カタールのタミム首長と停戦合意に向けて電話協議を実施。ハマスもタミム氏やトルコの諜報(ちょうほう)機関トップと協議した。",
"こうした動きにネタニヤフ連立政権の一角を占める極右政党は反発。対パレスチナ強硬派のスモトリッチ財務相は、停戦合意を「大惨事」だとし、合意に賛成しない方針を示した。ネタニヤフ首相はベングビール国家治安相と会談し、政権維持へ協力を求めた。",
"イスラエル軍は13日もガザ地区への攻撃を続け、中東メディアによると、45人が死亡した。ガザ保健当局によると、2023年10月の戦闘開始以降のガザ側の死者は、4万6584人になった。",
"一方で、イスラエル軍は13日、ガザ北部の戦闘でイスラエル兵5人が死亡したと発表した。ガザでは約100人の人質が拘束されている。【エルサレム松岡大地、カイロ金子淳】"
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://cdn.mainichi.jp/vol1/2023/06/06/20230606k0000m030221000p/9.webp?1",
"query_width": null,
"size": null,
"type": "image/webp"
},
{
"url": "https://cdn.mainichi.jp/vol1/2023/06/06/20230606k0000m030221000p/9.jpg?1",
"query_width": null,
"size": {
"width": 800,
"height": 528
},
"type": "image/jpeg"
}
],
"is_cover": true,
"description": "イスラエルの国旗=同国で2019年5月",
"caption": "イスラエルの国旗=同国で2019年5月",
"authors": [],
"position": 891
}
],
"publishing_date": "2025-01-14 21:17:27+09:00",
"title": "イスラエルとハマスの停戦「最も合意に近い」 最終案に双方同意か",
"topics": [
"国際",
"中東",
"緊迫する中東情勢",
"松岡大地",
"イスラエル",
"カタール",
"パレスチナ"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/jp/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
"url": "https://www.chunichi.co.jp/article/1011185",
"crawl_date": "2025-01-13 18:10:25.145717"
},
"MainichiShimbun_2025_01_14.html.gz": {
"url": "https://mainichi.jp/articles/20250114/k00/00m/030/335000c",
"crawl_date": "2025-01-14 14:55:19.277555"
},
"TheJapanNews_2024_10_13.html.gz": {
"url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/",
"crawl_date": "2024-10-13 16:27:01.520980"
Expand Down
Loading