Skip to content

Commit

Permalink
Merge pull request #682 from flairNLP/add-asahi-shimbun
Browse files Browse the repository at this point in the history
Add `AsahiShimbun`
  • Loading branch information
MaxDall authored Jan 16, 2025
2 parents dd094f1 + c408544 commit 0853045
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1254,6 +1254,21 @@
</tr>
</thead>
<tbody>
<tr>
<td>
<code>AsahiShimbun</code>
</td>
<td>
<div>Asahi Shimbun</div>
</td>
<td>
<a href="https://www.asahi.com/">
<span>www.asahi.com</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TheJapanNews</code>
Expand Down
8 changes: 8 additions & 0 deletions src/fundus/publishers/jp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
Expand Down Expand Up @@ -28,3 +29,10 @@ class JP(metaclass=PublisherGroup):
NewsMap("https://www.yomiuri.co.jp/sitemap-news-latest.xml"),
],
)

# Publisher entry for Asahi Shimbun: pages are parsed with AsahiShimbunParser and
# the only configured source is the site's sitemap.xml, wrapped in a NewsMap.
# NOTE(review): sitemap.xml is registered as a NewsMap here — confirm it is a news
# sitemap rather than a regular one.
AsahiShimbun = Publisher(
name="Asahi Shimbun",
domain="https://www.asahi.com/",
parser=AsahiShimbunParser,
sources=[NewsMap("https://www.asahi.com/sitemap.xml")],
)
60 changes: 60 additions & 0 deletions src/fundus/publishers/jp/asahi_shimbun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
)


class AsahiShimbunParser(ParserProxy):
    """Parser proxy for Asahi Shimbun article pages."""

    class V1(BaseParser):
        """First parser version; body content is selected below ``div.nfyQp``."""

        # CSS selectors handed to the body extraction helper.
        _summary_selector = CSSSelector("div.nfyQp > div.bv2Sj > p")
        _paragraph_selector = CSSSelector("div.nfyQp > p")
        _subtitle_selector = CSSSelector("div.nfyQp > h2")

        # Pattern passed to the substitution helper in ``topics``.
        # NOTE(review): presumably used to strip publisher/boilerplate terms
        # from the keyword list — confirm against the helper's implementation.
        topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body from the precomputed document.

            Summary, paragraph and sub-headline elements are located with the
            class-level CSS selectors.
            """
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subtitle_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the value stored under the ``TITLE`` meta key, if any."""
            meta = self.precomputed.meta
            return meta.get("TITLE")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta value into a datetime."""
            raw_date = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(raw_date)

        @attribute
        def authors(self) -> List[str]:
            """Parse authors from the ``author`` entry found in the linked data."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``keywords`` meta value and apply ``topic_bloat_pattern``."""
            raw_keywords = self.precomputed.meta.get("keywords")
            parsed_topics = generic_topic_parsing(raw_keywords)
            return apply_substitution_pattern_over_list(parsed_topics, self.topic_bloat_pattern)

        @attribute
        def images(self) -> List[Image]:
            """Extract the article's images from the precomputed document.

            The regular expression handed over as ``author_selector`` captures,
            in its ``credits`` group, the text between a ``、`` and ``撮影``.
            """
            credit_pattern = re.compile(r"、(?P<credits>[^、]*?)撮影")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                author_selector=credit_pattern,
                relative_urls=True,
            )
52 changes: 52 additions & 0 deletions tests/resources/parser/test_data/jp/AsahiShimbun.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"V1": {
"authors": [
"朝日新聞デジタル"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"気象庁は13日、午後9時19分ごろ、日向灘(北緯31.8度、東経131.6度)で震度5弱の地震があったと発表した。震源の深さは約30キロ、地震の規模(マグニチュード)は6.9と推定される。この地震で、気象庁は高知県と宮崎県に1メートルの津波注意報を出した。",
"各地の震度は次のとおり。",
"<震度5弱>",
"宮崎県:高鍋町、新富町、宮崎市",
"<震度4>",
"宮崎県:延岡市、西都市、木城町、川南町、都農町、門川町、日南市*、串間市、国富町、綾町、美郷町、高千穂町、都城市、小林市、えびの市、三股町、高原町",
"福岡県:久留米市",
"佐賀県:神埼市、白石町",
"熊本県:阿蘇市、産山村、高森町、南阿蘇村、熊本市南区、熊本市北区、八代市、菊池市、宇土市、宇城市、合志市、美里町、西原村、氷川町、人吉市、多良木町、あさぎり町、芦北町",
"大分県:大分市、臼杵市、佐伯市、竹田市",
"鹿児島県:鹿児島市、霧島市、いちき串木野市、南さつま市、伊佐市、姶良市、鹿屋市、垂水市、曽於市、大崎町、東串良町、肝付町"
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://www.asahicom.jp/imgopt/img/4ff96428f2/comm_L/AS20250113003419.jpg",
"query_width": null,
"size": null,
"type": "image/jpeg"
}
],
"is_cover": true,
"description": "写真・図版",
"caption": null,
"authors": [],
"position": 737
}
],
"publishing_date": "2025-01-13 21:37:00+09:00",
"title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁",
"topics": [
"社会",
"災害・気象",
"宮崎県"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/jp/meta.info
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
{
"AsahiShimbun_2025_01_13.html.gz": {
"url": "https://www.asahi.com/articles/AST1F4445T1FUTIL02SM.html",
"crawl_date": "2025-01-13 14:12:17.527262"
},
"TheJapanNews_2024_10_13.html.gz": {
"url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/",
"crawl_date": "2024-10-13 16:27:01.520980"
Expand Down

0 comments on commit 0853045

Please sign in to comment.