-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #682 from flairNLP/add-asahi-shimbun
Add `AsahiShimbun`
- Loading branch information
Showing
6 changed files
with
139 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.cssselect import CSSSelector | ||
|
||
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
apply_substitution_pattern_over_list, | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
generic_topic_parsing, | ||
image_extraction, | ||
) | ||
|
||
|
||
class AsahiShimbunParser(ParserProxy):
    """Parser proxy for Asahi Shimbun articles; groups the versioned parsers."""

    class V1(BaseParser):
        """First parser version, built on CSS selectors and page metadata."""

        # Selectors locating the summary, the body paragraphs and the
        # sub-headlines inside the ``div.nfyQp`` container.
        _summary_selector = CSSSelector("div.nfyQp > div.bv2Sj > p")
        _paragraph_selector = CSSSelector("div.nfyQp > p")
        _subtitle_selector = CSSSelector("div.nfyQp > h2")

        # Substrings handed to the substitution helper when cleaning topics.
        topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body from the precomputed document."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subtitle_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the value stored under the ``TITLE`` meta key, if any."""
            meta = self.precomputed.meta
            return meta.get("TITLE")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta entry into a datetime."""
            published = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(published)

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors found under ``author`` in the linked data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``keywords`` meta entry and apply the bloat pattern."""
            keywords = self.precomputed.meta.get("keywords")
            parsed_topics = generic_topic_parsing(keywords)
            return apply_substitution_pattern_over_list(parsed_topics, self.topic_bloat_pattern)

        @attribute
        def images(self) -> List[Image]:
            """Collect the article's images via the generic image extraction."""
            # Named group ``credits`` captures the text between a "、" and the
            # following "撮影", with no further "、" in between.
            credit_pattern = re.compile(r"、(?P<credits>[^、]*?)撮影")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                author_selector=credit_pattern,
                relative_urls=True,
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
{ | ||
"V1": { | ||
"authors": [ | ||
"朝日新聞デジタル" | ||
], | ||
"body": { | ||
"summary": [], | ||
"sections": [ | ||
{ | ||
"headline": [], | ||
"paragraphs": [ | ||
"気象庁は13日、午後9時19分ごろ、日向灘(北緯31.8度、東経131.6度)で震度5弱の地震があったと発表した。震源の深さは約30キロ、地震の規模(マグニチュード)は6.9と推定される。この地震で、気象庁は高知県と宮崎県に1メートルの津波注意報を出した。", | ||
"各地の震度は次のとおり。", | ||
"<震度5弱>", | ||
"宮崎県:高鍋町、新富町、宮崎市", | ||
"<震度4>", | ||
"宮崎県:延岡市、西都市、木城町、川南町、都農町、門川町、日南市*、串間市、国富町、綾町、美郷町、高千穂町、都城市、小林市、えびの市、三股町、高原町", | ||
"福岡県:久留米市", | ||
"佐賀県:神埼市、白石町", | ||
"熊本県:阿蘇市、産山村、高森町、南阿蘇村、熊本市南区、熊本市北区、八代市、菊池市、宇土市、宇城市、合志市、美里町、西原村、氷川町、人吉市、多良木町、あさぎり町、芦北町", | ||
"大分県:大分市、臼杵市、佐伯市、竹田市", | ||
"鹿児島県:鹿児島市、霧島市、いちき串木野市、南さつま市、伊佐市、姶良市、鹿屋市、垂水市、曽於市、大崎町、東串良町、肝付町" | ||
] | ||
} | ||
] | ||
}, | ||
"images": [ | ||
{ | ||
"versions": [ | ||
{ | ||
"url": "https://www.asahicom.jp/imgopt/img/4ff96428f2/comm_L/AS20250113003419.jpg", | ||
"query_width": null, | ||
"size": null, | ||
"type": "image/jpeg" | ||
} | ||
], | ||
"is_cover": true, | ||
"description": "写真・図版", | ||
"caption": null, | ||
"authors": [], | ||
"position": 737 | ||
} | ||
], | ||
"publishing_date": "2025-01-13 21:37:00+09:00", | ||
"title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁", | ||
"topics": [ | ||
"社会", | ||
"災害・気象", | ||
"宮崎県" | ||
] | ||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters