Skip to content

Commit

Permalink
Merge pull request #416 from MaxDall/add-rheineische-post
Browse files Browse the repository at this point in the history
Add `Rheinische Post` as publisher
  • Loading branch information
MaxDall authored Apr 21, 2024
2 parents 97cc5e7 + a034eca commit 8484659
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>RheinischePost</code>
</td>
<td>
<div>Rheinische Post</div>
</td>
<td>
<a href="https://rp-online.de/">
<span>rp-online.de</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>SpiegelOnline</code>
Expand Down
12 changes: 12 additions & 0 deletions src/fundus/publishers/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .merkur import MerkurParser
from .ndr import NDRParser
from .ntv import NTVParser
from .rheinische_post import RheinischePostParser
from .spon import SPONParser
from .stern import SternParser
from .sz import SZParser
Expand Down Expand Up @@ -227,3 +228,14 @@ class DE(PublisherEnum):
],
parser=BusinessInsiderDEParser,
)

# Publisher entry for Rheinische Post (rp-online.de). Article URLs come from
# three sources: an RSS feed, a news sitemap (NewsMap) and the general sitemap.
# Pages are parsed with RheinischePostParser.
RheinischePost = PublisherSpec(
name="Rheinische Post",
domain="https://rp-online.de/",
sources=[
RSSFeed("https://rp-online.de/feed.rss"),
NewsMap("https://rp-online.de/sitemap-news.xml"),
Sitemap("https://rp-online.de/sitemap.xml"),
],
parser=RheinischePostParser,
)
44 changes: 44 additions & 0 deletions src/fundus/publishers/de/rheinische_post.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class RheinischePostParser(ParserProxy):
    """Parser proxy grouping the parser versions for Rheinische Post articles."""

    class V1(BaseParser):
        """First parser version.

        Every attribute is derived from the precomputed data of a page: the
        parsed document (``doc``), the meta tags (``meta``) and the linked
        data (``ld``).
        """

        # CSS selectors locating the individual parts of the article body.
        _summary_selector = CSSSelector("strong[data-cy='intro']")
        _paragraph_selector = CSSSelector("div[data-cy='article-content'] p")
        _subheadline_selector = CSSSelector("div[data-cy='article-content'] h2")

        @attribute
        def body(self) -> ArticleBody:
            """Extract the article body using the summary, paragraph and subheadline selectors."""
            document = self.precomputed.doc
            selectors = {
                "summary_selector": self._summary_selector,
                "paragraph_selector": self._paragraph_selector,
                "subheadline_selector": self._subheadline_selector,
            }
            return extract_article_body_with_selector(document, **selectors)

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors from the ``author`` meta entry."""
            raw_authors = self.precomputed.meta.get("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the publishing date from the ``datePublished`` linked-data entry."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the value of the ``og:title`` meta entry, if present."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def topics(self) -> List[str]:
            """Parse the topics from the ``keywords`` meta entry."""
            raw_keywords = self.precomputed.meta.get("keywords")
            return generic_topic_parsing(raw_keywords)
22 changes: 22 additions & 0 deletions tests/resources/parser/test_data/de/RheinischePost.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"V1": {
"authors": [
"Simon Janßen"
],
"publishing_date": "2024-04-15 16:37:00+02:00",
"title": "Schomaker an der Niederstraße: Bio-Bäckerei schließt Filiale in Neuss",
"topics": [
"Schomaker",
"Neuss",
"Bio",
"Niederstraße",
"Schließung",
"Reißleine",
"Backwaren",
"Bäcker",
"begründet",
"Biobäckerei",
"Bäckereien"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/de/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
"url": "https://www.n-tv.de/leben/Judy-Lybke-ganz-und-gar-nicht-eigenartig-article24075843.html",
"crawl_date": "2023-04-28 20:32:13.689394"
},
"RheinischePost_2024_04_15.html.gz": {
"url": "https://rp-online.de/nrw/staedte/neuss/neuss-biobaeckerei-schomaker-an-der-niederstrasse-schliesst_aid-110715299",
"crawl_date": "2024-04-15 16:40:22.430078"
},
"SZ_2023_04_28.html.gz": {
"url": "https://www.sueddeutsche.de/muenchen/landkreismuenchen/ranking-fahrradfreundlichkeit-fleissige-bundestagsabgeordnete-kolumne-1.5827206",
"crawl_date": "2023-04-28 20:21:15.488026"
Expand Down

0 comments on commit 8484659

Please sign in to comment.