From 816338542cea56c9dfe7acd99ad3b50a88e24792 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Sat, 21 Dec 2024 17:18:15 +0100 Subject: [PATCH 1/3] update bbc --- src/fundus/publishers/uk/__init__.py | 2 +- src/fundus/publishers/uk/the_bbc.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/fundus/publishers/uk/__init__.py b/src/fundus/publishers/uk/__init__.py index 57c41bd5..c5a71eb0 100644 --- a/src/fundus/publishers/uk/__init__.py +++ b/src/fundus/publishers/uk/__init__.py @@ -156,7 +156,7 @@ class UK(metaclass=PublisherGroup): domain="https://www.bbc.co.uk/", parser=TheBBCParser, sources=[ - NewsMap("https://www.bbc.co.uk/sitemaps/https-index-uk-news.xml"), + # NewsMap("https://www.bbc.co.uk/sitemaps/https-index-uk-news.xml"), Sitemap("https://www.bbc.co.uk/sitemaps/https-index-com-archive.xml", reverse=True), ], url_filter=regex_filter("video|live"), diff --git a/src/fundus/publishers/uk/the_bbc.py b/src/fundus/publishers/uk/the_bbc.py index 7a2127e9..24cae415 100644 --- a/src/fundus/publishers/uk/the_bbc.py +++ b/src/fundus/publishers/uk/the_bbc.py @@ -15,12 +15,16 @@ class TheBBCParser(ParserProxy): class V1(BaseParser): - _subheadline_selector = CSSSelector("div[data-component='subheadline-block']") - _summary_selector = XPath("//div[@data-component='text-block'][1] //p[b]") + _subheadline_selector = XPath( + "//div[@data-component='subheadline-block' or contains(@class, 'ebmt73l0')]//*[self::h2 or (self::p and b and position()>1)]" + ) + _summary_selector = XPath( + "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]" + ) _paragraph_selector = XPath( - "//div[@data-component='text-block'][1]//p[not(b) and text()] |" - "//div[@data-component='text-block'][position()>1] //p[text()] |" - "//div[@data-component='text-block'] //ul /li[text()]" + "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][1]//p[not(b) and text()] |" + "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][position()>1] //p[text()] |" + "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')] //ul /li[text()]" ) _topic_selector = CSSSelector( From 2cd53ffc4764d67e9af5c21c7ec2fb829c3697ad Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Sat, 21 Dec 2024 17:22:35 +0100 Subject: [PATCH 2/3] uncomment newsmap --- src/fundus/publishers/uk/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/uk/__init__.py b/src/fundus/publishers/uk/__init__.py index c5a71eb0..57c41bd5 100644 --- a/src/fundus/publishers/uk/__init__.py +++ b/src/fundus/publishers/uk/__init__.py @@ -156,7 +156,7 @@ class UK(metaclass=PublisherGroup): domain="https://www.bbc.co.uk/", parser=TheBBCParser, sources=[ - # NewsMap("https://www.bbc.co.uk/sitemaps/https-index-uk-news.xml"), + NewsMap("https://www.bbc.co.uk/sitemaps/https-index-uk-news.xml"), Sitemap("https://www.bbc.co.uk/sitemaps/https-index-com-archive.xml", reverse=True), ], url_filter=regex_filter("video|live"), From 2eef39341afa55ff7b16d17f66beb3dd870c62d8 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Sat, 21 Dec 2024 17:43:35 +0100 Subject: [PATCH 3/3] update subheadline selector --- src/fundus/publishers/uk/the_bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/uk/the_bbc.py b/src/fundus/publishers/uk/the_bbc.py index 24cae415..ccf860f2 100644 --- a/src/fundus/publishers/uk/the_bbc.py +++ b/src/fundus/publishers/uk/the_bbc.py @@ -16,7 +16,7 @@ class TheBBCParser(ParserProxy): class V1(BaseParser): _subheadline_selector = XPath( - "//div[@data-component='subheadline-block' or contains(@class, 'ebmt73l0')]//*[self::h2 or (self::p and b and position()>1)]" + "//div[@data-component='subheadline-block' or @data-component='text-block' or contains(@class, 'ebmt73l0')]//*[self::h2 or (self::p and b and position()>1)]" ) _summary_selector = XPath( "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]"