Skip to content

Commit

Permalink
Merge pull request #668 from flairNLP/update_bbc
Browse files Browse the repository at this point in the history
Update BBC
  • Loading branch information
addie9800 authored Jan 2, 2025
2 parents bc58f98 + 2eef393 commit 6ec184f
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions src/fundus/publishers/uk/the_bbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@

 class TheBBCParser(ParserProxy):
     class V1(BaseParser):
-        _subheadline_selector = CSSSelector("div[data-component='subheadline-block']")
-        _summary_selector = XPath("//div[@data-component='text-block'][1] //p[b]")
+        _subheadline_selector = XPath(
+            "//div[@data-component='subheadline-block' or @data-component='text-block' or contains(@class, 'ebmt73l0')]//*[self::h2 or (self::p and b and position()>1)]"
+        )
+        _summary_selector = XPath(
+            "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]"
+        )
         _paragraph_selector = XPath(
-            "//div[@data-component='text-block'][1]//p[not(b) and text()] |"
-            "//div[@data-component='text-block'][position()>1] //p[text()] |"
-            "//div[@data-component='text-block'] //ul /li[text()]"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][1]//p[not(b) and text()] |"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][position()>1] //p[text()] |"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')] //ul /li[text()]"
         )

         _topic_selector = CSSSelector(
Expand Down

0 comments on commit 6ec184f

Please sign in to comment.