From 8b1e291ce504747e11e9d8caca6c3742cb5d9bf1 Mon Sep 17 00:00:00 2001 From: Unknown Date: Mon, 2 Oct 2017 10:57:10 -0300 Subject: [PATCH] Fixed selectors #24 agenciabrasil, g1, goval, govce mundoeducacao, oglobo, r7, sejabixo, senado, terra --- ze/processors/common.py | 13 ++++++++----- ze/processors/html.py | 30 ++++++++++++++++++++++++++---- ze/spiders/agenciabrasil.py | 7 ++++--- ze/spiders/g1.py | 6 ++++-- ze/spiders/goval.py | 6 ++++-- ze/spiders/govce.py | 1 + ze/spiders/mundoecucacao.py | 5 +++-- ze/spiders/oglobo.py | 3 ++- ze/spiders/r7.py | 4 +++- ze/spiders/sejabixo.py | 3 ++- ze/spiders/senado.py | 12 ++++++++---- ze/spiders/terra.py | 2 +- 12 files changed, 66 insertions(+), 26 deletions(-) diff --git a/ze/processors/common.py b/ze/processors/common.py index ee0699c..2e508be 100644 --- a/ze/processors/common.py +++ b/ze/processors/common.py @@ -45,7 +45,8 @@ def __call__(self, value, loader_context): spider_name = loader_context.get('spider_name') if spider_name == 'r7': - value=value.split('(')[1].split(')')[0] + if '(' in value: + value=value.split('(')[1].split(')')[0] if spider_name == 'correiobraziliense': return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'}) @@ -54,7 +55,7 @@ def __call__(self, value, loader_context): return datetime.fromtimestamp(int(value)) if spider_name == 'mundoeducacao': - value = value.replace('em','') + value = value.replace(' em','').replace('às','') return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'}) if (self.field == 'datePublished'): @@ -138,6 +139,8 @@ def __call__(self, value, loader_context): return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'}) if spider_name == 'govce': + if 'em' in value: + value=value.split('em')[1] value=value.split(',')[0] return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'}) @@ -207,9 +210,9 @@ def __call__(self, value, loader_context): value = value.split(' - ')[1].replace('ATUALIZADO EM','') return dateparser.parse(value, settings={'TIMEZONE': '+0300'}) - if spider_name=='govce': - value=value.split('em')[1] - return dateparser.parse(value, settings={'TIMEZONE': '+0300'}) + # if spider_name=='govce': + # value=value.split('em')[1] + # return dateparser.parse(value, settings={'TIMEZONE': '+0300'}) if spider_name == 'govrj': if 'Atualizado em' in value: diff --git a/ze/processors/html.py b/ze/processors/html.py index 3655ea8..91df883 100644 --- a/ze/processors/html.py +++ b/ze/processors/html.py @@ -204,9 +204,13 @@ def __call__(self, value, loader_context): fg.append(fc) el.replace_with(fg) - selector = 'div section p' - for el in html.select(selector): - el.decompose() + + estadao_decompose=[ 'div section p', + '.documento' + ] + for selector in estadao_decompose: + for el in html.select(selector): + el.decompose() except Exception as e: logger.error('Failed to replace "%s" selector from %s:\n%s', selector, spider_name, e) @@ -760,6 +764,18 @@ def __call__(self, value, loader_context): logger.error('Failed to replace "%s" selector from %s:\n%s', selector, spider_name, e) + if spider_name is 'goval': + al_decompose=[ 'h1', + 'time' + ] + try: + for selector in al_decompose: + for el in html.select(selector): + el.decompose() + except Exception as e: + logger.error('Failed to replace "%s" selector from %s:\n%s', + selector, spider_name, e) + if spider_name is 'goves': try: @@ -823,6 +839,9 @@ def __call__(self, value, loader_context): for el in html.select(selector): el.decompose() + selector = 'audio' + for el in html.select(selector): + el.decompose() except Exception as e: logger.error('Failed to replace "%s" selector from %s:\n%s', selector, spider_name, e) @@ -1141,7 +1160,8 @@ def __call__(self, value, loader_context): 'marquee', 'menu', '.navegacao', - 'n--noticia__newsletter', + '.n--noticia__newsletter', + # 'n--noticia__newsletter' '#noticia_vinculadas', # 'meta', 'figure meta', @@ -1158,6 +1178,7 @@ def __call__(self, value, loader_context): '.publicidade-content', '.publicado', '#recomendadosParaVoce', + '.RedesSociais', '.relacionadas', '.related-news-shell', '#respond', @@ -1180,6 +1201,7 @@ def __call__(self, value, loader_context): '.top-artigos', '#viewlet-above-content-title', 'video', + 'videoEmbed', 'xml', '[data-ng-controller="compartilhamentoController"]', '[data-ng-controller="newsletterControllerCardapio"]', diff --git a/ze/spiders/agenciabrasil.py b/ze/spiders/agenciabrasil.py index 1b7f5de..9492fb5 100644 --- a/ze/spiders/agenciabrasil.py +++ b/ze/spiders/agenciabrasil.py @@ -70,9 +70,10 @@ class AgenciaBrasilSpider(ZeSpider): "css": [ '[itemprop=articleBody]', '[property=articleBody]', - '.news', - '.content', - '.node-noticia .content' + # '.news', + # '.content', + '.node-noticia .content', + ] } }, diff --git a/ze/spiders/g1.py b/ze/spiders/g1.py index b28b62d..5e67e8e 100644 --- a/ze/spiders/g1.py +++ b/ze/spiders/g1.py @@ -62,7 +62,8 @@ class G1Spider(ZeSpider): "[itemprop=datePublished]::text", "time[datetime]::text", "time::attr(datetime)" , - ".data::text" + ".data::text", + ".published::text" ] } }, @@ -71,7 +72,8 @@ class G1Spider(ZeSpider): "css": [ "[itemprop=dateModified]::attr(datetime)" , "[itemprop=dateModified]::text", - ".updated" + ".updated::text", + ] } }, diff --git a/ze/spiders/goval.py b/ze/spiders/goval.py index 96d8d97..be7080c 100644 --- a/ze/spiders/goval.py +++ b/ze/spiders/goval.py @@ -54,7 +54,8 @@ class GovAlagoasSpider(ZeSpider): "css": [ '[itemprop=datePublished]::attr(content)', '.data::text', - '.data-post::text' + '.data-post::text', + 'time' ] } }, @@ -72,7 +73,8 @@ class GovAlagoasSpider(ZeSpider): "selectors": { "css": [ '[itemprop=articleBody]', - '[class=card-content]' + '[class=card-content]', + '.texto' ] } }, diff --git a/ze/spiders/govce.py b/ze/spiders/govce.py index f311761..a71273b 100644 --- a/ze/spiders/govce.py +++ b/ze/spiders/govce.py @@ -74,6 +74,7 @@ class GovCearaSpider(ZeSpider): '[itemprop=articleBody]', '[class=card-content]', '#conteudo_central', + '.SingleContent' ] } }, diff --git a/ze/spiders/mundoecucacao.py b/ze/spiders/mundoecucacao.py index 738c7bd..2813592 100644 --- a/ze/spiders/mundoecucacao.py +++ b/ze/spiders/mundoecucacao.py @@ -44,7 +44,7 @@ class MundoEducacaoSpider(ZeSpider): '[itemprop=author]::text', '.autor-nome::text', '.node-author-inner strong::text', - '.publicado b::text' + '.publicado b::text', ] } }, @@ -54,7 +54,8 @@ class MundoEducacaoSpider(ZeSpider): '[itemprop=datePublished]::attr(content)', '.data::text', 'p.meta::text', - '.publicado p' + '.publicado p', + '.publicado::text' ] } }, diff --git a/ze/spiders/oglobo.py b/ze/spiders/oglobo.py index 731f69c..3c9fbfc 100644 --- a/ze/spiders/oglobo.py +++ b/ze/spiders/oglobo.py @@ -77,7 +77,8 @@ class OGloboSpider(ZeSpider): "selectors": { "css": [ "[itemprop=articleBody]", - ".corpo" + ".corpo", + '.n--noticia__body .content' ] } }, diff --git a/ze/spiders/r7.py b/ze/spiders/r7.py index d83df9c..52ff351 100644 --- a/ze/spiders/r7.py +++ b/ze/spiders/r7.py @@ -54,7 +54,9 @@ class R7Spider(ZeSpider): "selectors": { "css": [ "[itemprop=datePublished]::text", - "[property='article:published_time']::attr(content)" + "[property='article:published_time']::attr(content)", + ".published_at::attr(datetime)" + ] } }, diff --git a/ze/spiders/sejabixo.py b/ze/spiders/sejabixo.py index 499a5b2..bae50b1 100644 --- a/ze/spiders/sejabixo.py +++ b/ze/spiders/sejabixo.py @@ -52,7 +52,8 @@ class SejaBixoSpider(ZeSpider): '[itemprop=datePublished]::attr(content)', '.data::text', 'article div[align=center] strong::text', - 'article i strong::text' + 'article i strong::text', + '"#content i strong"::text' ] } }, diff --git a/ze/spiders/senado.py b/ze/spiders/senado.py index 56a2575..e5a68cd 100644 --- a/ze/spiders/senado.py +++ b/ze/spiders/senado.py @@ -14,7 +14,8 @@ class SenadoSpider(ZeSpider): "meta[property='og:title']::attr(content)", "meta[name=title]::attr(content)", "[itemprop=headline]::text", - "#tituloNoticia h2::text" + "#tituloNoticia h2::text", + ".tituloVerNoticia::text" ] } }, @@ -57,7 +58,8 @@ class SenadoSpider(ZeSpider): '[itemprop=author]::text', '[class*=autor]::text', '#materia > p small::text', - '.ByLine-autor a::text' + '.ByLine-autor a::text', + # '.editoriaVerNoticia b::text' ] } }, @@ -67,7 +69,8 @@ class SenadoSpider(ZeSpider): '[itemprop=datePublished]::attr(content)', '.datahoraNoticia::text', '#materia span.text-muted::text', - '.ByLine-data::text' + '.ByLine-data::text', + '.editoriaVerNoticia::text' ] } }, @@ -85,7 +88,8 @@ class SenadoSpider(ZeSpider): '[itemprop=articleBody]', '[property=articleBody]', '#textoMateria', - '#content' + '#content', + '.textoNovo' ] } }, diff --git a/ze/spiders/terra.py b/ze/spiders/terra.py index 18773a7..f60b362 100644 --- a/ze/spiders/terra.py +++ b/ze/spiders/terra.py @@ -44,9 +44,9 @@ class TerraSpider(ZeSpider): "author": { "selectors": { "css": [ + ".author [itemprop=name]::attr(content)", ".authorName::text", "[itemprop=author]::text", - # "[itemprop=creator] [itemprop=name]::text", ] } },