Skip to content

Commit

Permalink
Fixed selectors labic#24
Browse files: browse the repository at this point in the history
agenciabrasil, g1, goval, govce, mundoeducacao, oglobo, r7, sejabixo, senado, terra
  • Loading branch information
ligiaiv committed Oct 2, 2017
1 parent 754334c commit 8b1e291
Show file tree
Hide file tree
Showing 12 changed files with 66 additions and 26 deletions.
13 changes: 8 additions & 5 deletions ze/processors/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ def __call__(self, value, loader_context):
spider_name = loader_context.get('spider_name')

if spider_name == 'r7':
value=value.split('(')[1].split(')')[0]
if '(' in value:
value=value.split('(')[1].split(')')[0]

if spider_name == 'correiobraziliense':
return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})
Expand All @@ -54,7 +55,7 @@ def __call__(self, value, loader_context):
return datetime.fromtimestamp(int(value))

if spider_name == 'mundoeducacao':
value = value.replace('em','')
value = value.replace(' em','').replace('às','')
return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})

if (self.field == 'datePublished'):
Expand Down Expand Up @@ -138,6 +139,8 @@ def __call__(self, value, loader_context):
return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})

if spider_name == 'govce':
if 'em' in value:
value=value.split('em')[1]
value=value.split(',')[0]
return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})

Expand Down Expand Up @@ -207,9 +210,9 @@ def __call__(self, value, loader_context):
value = value.split(' - ')[1].replace('ATUALIZADO EM','')
return dateparser.parse(value, settings={'TIMEZONE': '+0300'})

if spider_name=='govce':
value=value.split('em')[1]
return dateparser.parse(value, settings={'TIMEZONE': '+0300'})
# if spider_name=='govce':
# value=value.split('em')[1]
# return dateparser.parse(value, settings={'TIMEZONE': '+0300'})

if spider_name == 'govrj':
if 'Atualizado em' in value:
Expand Down
30 changes: 26 additions & 4 deletions ze/processors/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,13 @@ def __call__(self, value, loader_context):
fg.append(fc)

el.replace_with(fg)
selector = 'div section p'
for el in html.select(selector):
el.decompose()

estadao_decompose=[ 'div section p',
'.documento'
]
for selector in estadao_decompose:
for el in html.select(selector):
el.decompose()
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s', selector, spider_name, e)

Expand Down Expand Up @@ -760,6 +764,18 @@ def __call__(self, value, loader_context):
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)

if spider_name is 'goval':
al_decompose=[ 'h1',
'time'
]
try:
for selector in al_decompose:
for el in html.select(selector):
el.decompose()
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)


if spider_name is 'goves':
try:
Expand Down Expand Up @@ -823,6 +839,9 @@ def __call__(self, value, loader_context):
for el in html.select(selector):
el.decompose()

selector = 'audio'
for el in html.select(selector):
el.decompose()
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)
Expand Down Expand Up @@ -1141,7 +1160,8 @@ def __call__(self, value, loader_context):
'marquee',
'menu',
'.navegacao',
'n--noticia__newsletter',
'.n--noticia__newsletter',
# 'n--noticia__newsletter'
'#noticia_vinculadas',
# 'meta',
'figure meta',
Expand All @@ -1158,6 +1178,7 @@ def __call__(self, value, loader_context):
'.publicidade-content',
'.publicado',
'#recomendadosParaVoce',
'.RedesSociais',
'.relacionadas',
'.related-news-shell',
'#respond',
Expand All @@ -1180,6 +1201,7 @@ def __call__(self, value, loader_context):
'.top-artigos',
'#viewlet-above-content-title',
'video',
'videoEmbed',
'xml',
'[data-ng-controller="compartilhamentoController"]',
'[data-ng-controller="newsletterControllerCardapio"]',
Expand Down
7 changes: 4 additions & 3 deletions ze/spiders/agenciabrasil.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ class AgenciaBrasilSpider(ZeSpider):
"css": [
'[itemprop=articleBody]',
'[property=articleBody]',
'.news',
'.content',
'.node-noticia .content'
# '.news',
# '.content',
'.node-noticia .content',

]
}
},
Expand Down
6 changes: 4 additions & 2 deletions ze/spiders/g1.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ class G1Spider(ZeSpider):
"[itemprop=datePublished]::text",
"time[datetime]::text",
"time::attr(datetime)" ,
".data::text"
".data::text",
".published::text"
]
}
},
Expand All @@ -71,7 +72,8 @@ class G1Spider(ZeSpider):
"css": [
"[itemprop=dateModified]::attr(datetime)" ,
"[itemprop=dateModified]::text",
".updated"
".updated::text",

]
}
},
Expand Down
6 changes: 4 additions & 2 deletions ze/spiders/goval.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ class GovAlagoasSpider(ZeSpider):
"css": [
'[itemprop=datePublished]::attr(content)',
'.data::text',
'.data-post::text'
'.data-post::text',
'time'
]
}
},
Expand All @@ -72,7 +73,8 @@ class GovAlagoasSpider(ZeSpider):
"selectors": {
"css": [
'[itemprop=articleBody]',
'[class=card-content]'
'[class=card-content]',
'.texto'
]
}
},
Expand Down
1 change: 1 addition & 0 deletions ze/spiders/govce.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class GovCearaSpider(ZeSpider):
'[itemprop=articleBody]',
'[class=card-content]',
'#conteudo_central',
'.SingleContent'
]
}
},
Expand Down
5 changes: 3 additions & 2 deletions ze/spiders/mundoecucacao.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class MundoEducacaoSpider(ZeSpider):
'[itemprop=author]::text',
'.autor-nome::text',
'.node-author-inner strong::text',
'.publicado b::text'
'.publicado b::text',
]
}
},
Expand All @@ -54,7 +54,8 @@ class MundoEducacaoSpider(ZeSpider):
'[itemprop=datePublished]::attr(content)',
'.data::text',
'p.meta::text',
'.publicado p'
'.publicado p',
'.publicado::text'
]
}
},
Expand Down
3 changes: 2 additions & 1 deletion ze/spiders/oglobo.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ class OGloboSpider(ZeSpider):
"selectors": {
"css": [
"[itemprop=articleBody]",
".corpo"
".corpo",
'.n--noticia__body .content'
]
}
},
Expand Down
4 changes: 3 additions & 1 deletion ze/spiders/r7.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ class R7Spider(ZeSpider):
"selectors": {
"css": [
"[itemprop=datePublished]::text",
"[property='article:published_time']::attr(content)"
"[property='article:published_time']::attr(content)",
".published_at::attr(datetime)"

]
}
},
Expand Down
3 changes: 2 additions & 1 deletion ze/spiders/sejabixo.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ class SejaBixoSpider(ZeSpider):
'[itemprop=datePublished]::attr(content)',
'.data::text',
'article div[align=center] strong::text',
'article i strong::text'
'article i strong::text',
'"#content i strong"::text'
]
}
},
Expand Down
12 changes: 8 additions & 4 deletions ze/spiders/senado.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class SenadoSpider(ZeSpider):
"meta[property='og:title']::attr(content)",
"meta[name=title]::attr(content)",
"[itemprop=headline]::text",
"#tituloNoticia h2::text"
"#tituloNoticia h2::text",
".tituloVerNoticia::text"
]
}
},
Expand Down Expand Up @@ -57,7 +58,8 @@ class SenadoSpider(ZeSpider):
'[itemprop=author]::text',
'[class*=autor]::text',
'#materia > p small::text',
'.ByLine-autor a::text'
'.ByLine-autor a::text',
# '.editoriaVerNoticia b::text'
]
}
},
Expand All @@ -67,7 +69,8 @@ class SenadoSpider(ZeSpider):
'[itemprop=datePublished]::attr(content)',
'.datahoraNoticia::text',
'#materia span.text-muted::text',
'.ByLine-data::text'
'.ByLine-data::text',
'.editoriaVerNoticia::text'
]
}
},
Expand All @@ -85,7 +88,8 @@ class SenadoSpider(ZeSpider):
'[itemprop=articleBody]',
'[property=articleBody]',
'#textoMateria',
'#content'
'#content',
'.textoNovo'
]
}
},
Expand Down
2 changes: 1 addition & 1 deletion ze/spiders/terra.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ class TerraSpider(ZeSpider):
"author": {
"selectors": {
"css": [
".author [itemprop=name]::attr(content)",
".authorName::text",
"[itemprop=author]::text",
# "[itemprop=creator] [itemprop=name]::text",
]
}
},
Expand Down

0 comments on commit 8b1e291

Please sign in to comment.