Skip to content

Commit

Permalink
Fixed selectors, removed agenciabrasil labic#24
Browse files Browse the repository at this point in the history
There were two spiders for agenciabrasil.ebc.com.br:
ebc and agenciabrasil; removed agenciabrasil and kept ebc.
  • Loading branch information
ligiaiv committed Oct 6, 2017
1 parent 84e8ee7 commit ef671f3
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 131 deletions.
22 changes: 20 additions & 2 deletions ze/processors/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@ def __call__(self, value, loader_context):
section.append(fg)

el.replace_with(section)

folha_decompose=[ 'h5'
]
for selector in folha_decompose:
for el in html.select(selector):
el.decompose()
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)
Expand Down Expand Up @@ -730,7 +736,17 @@ def __call__(self, value, loader_context):
selector, spider_name, e)



if spider_name is 'guiadoestudante':
guia_decompose=[ '.top_helper ',
'#links-uteis-',
]
try:
for selector in guia_decompose:
for el in html.select(selector):
el.decompose()
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)



Expand Down Expand Up @@ -821,7 +837,7 @@ def __call__(self, value, loader_context):

except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)
selector, spider_name, e)

# if spider_name is 'govpa':
# try:
Expand Down Expand Up @@ -1105,6 +1121,7 @@ def __call__(self, value, loader_context):
if not el_to_decompose:
el_to_decompose = {
'geral': [
'.ad-content',
'.additional',
'.advertising',
'.articleCredit',
Expand Down Expand Up @@ -1198,6 +1215,7 @@ def __call__(self, value, loader_context):
'.single__conteudo--galeria-de-fotos',
'.single__conteudo--tags',
'.social-share-buttons',
'.social-share',
'.story-body__unordered-list',
'.sumario_apoyos',
"#sponsored-links",
Expand Down
91 changes: 0 additions & 91 deletions ze/spiders/agenciabrasil.py

This file was deleted.

4 changes: 3 additions & 1 deletion ze/spiders/ebc.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ class EBCSpider(ZeSpider):
"css": [
'meta[property="og:image"]::attr(content)',
'[itemprop="image"] img::attr(src)',
'.node-noticia figure img::attr(src)'
'.node-noticia figure img::attr(src)',
'.teaser img::attr(src)'

]
}
},
Expand Down
49 changes: 27 additions & 22 deletions ze/spiders/folhadesp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ class FolhaDeSaoPauloSpider(ZeSpider):
allowed_domains = ['folha.uol.com.br']
items_refs = [{
"item": "ze.items.creativework.ArticleItem",
"fields": {
"fields": {
"name": {
"selectors": {
"css": [
"meta[property='og:title']::attr(content)",
"meta[name=title]::attr(content)",
".news header h1::text",
"[itemprop=name]::text",
"[itemprop='headline']::text",
".news header h1::text",
"[itemprop=name]::text",
"[itemprop='headline']::text",
"[itemprop=alternativeHeadline]::attr(content)"
]
}
Expand All @@ -27,56 +27,61 @@ class FolhaDeSaoPauloSpider(ZeSpider):
"meta[property='og:description']::attr(content)",
"meta[name=description]::attr(content)",
'meta[property="og:image"]::attr(content)',
"[itemprop=image]::attr(content)",
"[itemprop=image]::attr(content)",
"[property='og:image']::attr(content)"
]
}
},
},
"description": {
"selectors": {
"css": [
".documentDescription::text",
"[itemprop=description]::text"
".documentDescription::text",
"[itemprop=description]::text",
'[property="og:description"]::attr(content)'
]
}
},
},
"author": {
"selectors": {
"css": [
".news .author p b",
".news .author p b",
"[itemprop=author] b::text",
".news__byline p strong::text"
".news__byline p strong::text",
'.post-autor::text'
]
}
},
},
"datePublished": {
"selectors": {
"css": [
".news time::attr(datetime)",
"[itemprop=datePublished]::text"
".news time::attr(datetime)",
"[itemprop=datePublished]::text",
'[property="article:published_time"]::attr(content)'
]
}
},
},
"dateModified": {
"selectors": {
"css": [
"[itemprop=dateModified]::text"
"[itemprop=dateModified]::text",
'[property="article:modified_time"]::attr(content)'
]
}
},
},
"articleBody": {
"selectors": {
"css": [
".news .content",
"[itemprop=articleBody]"
".news .content",
"[itemprop=articleBody]",
".single-post-content"
]
}
},
},
"keywords": {
"selectors": {
"css": [
"meta[name=keywords]::attr(content)",
"[itemprop=keywords]::text",
"meta[name=keywords]::attr(content)",
"[itemprop=keywords]::text",
"[itemprop=keywords]::attr(content)"
]
}
Expand Down
21 changes: 11 additions & 10 deletions ze/spiders/g1.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class G1Spider(ZeSpider):
"[itemprop=dateModified]::attr(datetime)" ,
"[itemprop=dateModified]::text",
".updated::text",

]
}
},
Expand All @@ -97,22 +97,23 @@ class G1Spider(ZeSpider):
"css": [
"meta[name=keywords]::attr(content)",
"[itemprop=keywords]::text",
".entities__list-itemLink::text"
".entities__list-itemLink::text",
'.entities__list a::text',
]
}
}
}
}]

# def harvest_metadata(self, resp: Response, item, **kargs):
# # TODO: Move to DownloadMiddleware
# item['meta']['jsonLDSchemas'] = self.jsonLDSchemas(reponse)
# item['meta']['otherLinks'] = self.jsonLD

@staticmethod
def improve_html(html, spider_name=None):
exceptions = []; exceptions_append = exceptions.append

try:
selector = '[data-block-type="backstage-photo"]'
for el in html.select(selector):
Expand All @@ -122,24 +123,24 @@ def improve_html(html, spider_name=None):
fc = html.new_tag('figcaption')
fc.string = el.select_one('.content-media__description__caption').get_text()
fg.append(fc)

el.replace_with(fg)
except Exception as e:
exceptions_append(e)

try:
selector = '[data-block-type="backstage-video"]'
for el in html.select(selector):
video_id = el.select('.content-video__placeholder')[0]['data-video-id']

fg = html.new_tag('figure')
fg.append(html.new_tag('img', src='https://s02.video.glbimg.com/x720/%s.jpg' % video_id))
fc = html.new_tag('figcaption')
fc.string = el.select('[itemprop="description"]')[0].get_text() #antes tava itemprop='caption'
fg.append(fc)
a = html.new_tag('a', href='https://globoplay.globo.com/v/%s/' % video_id)
a.append(fg)

el.replace_with(a)
except Exception as e:
exceptions_append(e)
Expand All @@ -148,5 +149,5 @@ def improve_html(html, spider_name=None):
el.replace_with(el.get_text())
except Exception as e:
exceptions_append(e)

return html, exceptions
4 changes: 3 additions & 1 deletion ze/spiders/guiadoestudante.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ class GuiadoEstudanteSpider(ZeSpider):
"selectors": {
"css": [
"[itemprop=articleBody]",
".article-content"
'article'
# ".article-content",

]
}
},
Expand Down
3 changes: 2 additions & 1 deletion ze/spiders/istoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ class IstoESpider(ZeSpider):
"css": [
'meta[property="og:image"]::attr(content)',
"[itemprop=image]::attr(content)",
"[property=og:image]::attr(content)"
"[property=og:image]::attr(content)",
'.teaser img::attr(src)'
]
}
},
Expand Down
6 changes: 4 additions & 2 deletions ze/spiders/portaluai.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class PortalUAISpider(ZeSpider):
"meta[name=title]::attr(content)",
'[itemprop=headline]::text',
'.title-post::text',
'.entry-title::text'
'.entry-title::text',
'.title::text'
]
}
},
Expand Down Expand Up @@ -56,7 +57,8 @@ class PortalUAISpider(ZeSpider):
"selectors": {
"css": [
'[itemprop=datePublished]::attr(content)',
'.entry-date::text'
'[name="DC.date.created"]::attr(content)',
'.entry-date::text',
]
}
},
Expand Down
3 changes: 2 additions & 1 deletion ze/spiders/sejabixo.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ class SejaBixoSpider(ZeSpider):
'.data::text',
'article div[align=center] strong::text',
'article i strong::text',
'"#content i strong"::text'
'"#content i strong"::text',
'.lrec1 i strong::text',
]
}
},
Expand Down

0 comments on commit ef671f3

Please sign in to comment.