Skip to content

Commit

Permalink
New spiders
Browse files Browse the repository at this point in the history
Estaduais - RN,RO,RR,SE,TO
  • Loading branch information
ligiaiv committed Sep 29, 2017
1 parent cdf9455 commit 754334c
Show file tree
Hide file tree
Showing 7 changed files with 580 additions and 4 deletions.
5 changes: 5 additions & 0 deletions ze/processors/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,11 @@ def __call__(self, value, loader_context):
value=value.split('Atualizado em')[0].replace(' - ',' ')
return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})

if spider_name == 'govto':
value=value.split('-')[0]
return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})


if (self.field == 'dateModified'):
if spider_name == 'zh':
value=value.split('|')[1].replace(' - ',' ')\
Expand Down
77 changes: 73 additions & 4 deletions ze/processors/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,14 +934,83 @@ def __call__(self, value, loader_context):
fg.append(html.new_tag('img', src='http://www.sc.gov.br/'+el['src']))
el.parent.replace_with(fg)

# selector='a'
# for el in html.select(selector):
# print('------xxxxxxxxxxx----------')
# el.replace(el.get_text())
selector='a'
for el in html.select(selector):
el.replace_with(el.get_text())
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)

if spider_name is 'govrn':
rn_decompose=[ 'h1',
'.compartilhar',
'.credito',
'.tags',
'.noticias_relacionadas'
]
try:
for selector in rn_decompose:
for el in html.select(selector):
el.decompose()
selector = '.imagem'
for el in html.select(selector):
fg = html.new_tag('figure')
fg.append(html.new_tag('img', src=el.select('img')[0]['src']))
fc = html.new_tag('figcaption')
fc.string = el.select('.legenda')[0].get_text()
fg.append(fc)

el.replace_with(fg)
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)


if spider_name is 'govro':
try:
selector='.wp-caption'
for el in html.select(selector):
fg = html.new_tag('figure')
fg.append(html.new_tag('img', src=el.select('img')[0]['src']))
fc = html.new_tag('figcaption')
fc.string = el.select('.wp-caption-text')[0].get_text()
fg.append(fc)
el.replace_with(fg)

selector='a'
for el in html.select(selector):
el.replace_with(el.get_text())
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)

if spider_name is 'govrr':
try:
selector='a'
for el in html.select(selector):
el.replace_with(el.get_text())
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)

if spider_name is 'govto':
try:
selector = '#fotos'

for el in html.select(selector):

section = html.new_tag('section',**{'class':'gallery'})
for photo in el.select('img'):
fg = html.new_tag('figure')
fg.append(html.new_tag('img', src=photo['src']))
fc = html.new_tag('figcaption')
fc.string = el.parent.select('p')[0].get_text()
fg.append(fc)
section.append(fg)
el.replace_with(section)
except Exception as e:
logger.error('Failed to replace "%s" selector from %s:\n%s',
selector, spider_name, e)

# all spiders
try:
Expand Down
100 changes: 100 additions & 0 deletions ze/spiders/govrn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
from . import ZeSpider


class GovernoRioGrandedoNorte(ZeSpider):
    """Spider for the Rio Grande do Norte state government site (rn.gov.br).

    The class is purely declarative: ``items_refs`` maps every field of
    ``ze.items.creativework.ArticleItem`` to a list of candidate CSS
    selectors. How the candidates are combined (first match wins vs. all
    matches collected) is decided by ``ZeSpider`` / the item loaders and
    is not visible here -- NOTE(review): confirm against the base class.
    """

    name = 'govrn'
    allowed_domains = ['rn.gov.br']
    items_refs = [{
        "item": "ze.items.creativework.ArticleItem",
        "fields": {
            "name": {
                "selectors": {
                    "css": [
                        "meta[property='og:title']::attr(content)",
                        "meta[name=title]::attr(content)",
                        '[itemprop=headline]::text',
                        '.title-post::text',
                        '[itemprop="headline"] a::text',
                        'h1 a::text',
                    ]
                }
            },
            "image": {
                "selectors": {
                    "css": [
                        # Only selectors that yield an image URL belong
                        # here. The description meta tags that used to
                        # head this list made the page description leak
                        # into the image field.
                        'meta[property="og:image"]::attr(content)',
                        '[itemprop="image"]::attr(src)',
                        '.lazy::attr("data-lazy-src")',
                    ]
                }
            },
            "description": {
                "selectors": {
                    "css": [
                        '[itemprop=description]::attr(content)',
                        '[itemprop=description]::text',
                        '.entry-content h2::text',
                        '.linha-fina::text',
                        '.entry-content blockquote p::text',
                        '[property="og:description"]::attr(content)',
                    ]
                }
            },
            "author": {
                "selectors": {
                    "css": [
                        '[name=author]::attr(content)',
                        '[itemprop=author]::text',
                        '.author a::text',
                        '.credito::text',
                    ]
                }
            },
            "datePublished": {
                "selectors": {
                    "css": [
                        '[itemprop=datePublished]::attr(content)',
                        '.entry-date::text',
                        '[property="article:published_time"]::attr(content)',
                        '[itemprop="datePublished"]::attr(datetime)',
                        '.credito span::text',
                    ]
                }
            },
            "dateModified": {
                "selectors": {
                    "css": [
                        '[itemprop=dateModified]::attr(content)',
                        '[property="article:modified_time"]::attr(content)',
                        '[itemprop="dateModified"]::attr(datetime)',
                    ]
                }
            },
            "articleBody": {
                "selectors": {
                    "css": [
                        '[itemprop=articleBody]',
                        '.noticia',
                        'article.article-main',
                        '.Conteiner #P000',
                    ]
                }
            },
            "keywords": {
                "selectors": {
                    "css": [
                        '[itemprop=keywords] a::text',
                        '[rel=tag]::text',
                        '.categories a::text',
                        '.tags a::text',
                    ]
                }
            },
        }
    }]
100 changes: 100 additions & 0 deletions ze/spiders/govro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
from . import ZeSpider


class GovernoRondonia(ZeSpider):
    """Spider for the Rondonia state government site (ro.gov.br).

    The class is purely declarative: ``items_refs`` maps every field of
    ``ze.items.creativework.ArticleItem`` to a list of candidate CSS
    selectors. How the candidates are combined (first match wins vs. all
    matches collected) is decided by ``ZeSpider`` / the item loaders and
    is not visible here -- NOTE(review): confirm against the base class.
    """

    name = 'govro'
    allowed_domains = ['ro.gov.br']
    items_refs = [{
        "item": "ze.items.creativework.ArticleItem",
        "fields": {
            "name": {
                "selectors": {
                    "css": [
                        "meta[property='og:title']::attr(content)",
                        "meta[name=title]::attr(content)",
                        '[itemprop=headline]::text',
                        '.title-post::text',
                        '[itemprop="headline"] a::text',
                        'h1 a::text',
                    ]
                }
            },
            "image": {
                "selectors": {
                    "css": [
                        # Only selectors that yield an image URL belong
                        # here. The description meta tags that used to
                        # head this list made the page description leak
                        # into the image field.
                        'meta[property="og:image"]::attr(content)',
                        '[itemprop="image"]::attr(src)',
                        '.lazy::attr("data-lazy-src")',
                    ]
                }
            },
            "description": {
                "selectors": {
                    "css": [
                        '[itemprop=description]::attr(content)',
                        '[itemprop=description]::text',
                        '.entry-content h2::text',
                        '.linha-fina::text',
                        '.entry-content blockquote p::text',
                        '[property="og:description"]::attr(content)',
                    ]
                }
            },
            "author": {
                "selectors": {
                    "css": [
                        '[name=author]::attr(content)',
                        '[itemprop=author]::text',
                        '.author a::text',
                        '.credito::text',
                    ]
                }
            },
            "datePublished": {
                "selectors": {
                    "css": [
                        '[itemprop=datePublished]::attr(content)',
                        '.entry-date::text',
                        '[property="article:published_time"]::attr(content)',
                        '[itemprop="datePublished"]::attr(datetime)',
                        '.data span::text',
                    ]
                }
            },
            "dateModified": {
                "selectors": {
                    "css": [
                        '[itemprop=dateModified]::attr(content)',
                        '[property="article:modified_time"]::attr(content)',
                        '[itemprop="dateModified"]::attr(datetime)',
                    ]
                }
            },
            "articleBody": {
                "selectors": {
                    "css": [
                        '[itemprop=articleBody]',
                        '.noticia',
                        'article.article-main',
                        '.entry',
                    ]
                }
            },
            "keywords": {
                "selectors": {
                    "css": [
                        '[itemprop=keywords] a::text',
                        '[rel=tag]::text',
                        '.categories a::text',
                        '.tags a::text',
                    ]
                }
            },
        }
    }]
Loading

0 comments on commit 754334c

Please sign in to comment.