From 754334cf72ed58975415f3b521255901b2a34e69 Mon Sep 17 00:00:00 2001
From: Unknown
Date: Fri, 29 Sep 2017 09:59:54 -0300
Subject: [PATCH] New spiders Estaduais - RN,RO,RR,SE,TO

---
 ze/processors/common.py |   6 ++
 ze/processors/html.py   |  81 ++++++++++++++++++++++++++++--
 ze/spiders/govrn.py     |  99 ++++++++++++++++++++++++++++++++++++++
 ze/spiders/govro.py     |  99 ++++++++++++++++++++++++++++++++++++++
 ze/spiders/govrr.py     | 100 +++++++++++++++++++++++++++++++++++++++
 ze/spiders/govse.py     |  97 ++++++++++++++++++++++++++++++++++++++
 ze/spiders/govto.py     | 102 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 580 insertions(+), 4 deletions(-)
 create mode 100644 ze/spiders/govrn.py
 create mode 100644 ze/spiders/govro.py
 create mode 100644 ze/spiders/govrr.py
 create mode 100644 ze/spiders/govse.py
 create mode 100644 ze/spiders/govto.py

diff --git a/ze/processors/common.py b/ze/processors/common.py
index fa3982c..ee0699c 100644
--- a/ze/processors/common.py
+++ b/ze/processors/common.py
@@ -159,6 +159,12 @@ def __call__(self, value, loader_context):
                 value=value.split('Atualizado em')[0].replace(' - ',' ')
                 return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})
 
+            # govto: parse only the part of the value before the first '-'
+            if spider_name == 'govto':
+                value=value.split('-')[0]
+                return dateparser.parse(value, settings={'TIMEZONE': '+0300','DATE_ORDER': 'DMY'})
+
+
         if (self.field == 'dateModified'):
             if spider_name == 'zh':
                 value=value.split('|')[1].replace(' - ',' ')\
diff --git a/ze/processors/html.py b/ze/processors/html.py
index 082187d..3655ea8 100644
--- a/ze/processors/html.py
+++ b/ze/processors/html.py
@@ -934,14 +934,87 @@ def __call__(self, value, loader_context):
                     fg.append(html.new_tag('img', src='http://www.sc.gov.br/'+el['src']))
                     el.parent.replace_with(fg)
 
-                # selector='a'
-                # for el in html.select(selector):
-                #     print('------xxxxxxxxxxx----------')
-                #     el.replace(el.get_text())
+                selector='a'
+                for el in html.select(selector):
+                    el.replace_with(el.get_text())
+            except Exception as e:
+                logger.error('Failed to replace "%s" selector from %s:\n%s',
+                             selector, spider_name, e)
+
+        # govrn: drop the listed selectors, then rebuild each .imagem block as a figure with its .legenda caption
+        if spider_name == 'govrn':
+            rn_decompose = ['h1',
+                            '.compartilhar',
+                            '.credito',
+                            '.tags',
+                            '.noticias_relacionadas'
+                            ]
+            try:
+                for selector in rn_decompose:
+                    for el in html.select(selector):
+                        el.decompose()
+                selector = '.imagem'
+                for el in html.select(selector):
+                    fg = html.new_tag('figure')
+                    fg.append(html.new_tag('img', src=el.select('img')[0]['src']))
+                    fc = html.new_tag('figcaption')
+                    fc.string = el.select('.legenda')[0].get_text()
+                    fg.append(fc)
+
+                    el.replace_with(fg)
+            except Exception as e:
+                logger.error('Failed to replace "%s" selector from %s:\n%s',
+                             selector, spider_name, e)
+
+
+        # govro: rebuild .wp-caption blocks as figure/figcaption, then replace links with their text
+        if spider_name == 'govro':
+            try:
+                selector='.wp-caption'
+                for el in html.select(selector):
+                    fg = html.new_tag('figure')
+                    fg.append(html.new_tag('img', src=el.select('img')[0]['src']))
+                    fc = html.new_tag('figcaption')
+                    fc.string = el.select('.wp-caption-text')[0].get_text()
+                    fg.append(fc)
+                    el.replace_with(fg)
+
+                selector='a'
+                for el in html.select(selector):
+                    el.replace_with(el.get_text())
             except Exception as e:
                 logger.error('Failed to replace "%s" selector from %s:\n%s',
                              selector, spider_name, e)
 
+        # govrr: replace every link with its plain text
+        if spider_name == 'govrr':
+            try:
+                selector='a'
+                for el in html.select(selector):
+                    el.replace_with(el.get_text())
+            except Exception as e:
+                logger.error('Failed to replace "%s" selector from %s:\n%s',
+                             selector, spider_name, e)
+
+        # govto: turn #fotos into a section.gallery of figures; each caption is the first p of the block's parent
+        if spider_name == 'govto':
+            try:
+                selector = '#fotos'
+
+                for el in html.select(selector):
+
+                    section = html.new_tag('section',**{'class':'gallery'})
+                    for photo in el.select('img'):
+                        fg = html.new_tag('figure')
+                        fg.append(html.new_tag('img', src=photo['src']))
+                        fc = html.new_tag('figcaption')
+                        fc.string = el.parent.select('p')[0].get_text()
+                        fg.append(fc)
+                        section.append(fg)
+                    el.replace_with(section)
+            except Exception as e:
+                logger.error('Failed to replace "%s" selector from %s:\n%s',
+                             selector, spider_name, e)
 
         # all spiders
         try:
diff --git a/ze/spiders/govrn.py b/ze/spiders/govrn.py
new file mode 100644
index 0000000..3a5e673
--- /dev/null
+++ b/ze/spiders/govrn.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+from . import ZeSpider
+
+
+class GovernoRioGrandedoNorte(ZeSpider):
+    """Spider for rn.gov.br (Rio Grande do Norte); maps CSS selectors to ArticleItem fields."""
+
+    name = 'govrn'
+    allowed_domains = ['rn.gov.br']
+    items_refs = [{
+        "item": "ze.items.creativework.ArticleItem",
+        "fields": {
+            "name": {
+                "selectors": {
+                    "css": [
+                        "meta[property='og:title']::attr(content)",
+                        "meta[name=title]::attr(content)",
+                        '[itemprop=headline]::text',
+                        '.title-post::text',
+                        '[itemprop="headline"] a::text',
+                        'h1 a::text'
+                    ]
+                }
+            },
+            "image": {
+                "selectors": {
+                    "css": [
+                        'meta[property="og:image"]::attr(content)',
+                        '[itemprop="image"]::attr(src)',
+                        '.lazy::attr("data-lazy-src")'
+                    ]
+                }
+            },
+            "description": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=description]::attr(content)',
+                        '[itemprop=description]::text',
+                        '.entry-content h2::text',
+                        '.linha-fina::text',
+                        '.entry-content blockquote p::text',
+                        '[property="og:description"]::attr(content)'
+                    ]
+                }
+            },
+            "author": {
+                "selectors": {
+                    "css": [
+                        '[name=author]::attr(content)',
+                        '[itemprop=author]::text',
+                        '.author a::text',
+                        '.credito::text'
+                    ]
+                }
+            },
+            "datePublished": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=datePublished]::attr(content)',
+                        '.entry-date::text',
+                        '[property="article:published_time"]::attr(content)',
+                        '[itemprop="datePublished"]::attr(datetime)',
+                        '.credito span::text'
+
+                    ]
+                }
+            },
+            "dateModified": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=dateModified]::attr(content)',
+                        '[property="article:modified_time"]::attr(content)',
+                        '[itemprop="dateModified"]::attr(datetime)'
+
+                    ]
+                }
+            },
+            "articleBody": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=articleBody]',
+                        '.noticia',
+                        'article.article-main',
+                        '.Conteiner #P000'
+                    ]
+                }
+            },
+            "keywords": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=keywords] a::text',
+                        '[rel=tag]::text',
+                        '.categories a::text',
+                        '.tags a::text'
+                    ]
+                }
+            }
+        }
+    }]
diff --git a/ze/spiders/govro.py b/ze/spiders/govro.py
new file mode 100644
index 0000000..b888aa4
--- /dev/null
+++ b/ze/spiders/govro.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+from . import ZeSpider
+
+
+class GovernoRondonia(ZeSpider):
+    """Spider for ro.gov.br (Rondonia); maps CSS selectors to ArticleItem fields."""
+
+    name = 'govro'
+    allowed_domains = ['ro.gov.br']
+    items_refs = [{
+        "item": "ze.items.creativework.ArticleItem",
+        "fields": {
+            "name": {
+                "selectors": {
+                    "css": [
+                        "meta[property='og:title']::attr(content)",
+                        "meta[name=title]::attr(content)",
+                        '[itemprop=headline]::text',
+                        '.title-post::text',
+                        '[itemprop="headline"] a::text',
+                        'h1 a::text'
+                    ]
+                }
+            },
+            "image": {
+                "selectors": {
+                    "css": [
+                        'meta[property="og:image"]::attr(content)',
+                        '[itemprop="image"]::attr(src)',
+                        '.lazy::attr("data-lazy-src")'
+                    ]
+                }
+            },
+            "description": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=description]::attr(content)',
+                        '[itemprop=description]::text',
+                        '.entry-content h2::text',
+                        '.linha-fina::text',
+                        '.entry-content blockquote p::text',
+                        '[property="og:description"]::attr(content)'
+                    ]
+                }
+            },
+            "author": {
+                "selectors": {
+                    "css": [
+                        '[name=author]::attr(content)',
+                        '[itemprop=author]::text',
+                        '.author a::text',
+                        '.credito::text'
+                    ]
+                }
+            },
+            "datePublished": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=datePublished]::attr(content)',
+                        '.entry-date::text',
+                        '[property="article:published_time"]::attr(content)',
+                        '[itemprop="datePublished"]::attr(datetime)',
+                        '.data span::text'
+
+                    ]
+                }
+            },
+            "dateModified": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=dateModified]::attr(content)',
+                        '[property="article:modified_time"]::attr(content)',
+                        '[itemprop="dateModified"]::attr(datetime)'
+
+                    ]
+                }
+            },
+            "articleBody": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=articleBody]',
+                        '.noticia',
+                        'article.article-main',
+                        '.entry'
+                    ]
+                }
+            },
+            "keywords": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=keywords] a::text',
+                        '[rel=tag]::text',
+                        '.categories a::text',
+                        '.tags a::text'
+                    ]
+                }
+            }
+        }
+    }]
diff --git a/ze/spiders/govrr.py b/ze/spiders/govrr.py
new file mode 100644
index 0000000..c20a17b
--- /dev/null
+++ b/ze/spiders/govrr.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+from . import ZeSpider
+
+
+class GovernoRoraima(ZeSpider):
+    """Spider for rr.gov.br (Roraima); maps CSS selectors to ArticleItem fields."""
+
+    name = 'govrr'
+    allowed_domains = ['rr.gov.br']
+    items_refs = [{
+        "item": "ze.items.creativework.ArticleItem",
+        "fields": {
+            "name": {
+                "selectors": {
+                    "css": [
+                        "meta[property='og:title']::attr(content)",
+                        "meta[name=title]::attr(content)",
+                        '[itemprop=headline]::text',
+                        '.title-post::text',
+                        '[itemprop="headline"] a::text',
+                        '[itemprop="headline "]::attr(content)',
+                        '.entry-title::text'
+                    ]
+                }
+            },
+            "image": {
+                "selectors": {
+                    "css": [
+                        'meta[property="og:image"]::attr(content)',
+                        '[itemprop="image"]::attr(src)',
+                        '.lazy::attr("data-lazy-src")'
+                    ]
+                }
+            },
+            "description": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=description]::attr(content)',
+                        '[itemprop=description]::text',
+                        '.entry-content h2::text',
+                        '.linha-fina::text',
+                        '.entry-content blockquote p::text',
+                        '[property="og:description"]::attr(content)'
+                    ]
+                }
+            },
+            "author": {
+                "selectors": {
+                    "css": [
+                        '[name=author]::attr(content)',
+                        '[itemprop=author]::text',
+                        '.author a::text',
+                        '.credito::text'
+                    ]
+                }
+            },
+            "datePublished": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=datePublished]::attr(content)',
+                        '.entry-date::text',
+                        '[property="article:published_time"]::attr(content)',
+                        '[itemprop="datePublished"]::attr(datetime)',
+                        '.data span::text'
+
+                    ]
+                }
+            },
+            "dateModified": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=dateModified]::attr(content)',
+                        '[property="article:modified_time"]::attr(content)',
+                        '[itemprop="dateModified"]::attr(datetime)'
+
+                    ]
+                }
+            },
+            "articleBody": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=articleBody]',
+                        '.noticia',
+                        'article.article-main',
+                        '.td-post-content'
+                    ]
+                }
+            },
+            "keywords": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=keywords] a::text',
+                        '[rel=tag]::text',
+                        '.categories a::text',
+                        '.tags a::text'
+                    ]
+                }
+            }
+        }
+    }]
diff --git a/ze/spiders/govse.py b/ze/spiders/govse.py
new file mode 100644
index 0000000..4c87818
--- /dev/null
+++ b/ze/spiders/govse.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+from . import ZeSpider
+
+
+class GovernoSergipeSpider(ZeSpider):
+    """Spider for se.gov.br (Sergipe); maps CSS selectors to ArticleItem fields."""
+
+    name = 'govse'
+    allowed_domains = ['se.gov.br']
+    items_refs = [{
+        "item": "ze.items.creativework.ArticleItem",
+        "fields": {
+            "name": {
+                "selectors": {
+                    "css": [
+                        "meta[property='og:title']::attr(content)",
+                        "meta[name=title]::attr(content)",
+                        '[itemprop=headline]::text',
+                        '.title-post::text',
+                        '[itemprop="headline"] a::text'
+                    ]
+                }
+            },
+            "image": {
+                "selectors": {
+                    "css": [
+                        'meta[property="og:image"]::attr(content)',
+                        '[itemprop="image"]::attr(src)',
+                        '.lazy::attr("data-lazy-src")'
+                    ]
+                }
+            },
+            "description": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=description]::attr(content)',
+                        '[itemprop=description]::text',
+                        '.entry-content h2::text',
+                        '.linha-fina::text',
+                        '.entry-content blockquote p::text',
+                        '[property="og:description"]::attr(content)'
+                    ]
+                }
+            },
+            "author": {
+                "selectors": {
+                    "css": [
+                        '[name=author]::attr(content)',
+                        '[itemprop=author]::text',
+                        '.author a::text',
+                        '.date b::text'
+                    ]
+                }
+            },
+            "datePublished": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=datePublished]::attr(content)',
+                        '.entry-date::text',
+                        '[property="article:published_time"]::attr(content)',
+                        '[itemprop="datePublished"]::attr(datetime)',
+                        '#show-article [class="col-lg-9 col-md-9 col-sm-9 col-xs-12"] .mb-30 div:first-child::text'
+                    ]
+                }
+            },
+            "dateModified": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=dateModified]::attr(content)',
+                        '[property="article:modified_time"]::attr(content)',
+                        '[itemprop="dateModified"]::attr(datetime)'
+
+                    ]
+                }
+            },
+            "articleBody": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=articleBody]',
+                        '.noticia',
+                        'article.article-main',
+                        '.content'
+
+                    ]
+                }
+            },
+            "keywords": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=keywords] a::text',
+                        '[rel=tag]::text',
+                        '.categories a::text',
+                    ]
+                }
+            }
+        }
+    }]
diff --git a/ze/spiders/govto.py b/ze/spiders/govto.py
new file mode 100644
index 0000000..451e13d
--- /dev/null
+++ b/ze/spiders/govto.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+from . import ZeSpider
+
+
+class GovernoTocantins(ZeSpider):
+    """Spider for to.gov.br (Tocantins); maps CSS selectors to ArticleItem fields."""
+
+    name = 'govto'
+    allowed_domains = ['to.gov.br']
+    items_refs = [{
+        "item": "ze.items.creativework.ArticleItem",
+        "fields": {
+            "name": {
+                "selectors": {
+                    "css": [
+                        "meta[property='og:title']::attr(content)",
+                        "meta[name=title]::attr(content)",
+                        '[itemprop=headline]::text',
+                        '.title-post::text',
+                        '[itemprop="headline"] a::text',
+                        'h1 a::text'
+                    ]
+                }
+            },
+            "image": {
+                "selectors": {
+                    "css": [
+                        'meta[property="og:image"]::attr(content)',
+                        '[itemprop="image"]::attr(src)',
+                        '.img-wrapper img::attr(src)'
+                    ]
+                }
+            },
+            "description": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=description]::attr(content)',
+                        '[itemprop=description]::text',
+                        '.entry-content h2::text',
+                        '.linha-fina::text',
+                        '.entry-content blockquote p::text',
+                        '[property="og:description"]::attr(content)'
+                    ]
+                }
+            },
+            "author": {
+                "selectors": {
+                    "css": [
+                        '[name=author]::attr(content)',
+                        '[itemprop=author]::text',
+                        '.author a::text',
+                        # '.credito::text',
+                        'article>p::text'
+
+                    ]
+                }
+            },
+            "datePublished": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=datePublished]::attr(content)',
+                        '.entry-date::text',
+                        '[property="article:published_time"]::attr(content)',
+                        '[itemprop="datePublished"]::attr(datetime)',
+                        'article>p::text'
+
+                    ]
+                }
+            },
+            "dateModified": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=dateModified]::attr(content)',
+                        '[property="article:modified_time"]::attr(content)',
+                        '[itemprop="dateModified"]::attr(datetime)'
+
+                    ]
+                }
+            },
+            "articleBody": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=articleBody]',
+                        # '.noticia',
+                        'article.article-main',
+                        '.entry',
+                        'article'
+                    ]
+                }
+            },
+            "keywords": {
+                "selectors": {
+                    "css": [
+                        '[itemprop=keywords] a::text',
+                        '[rel=tag]::text',
+                        '.categories a::text',
+                        '.tags a::text'
+                    ]
+                }
+            }
+        }
+    }]