-
Notifications
You must be signed in to change notification settings - Fork 0
/
n_scrape_urllib.py
executable file
·131 lines (111 loc) · 4.26 KB
/
n_scrape_urllib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#! /usr/bin/env python
# -*- coding:utf-8 -*-
from functools import wraps
import sys
import time
import urllib2
import lxml.html
# Base URL that is prefixed (with a '/') to the href of a '>' pagination link
# when building the address of the next page to scrape.
_DOMAIN_URL = 'http://hoge.n.com'
# Seconds to pause after every image download attempt.
TIME_SLEEP = 0.8
class NScrape(object):
    """Walk paginated pages and download images newer than a known one.

    Starting from a target page, ``scrape`` follows "next page" links until
    it meets the image URL stored in ``latest_image_url`` (or runs out of
    pages), then downloads the collected images while unwinding, so the
    pages visited last are downloaded first.

    Attributes:
        latest_image_url: URL at which collecting stops.
        new_latest_image_url: last URL of the most recently processed,
            non-empty page; intended as the next run's stop marker.
        image_number: count of download calls that returned without raising.
        error_list: URLs whose download raised an exception.
        dler: downloader object; must provide ``download(url)`` and
            ``dcimg_download(url)``.
    """

    def __init__(self):
        self.latest_image_url = ''
        self.new_latest_image_url = ''
        self.image_number = 0
        self.error_list = []
        self.dler = None
        # scrape() recurses once per visited page, so the interpreter-wide
        # recursion limit is raised here (process-global side effect).
        sys.setrecursionlimit(100000)  # recursion limit

    def _decorator(explain):
        """Build a decorator that frames a call's console output.

        Writes ``==> <explain>`` followed by a newline and a tab before the
        wrapped call, and a 46-character dashed rule after it returns. The
        wrapped function's return value is passed through to the caller.
        """
        def _deco(function):
            @wraps(function)
            def _wrapper(*args, **kw):
                # Same bytes as the Python 2 statement
                # ``print '==>', explain, '\n\t',`` (no trailing newline).
                sys.stdout.write('==> %s \n\t' % (explain,))
                result = function(*args, **kw)
                print('-' * 46)
                return result
            return _wrapper
        return _deco

    @_decorator('Latest image URL')
    def set_latest_image_url(self, url):
        """Set both the stop marker and the new-latest marker to *url*."""
        self.latest_image_url = url
        self.new_latest_image_url = url
        print(self.latest_image_url)

    def set_dler(self, dler):
        """Attach the downloader object used by ``scrape``."""
        self.dler = dler

    def get_new_latest_image_url(self):
        """Return the URL recorded as the newest image handled so far."""
        return self.new_latest_image_url

    @staticmethod
    def _is_wanted_src(src):
        """Return True if *src* looks like a JPEG/PNG worth downloading.

        Rejects a missing src, anything containing 'gif' or 'n46_list'
        (case-sensitive), and anything without a .jpeg/.jpg/.png marker
        (case-insensitive).
        """
        if src is None or 'gif' in src or 'n46_list' in src:
            return False
        lowered = src.lower()
        return any(ext in lowered for ext in ('.jpeg', '.jpg', '.png'))

    def get_image_urls(self, img_tags):
        """Collect image URLs from *img_tags* up to the known latest image.

        For each ``<img>`` that has a parent element, the candidate URL is
        the parent's href when the parent is an ``<a>`` whose href contains
        'dcimg' and is not collected yet; otherwise it is the img's own src
        if that passes ``_is_wanted_src``. Scanning stops at the first
        candidate equal to ``self.latest_image_url``.

        Returns:
            ``(last, image_urls)``: *last* is True when the latest image was
            met; *image_urls* holds the unique candidates seen before it, in
            reverse document order.
        """
        last = False
        image_urls = []
        for img_tag in img_tags:
            parent_tag = img_tag.getparent()
            if parent_tag is None:
                continue
            href = ''
            if parent_tag.tag == 'a':
                href = parent_tag.attrib.get('href', '')
            if 'dcimg' in href and href not in image_urls:
                candidate = href
            elif self._is_wanted_src(img_tag.attrib.get('src')):
                # Also reached when a dcimg href was already collected: the
                # img's own src is then judged on its own merits.
                candidate = img_tag.attrib['src']
            else:
                continue
            if candidate == self.latest_image_url:
                last = True
                break
            if candidate not in image_urls:
                image_urls.append(candidate)
        image_urls.reverse()
        return last, image_urls

    def get_next_page(self, a_tags):
        """Return the URL of the next page to visit, or '' if none is found.

        The first anchor whose text is '>' wins; its href is joined to
        ``_DOMAIN_URL``. Failing that, the href of the last anchor with
        class 'prev' is returned as-is (presumably already absolute --
        TODO confirm against the site markup). Anchors without an href are
        ignored instead of raising KeyError.
        """
        next_month = ''
        for a_tag in a_tags:
            href = a_tag.attrib.get('href')
            if not href:
                continue
            if a_tag.text == u'>':
                return _DOMAIN_URL + '/' + href
            if a_tag.attrib.get('class') == 'prev':
                next_month = href
        return next_month

    @_decorator('Target page')
    def scrape(self, target_page):
        """Scrape *target_page*, recursing into older pages first.

        Fetches and parses the page, and if the latest known image is not on
        it, recurses into the next page before downloading this page's
        images. When no further page exists (or the next link points back to
        this page), the walk stops here instead of requesting an empty URL,
        so everything collected so far is still downloaded.

        Each download is best-effort: a failing URL is appended to
        ``self.error_list`` and the loop continues. Errors from fetching or
        parsing the page itself propagate to the caller.
        """
        print(target_page)
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = urllib2.Request(target_page, headers=hdr)
        response = urllib2.urlopen(req)
        try:
            target_html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        root = lxml.html.fromstring(target_html)
        img_tags = root.cssselect('img')
        a_tags = root.cssselect('a')
        last, image_urls = self.get_image_urls(img_tags)
        if not last:
            next_page = self.get_next_page(a_tags)
            if next_page and next_page != target_page:
                self.scrape(next_page)
            else:
                print('No further page to follow; stopping here')
                print(target_page)
        else:
            # Same bytes as ``print 'Found latst image URL', '-' * 24, '\n'``.
            print('Found latst image URL %s \n' % ('-' * 24))
            print(target_page)
        for image_url in image_urls:
            try:
                if 'dcimg' in image_url:
                    self.dler.dcimg_download(image_url)
                else:
                    self.dler.download(image_url)
                self.image_number += 1
            except Exception:
                # Record the failure and carry on; KeyboardInterrupt and
                # SystemExit are deliberately not swallowed.
                self.error_list.append(image_url)
            time.sleep(TIME_SLEEP)
        if image_urls:
            self.new_latest_image_url = image_urls[-1]