-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathshiwan_category.py
41 lines (37 loc) · 1.33 KB
/
shiwan_category.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/python
#coding=utf8
#author:gausszh
#E-mail:[email protected]
#2012-09-25
from pyquery import PyQuery as pq
import re,urllib2,cookielib
# Handler that sends HTTP requests through a proxy at 127.0.0.1:8087.
proxy_support = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'})
# A single in-memory cookie jar shared by both openers below.
cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
# Opener that uses the explicit local proxy handler plus the shared cookies.
opener_proxy = urllib2.build_opener(
    proxy_support, cookie_support, urllib2.HTTPHandler)
# Opener built without the explicit local proxy handler; used by
# download_one_subcategory.
opener_normal = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
# Exclusive upper bound on the page numbers tried by download_one_subcategory.
MAXINF = 999999
# Matches "id" followed by at least seven digits.  Written as a raw string so
# the backslash reaches the regex engine instead of being treated as a
# (deprecated, unrecognised) string escape.
reg = re.compile(r'id\d{7,}')
def download_one_subcategory(beginUrl):
    """Print the numeric app id of every entry in one listing subcategory.

    Requests the paginated listing pages of ``beginUrl`` one after another
    (page 1, 2, ...) through ``opener_normal`` and inspects every
    ``div.dir_ido`` element on each page.  For an entry whose HTML contains
    "id" followed by at least seven digits, those digits are printed;
    otherwise the entry's raw HTML is printed so it can be inspected.

    Paging stops at the first page whose leading entry has no link containing
    a run of three or more digits (this includes a page with no entries at
    all), or when the page counter reaches ``MAXINF``.  A page that cannot be
    downloaded is reported and skipped.

    beginUrl -- subcategory URL without a trailing slash, like
                "http://apple.shiwan.com/list/cat-371"
    """
    pageUrl = beginUrl + '/pf-0/price-0/age-0/ft-0/ps-0/order-0/p-%d'
    # A plain counter avoids materialising a ~1M element list, which is what
    # range(1, MAXINF) does on Python 2.
    page = 1
    while page < MAXINF:
        theUrl = pageUrl % page
        page += 1
        try:
            content = opener_normal.open(theUrl, timeout=5).read()
        except Exception as e:
            print('%s %s' % (str(e), theUrl))
            # Nothing was downloaded for this page: skip it rather than fall
            # through with `content` unbound (first page) or still holding
            # the previous page's HTML.
            continue
        d = pq(content)
        dir_ido = d('div.dir_ido')
        # attr() yields None when there is no entry or no link; treat that
        # like "no numeric link" so an empty page ends the crawl cleanly.
        first_href = dir_ido.eq(0)('a').attr('href') or ''
        if not re.search(r'\d{3,}', first_href):
            break
        for index in range(len(dir_ido)):
            # html() can be None for an empty element; normalise to '' so the
            # regex search below cannot raise TypeError.
            entry_html = dir_ido.eq(index).html() or ''
            matches = reg.findall(entry_html)
            if matches:
                # Drop the leading "id" and keep only the digits.
                app_id = matches[0][2:]
                print(app_id)
            else:
                print(entry_html)
if __name__ == '__main__':
    # Script entry point: crawl one sample subcategory and print its app ids.
    download_one_subcategory('http://apple.shiwan.com/list/cat-371')