From a3c2e7d123bfa73344bb84989ecd8d86fce19f68 Mon Sep 17 00:00:00 2001
From: gausszh
Date: Tue, 30 Oct 2012 17:19:27 +0800
Subject: [PATCH] add files

---
 download91app.py               |   4 +
 downloadWebUsingRabbitmq.py    |  76 ++++++
 getAllUrl.py                   |  95 ++++++++
 getAllUrl_91.py                |  30 +++
 getDownloadUrl_91.py           | 105 +++++++++
 html_parser.py                 | 410 +++++++++++++++++++++++++++++++++
 improvingWebCrawlingByQuery.py | 230 ++++++++++++++++++
 shiwan_category.py             |  41 ++++
 usingRabbitmqAndPyquery.py     | 119 ++++++++++
 9 files changed, 1110 insertions(+)
 create mode 100644 download91app.py
 create mode 100644 downloadWebUsingRabbitmq.py
 create mode 100644 getAllUrl.py
 create mode 100644 getAllUrl_91.py
 create mode 100644 getDownloadUrl_91.py
 create mode 100755 html_parser.py
 create mode 100644 improvingWebCrawlingByQuery.py
 create mode 100644 shiwan_category.py
 create mode 100644 usingRabbitmqAndPyquery.py

diff --git a/download91app.py b/download91app.py
new file mode 100644
index 0000000..5c95818
--- /dev/null
+++ b/download91app.py
@@ -0,0 +1,4 @@
+#coding=utf8
+from pyquery import PyQuery as pq
+import urllib,urllib2,cookielib
+
diff --git a/downloadWebUsingRabbitmq.py b/downloadWebUsingRabbitmq.py
new file mode 100644
index 0000000..09b9272
--- /dev/null
+++ b/downloadWebUsingRabbitmq.py
@@ -0,0 +1,76 @@
+#coding=utf8
+
+import pika,urllib2,urllib,cookielib,random,time
+import threading
+from Queue import Queue
+q=Queue()
+webBuff=Queue()
+c=pika.ConnectionParameters(host='192.168.1.102')
+conn=pika.BlockingConnection(c)
+channel=conn.channel()
+channel.queue_declare(queue='appurl')
+channel.queue_declare(queue='webstring')
+proxy_suport=urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_suport=urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy=urllib2.build_opener(proxy_suport,cookie_suport,urllib2.HTTPHandler)
+opener_normal=urllib2.build_opener(cookie_suport,urllib2.HTTPHandler)
+
+class downloadWeb(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+    def run(self):
+        global q
+        while True:
+            while not q.empty():
+                url=q.get()
+                retF=''
+                print 'Downloading %s' % url
+                flag=False
+                # three direct attempts with a growing back-off, then one
+                # final attempt through the local proxy
+                for i in range(3):
+                    try:
+                        retF=opener_normal.open(url,timeout=5).read()
+                        flag=True
+                        break
+                    except Exception,e:
+                        print 'error',e
+                        time.sleep(i)
+                if not flag:
+                    try:
+                        retF=opener_proxy.open(url,timeout=5).read()
+                        flag=True
+                    except Exception,e:
+                        print 'error',e
+                if flag:
+                    # buffer the page even when the buffer is full; the wait
+                    # below applies back-pressure instead of dropping the fetch
+                    webBuff.put(retF)
+                while webBuff.qsize()>=100:
+                    time.sleep(1)
+            time.sleep(1)# avoid a busy spin while the URL queue is empty
+
+
+threadList=[]
+threadNum=2
+def cleanWebBuff():
+    # pika connections are not thread-safe, so this publisher thread opens
+    # its own connection instead of sharing `channel` with the consumer below
+    pubchannel=pika.BlockingConnection(pika.ConnectionParameters(host='192.168.1.102')).channel()
+    while True:
+        while not webBuff.empty():
+            s=webBuff.get()
+            pubchannel.basic_publish(exchange='',routing_key='webstring',body=s)
+        time.sleep(1)
+threading.Thread(target=cleanWebBuff,args=()).start()
+for i in range(threadNum):
+    threadList.append(downloadWeb())
+for one in threadList:
+    one.start()
+def callback(cn,method,pro,body):
+    url=str(body)
+    print 'Received %s' % url
+    q.put(url)
+    while q.qsize()>3:
+        time.sleep(1)
+    cn.basic_ack(delivery_tag=method.delivery_tag)
+
+
+channel.basic_qos(prefetch_count=1)
+channel.basic_consume(callback,queue='appurl')
+print 'start...'
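+# start_consuming() blocks this thread; the single retry below assumes a
+# transient broker hiccup (pika raises once the underlying socket drops)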
+try:
+    channel.start_consuming()
+except Exception,e:
+    channel.start_consuming()
\ No newline at end of file
diff --git a/getAllUrl.py b/getAllUrl.py
new file mode 100644
index 0000000..ee9e532
--- /dev/null
+++ b/getAllUrl.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+#coding=utf8
+
+#E-mail:gauss.zh@gmail.com
+import urllib2
+import re
+import time
+import json,pika
+from sgmllib import SGMLParser
+connection=pika.BlockingConnection(pika.ConnectionParameters(host='192.168.1.102'))
+channel=connection.channel()
+channel.queue_declare(queue='appurl')
+class ListName(SGMLParser):
+    '''Collects app detail-page links and their anchor text (the app name)
+    from an iTunes genre index page.'''
+    def __init__(self):
+        SGMLParser.__init__(self)
+        self.appurl=[]
+        self.appname=[]
+        self.appnameFlag=''
+        self.tempname=''
+    def handle_data(self, text):
+        if self.appnameFlag==1:
+            self.tempname+=text
+    def start_a(self,attrs):
+        for n,k in attrs:
+            if n=='href':
+                if re.findall(r'\.*itunes.apple.com/.*/app.*id.*\d',k):
+                    self.appurl.append(k)
+                    self.appnameFlag=1
+    def end_a(self):
+        if self.appnameFlag==1:
+            self.appname.append(self.tempname)
+            self.tempname=''
+            self.appnameFlag=''
+def geturl(homeurl,letter,page):
+    t='&letter=%s&page=%d' % (letter,page)
+    oneappurl=homeurl+t
+    print oneappurl
+    #oneappurl='http://itunes.apple.com/us/genre/ios-music/id6011?mt=8&letter=A&page=1'
+    returl=[]
+    retname=[]
+    while True:
+        try:
+            returnfile=urllib2.urlopen(oneappurl)
+            content = returnfile.read()
+            #print content
+            returnfile.close()
+            listname = ListName()
+            listname.feed(content)
+            retname=listname.appname
+            returl=listname.appurl
+        except Exception,e:
+            # errno 10054 is "connection reset by peer": wait and retry;
+            # give up on anything else (or on non-URLError exceptions)
+            if getattr(getattr(e,'reason',None),'errno',None)==10054:
+                time.sleep(1)
+                continue
+            break
+        break
+    for one in returl:
+        channel.basic_publish(exchange='',routing_key='appurl',body=one)
+    return (returl,retname)
+def main(homeurl):
+    returl=[]#http://itunes.apple.com/us/genre/ios-games/id6014?mt=8&letter=A&page=26
+    retname=[]
+    #homeurl='http://itunes.apple.com/us/genre/ios-games/id6014?mt=8'
+    for i in range(65,91):#A-Z; the '*' index is handled afterwards
+        page=1#65 66 67 68 69 70
+        letter=chr(i)
+        while True:
+            (appurl,appname)=geturl(homeurl,letter,page)
+            if len(appurl)<=1:
+                break
+            page+=1
+            print 'page%s ok' % page
+            returl+=appurl
+            retname+=appname
+    page=1
+    while True:
+        (appurl,appname)=geturl(homeurl,'*',page)
+        if len(appurl)<=1:
+            break
+        page+=1
+        returl+=appurl
+        retname+=appname
+    return (returl,retname)
+if __name__=='__main__':
+    (a,b)=main('http://itunes.apple.com/cn/genre/ios-xiao-lu/id6007?mt=8')
+
+#    urlfilename='cn/'+'tu-shuappurl.txt'
+#    namefilename='cn/'+'tu-shuappname.txt'
+#    urlfile=open(urlfilename,'w')
+#    namefile=open(namefilename,'w')
+#    a=json.dumps(a)
+#    b=json.dumps(b)
+#    print >>urlfile,a
+#    print >>namefile,b
\ No newline at end of file
diff --git a/getAllUrl_91.py b/getAllUrl_91.py
new file mode 100644
index 0000000..4fcb73f
--- /dev/null
+++ b/getAllUrl_91.py
@@ -0,0 +1,30 @@
+#coding=utf8
+from pyquery import PyQuery as pq
+import urllib,urllib2,cookielib,json,os
+from Queue import Queue
+urlQueue=[]
+def getUrl(homeUrl):
+    '''homeUrl is a page-number pattern like http://app.91.com/Soft/iPhone/album/旅游/2690_%d_4 (%d is the page index)'''
+    global urlQueue
+    i=1
+    while True:
+        url=homeUrl % i
+        i+=1
+        d=pq(url)# pyquery fetches the URL itself
+        table=d('#AlbumList')
+        td=table('td')
+        if len(td)==0:# an empty album table means we ran past the last page
+            break
+        for j in range(len(td)):
+            onetd=td.eq(j)
+            aNode=onetd('a')
+            urlQueue.append(aNode.attr('href'))
+        print url
+    return urlQueue
+if __name__=='__main__':
+    getUrl('http://app.91.com/soft/iPhone/album/摄影/4886_%d_5')# also used for the 旅游 (travel) and 导航 (navigation) albums
+    folder='91app/摄影'
+    os.system('mkdir -p '+folder)
+    jsonstruct=json.dumps(urlQueue)
+    urlfile=open(folder+'/appurl.txt','w')
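+    # persist the collected detail-page URLs as JSON; getDownloadUrl_91.py
+    # reads this file back from the same folder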
+    print >>urlfile,jsonstruct
\ No newline at end of file
diff --git a/getDownloadUrl_91.py b/getDownloadUrl_91.py
new file mode 100644
index 0000000..588ec3a
--- /dev/null
+++ b/getDownloadUrl_91.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#coding=utf8
+#Collects apps by category; multi-threaded, with fuller error handling
+#(network connection failures)
+#E-mail:gauss.zh@gmail.com
+import urllib2,re,json
+import cookielib,urllib
+import getAllUrl_91
+from sgmllib import SGMLParser
+from Queue import Queue
+import threading
+from pyquery import PyQuery as pq
+import random,time
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+
+saveAppdetailInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveDownloadAppInfo='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveDownloadAppInfo&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+isExistTheId_91='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=isExistTheId_91&id_91=%s&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+urlQueue=Queue()
+logQueue=Queue()
+user_agents = [
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+    'Opera/9.25 (Windows NT 5.1; U; en)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
+    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
+    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
+]
+rootUrl='http://app.91.com'
+L='摄影'# category label: "photography"
+folder='91app/'+L
+def main(homeUrl):
+
+    urlfile=open(folder+'/appurl.txt','r')
+    appurlList=json.load(urlfile)
+    for one in appurlList:
+        urlQueue.put(one)
+    threadList=[]
+    threadNum=5
+    for i in range(threadNum):
+        threadList.append(threading.Thread(target=parserHtmlGetDownloadUrl,args=()))
+    for i in range(threadNum):
+        threadList[i].start()
+    for i in range(threadNum):
+        threadList[i].join()
+def parserHtmlGetDownloadUrl():
+
+    aflag=True
+    while not urlQueue.empty():
+        url=urlQueue.get()
+        agent = random.choice(user_agents)
+        download_url=id_91=url_91=url_apple=id_apple=version=name=content=''
+        try:
+            opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+            content = opener_proxy.open(url,timeout=5).read()
+        except Exception,e:
+            # one retry with a longer timeout; skip this URL if it fails again
+            try:
+                content = opener_proxy.open(url,timeout=10).read()
+            except Exception,e:
+                logQueue.put(url)
+                continue
+        if not content:
+            continue
+        d=pq(content)
+        try:
+            detail=d('.soft_detail_h3')
+            name=detail('h3').text()
+            version=re.findall('[\d\.]+',detail('span').text())[0]
+            download=d('div.soft_detail_btn')
+            link=download('a')
+            for j in range(len(link)):
+                onea=link.eq(j)
+                if onea.attr('title')==u'iTunes \u4e0b\u8f7d':#"iTunes download"
+                    url_apple=onea.attr('href')
+                    id_apple=re.findall('\d{5,}',url_apple)[0]
+                if onea.text()==u'\u4e0b\u8f7d\u5230\u7535\u8111':#"download to PC"
+                    download_url=rootUrl+onea.attr('href')
+                    id_91=re.findall('\d{5,10}',download_url)[0]
+            if not download_url:
+                # fall back to the download link embedded in the page scripts
+                script=d('script').text()
+                download_url=re.findall(r'http://app.91.com/soft/download/.+?\.[ipaz]{3}',script)[0]
+                id_91=re.findall('\d{5,10}',download_url)[0]
+            count=urllib2.urlopen(isExistTheId_91 % id_91).read()
+            if count == '0':
+                # the anchor markup in download_link was stripped from this
+                # patch in transit; the tag below is a reconstruction
+                returnid=urllib2.urlopen(saveDownloadAppInfo,urllib.urlencode({'url_apple':url_apple,'id_apple':id_apple,'id_91':id_91,\
+                    'url_91':url,'download_url':download_url,'version':version,'name':name,'category':L,\
+                    'download_link':"<a href='%s'>%s</a>" % (download_url,download_url)})).read()
+                print url,returnid
+            else:
+                print 'Already exists:',name,url_91
+        except Exception,e:
+            print str(e),url
+            print link.text()
+            logQueue.put(url)
+
+if __name__=='__main__':
+    main('34')
+    t=[]
+    while not logQueue.empty():
+        t.append(logQueue.get())
+    jsonstruct=json.dumps(t)
+    urlfile=open(folder+'/log.txt','w')
+    print >>urlfile,jsonstruct
\ No newline at end of file
diff --git a/html_parser.py b/html_parser.py
new file mode 100755
index 0000000..8af2250
--- /dev/null
+++ b/html_parser.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python
+#coding=utf8
+#Collects apps by category; multi-threaded, with fuller error handling
+#(network connection failures)
+#E-mail:gauss.zh@gmail.com
+import urllib2,re,json
+import cookielib,urllib
+import getAllUrl
+from sgmllib import SGMLParser
+from Queue import Queue
+import threading
+from pyquery import PyQuery as pq
+import random,time
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+
+MONTH={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
+
+saveAppdetailInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+user_agents = [
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+    'Opera/9.25 (Windows NT 5.1; U; en)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
+    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
+    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
+]
+
+urlpat=re.compile(r'\.*itunes.apple.com/us/app.*id.*\d')
+category=''
+folder=''
+subcategory=''
+queue=Queue()
+errorqueue=Queue()#Mozilla/5.0 (X11; Linux x86_64; rv:10.0.5) Gecko/20120606 Firefox/10.0.5
+headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'}
+class ListName(SGMLParser):
+    def __init__(self):
+        SGMLParser.__init__(self)
+        self.infoFlag = ""
+        self.appurl=[]
+        self.appname=[]
+        self.descriptionFlag=''
+        self.descriptionPFlag=''
+        self.description=[]
+        self.appnameFlag=''
+        self.info=[]
+        self.iphonescreenshots=[]
+        self.ipadscreenshots=[]
+        self.whatisnewFlag=''
+        self.whatisnew=[]
+        self.allstartFlag=''
+        self.allstart=[]
+        self.customerstartFlag=''
+        self.customerstart=[]
+        self.customerFlag=''
+        self.customer=[]
+        self.iphonescreenshotsFlag=''
+        self.ipadscreenshotsFlag=''
+        self.nameFlag=''
+        self.name=''
+        self.icon=''
+    def handle_data(self, text):
+        if self.infoFlag == 1:
+            self.info.append(text)
+        if self.appnameFlag==1:
+            self.appname.append(text)
+            self.appnameFlag=''
+        if self.descriptionFlag==1 and self.descriptionPFlag==1:
+            self.description.append(text)
+        if self.customerFlag==1:
+            self.customer.append(text)
+        if self.whatisnewFlag==1:
+            self.whatisnew.append(text)
+        if self.nameFlag==1:
+            self.name+=text
+    def start_p(self,attrs):
+        if self.descriptionFlag==1:
+            self.descriptionPFlag=1
+    def end_p(self):
+        self.descriptionFlag=''
+        self.descriptionPFlag=''
+    def start_img(self,attrs):
+        for n,k in attrs:
+            if (k=='landscape' or k=='portrait'):
+                if self.ipadscreenshotsFlag==1:
+                    self.ipadscreenshots.append(attrs[2][1])
+                if self.iphonescreenshotsFlag==1:
+                    self.iphonescreenshots.append(attrs[2][1])
+            if n=='height' and int(k)>=150:
+                self.icon=attrs[4][1]
+    def start_a(self,attrs):
+        for n,k in attrs:
+            if n=='href':
+                if re.findall(r'\.*itunes.apple.com/us/app.*id.*\d',k):
+                    self.appurl.append(k)
+                    self.appnameFlag=1
+    def start_div(self,attrs):
+        for n,k in attrs:
+            if n=='metrics-loc' and k=='Titledbox_Description':#http://itunes.apple.com/us/genre/ios-games/id6014?mt=8
+                self.descriptionFlag=1#http://itunes.apple.com/us/app/angry-birds/id343200656?mt=8
+            if n=='id' and k=='left-stack':
+                self.infoFlag=1
+                self.customerFlag=''
+            if n=='metrics-loc':
+                if re.findall(r"What's New",k):
+                    self.whatisnewFlag=1
+            if n=='class' and k=='rating' and self.infoFlag==1:
+                self.allstart.append(attrs[3][1])
+                self.customerstartFlag=1
+                self.allstartFlag=1
+            if n=='class' and k=='customer-reviews':
+                self.customerFlag=1
+            if n=='class' and k=='rating' and self.customerFlag==1:
+                self.customerstart.append(attrs[3][1])
+            if n=='class':
+                if re.findall(r'iphone-screen-shots',k):
+                    self.iphonescreenshotsFlag=1
+                    self.ipadscreenshotsFlag=''
+            if n=='class':
+                if re.findall(r'ipad-screen-shots',k):
+                    self.ipadscreenshotsFlag=1
+                    self.iphonescreenshotsFlag=''
+    def start_h1(self,attrs):
+        self.nameFlag=1
+    def end_h1(self):
+        self.nameFlag=''
+    def end_div(self):
+        self.whatisnewFlag=''
+def simplify(lt):
+    """Drop junk entries such as '\n' and ' ' from a list."""
+    ret=[]
+    for one in lt:
+        temp=one.strip()
+        if temp:
+            ret.append(temp)
+    return ret
+
+def main(rootUrl):
+    global folder,subcategory
+    try:
+        homeurl=rootUrl
+        (a,b)=getAllUrl.main(homeurl)#ios-abc
+        t=re.findall(r'ios-.*/',homeurl)
+
+        folder=t[0][4:-1]
+#        subcategory=t[0][4:-1]
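+        # the output folder is the genre slug minus its 'ios-' prefix,
+        # e.g. 'ios-music/' yields 'music'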
+        urlfilename=folder+'appurl.txt'
+        namefilename=folder+'appname.txt'
+        urlfile=open(urlfilename,'w')
+        namefile=open(namefilename,'w')
+        a=json.dumps(a)
+        b=json.dumps(b)
+        print >>urlfile,a
+        print >>namefile,b
+        urlfile.close()
+        namefile.close()
+
+#        global queue
+#        logfile=open('F_starerrornameandurl.txt','r')
+#        log=logfile.readlines()
+#        logfile.close()
+#        for one in log:
+#            url=re.findall(r'http.*mt=8',one)[0]
+#            n='null'
+#            queue.put((n,url))
+
+        appnamefile=open(namefilename,'r')
+        appname=json.load(appnamefile)
+        appurlfile=open(urlfilename,'r')
+        appurl=json.load(appurlfile)
+        appnamefile.close()
+        appurlfile.close()
+        for i in range(len(appname)):
+            n=appname[i]
+            u=appurl[i]
+            queue.put((n,u))# (name, url) tuple
+        templatefile=open('template.xml','r')
+        template=''.join(templatefile.readlines())
+        templatefile.close()
+    except Exception,e:
+        print str(e)
+    threadlist=[]
+#    while True:
+#        (name,url)=queue.get()
+#        tmpid=re.findall('/id\d*',url)[0][3:]
+#        if tmpid=='476005657':
+#            break
+    for i in range(10):
+        add=folder+'/thread%d__' % i
+        threadlist.append(threading.Thread(target=threaddownload,args=(add,template,)))
+    for i in range(len(threadlist)):
+        threadlist[i].start()
+    for i in range(len(threadlist)):
+        threadlist[i].join()
+#    threaddownload('musicData/123/logF_starthreadfive',template,)
+
+def threaddownload(filename,template):
+    global queue
+    global errorqueue
+    timeoutcounts=0
+    outputFile=open(filename+'.xml','w')
+    # NOTE: the XML open/close tags written to the output files were stripped
+    # from this patch; '<apps>'/'</apps>' below are reconstructed placeholders
+    print >>outputFile,'<apps>\n'
+    i=-1
+    counts=0
+    while True:
+        i+=1
+        if queue.empty():
+            break
+        try:
+            (name,url)=queue.get()
+            while True:
+                try:
+#                    req = urllib2.Request(url = "http://itunes.apple.com/us/a5pp/numbers/id361304891?mt=8",headers = headers)
+                    agent = random.choice(user_agents)
+                    opener_normal.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_normal.open(url,timeout=5).read()
+                    #returnfile=urllib2.urlopen(url=url,timeout=5)
+#                    content=returnfile.read()
+#                    returnfile.close()
+                    timeoutcounts=0
+                    break
+                except Exception,e:
+                    if timeoutcounts<5:
+                        timeoutcounts+=1
+                        time.sleep(timeoutcounts)
+                    else:
+                        print name,url
+                        print str(e)
+                        errorqueue.put((name,url))
+                        break
+            if timeoutcounts>=5:
+                try:
+                    agent = random.choice(user_agents)
+                    opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_proxy.open(url,timeout=5).read()
+                    print 'proxy'
+                except Exception,e:
+                    timeoutcounts=0
+                    continue
+            listname=ListName()
+            listname.feed(content)
+            d=pq(content)
+#            name=re.findall(r'app/.*/id',appurl[i])[0]
+#            name=name[4:-3]
+            name=listname.name
+            info={}
+            j=0
+            tmpinfo=simplify(listname.info[0:50])
+
+            info['price']='Free'
+            for one in tmpinfo:
+                if re.findall(r'\$\d*',one):
+                    info['price']=one
+                if one in ['Category:','Released','Updated:','Version:','Size:','Languages:','Language:',\
+                    'Seller:','Requirements:','Released:','All Versions:','Current Version:']:
+                    info[one]=tmpinfo[j+1]
+                if one=='Seller:':
+                    info['copyright:']=tmpinfo[j+2]
+                    info['app_rating:']=tmpinfo[j+3]
+                    info['reasons:']=tmpinfo[j+4]
+                j+=1
+#            if locals().has_key('category'):
+#                if category!=info['Category:']:
+#                    continue
+#            else:
+            category=info['Category:']
+            icon=listname.icon
+            id=str(re.findall(r'[\d]{5,}',url)[0])
+            customer=simplify(listname.customer)
+            price=info['price']
+            if info.has_key('Released:'):
+                Updated=info['Released:']
+            elif info.has_key('Updated:'):
+                Updated=info['Updated:']
+            version=info['Version:']
+            size=info['Size:']
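+            # label spelling varies per page ('Language:' vs 'Languages:'),
+            # so both keys are probed below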
+            languages=''
+            if info.has_key('Languages:'):
+                languages=info['Languages:']
+            elif info.has_key('Language:'):
+                languages=info['Language:']
+
+            seller=info['Seller:']
+            copyright=info['copyright:']
+            app_rating=info['app_rating:']
+            reason=info['reasons:']
+            requirements=info['Requirements:']
+            stars1=rating_counts1=''
+            tmpi=0
+            if info.has_key('Current Version:'):
+                t=listname.allstart[tmpi].split(',')
+                tmpi+=1
+                stars1=t[0]
+                rating_counts1=info['Current Version:']
+            stars2=rating_counts2=''
+            if info.has_key('All Versions:'):
+                t=listname.allstart[tmpi].split(',')
+                stars2=t[0]
+                rating_counts2=info['All Versions:']
+            description=''.join(listname.description)
+
+            t=simplify(listname.whatisnew[3:])
+            whats_new=''.join(t)
+            iphonescreenshot=''
+            screenshotDictList=[]
+            for one in listname.iphonescreenshots:
+                iphonescreenshot+='<screenshot>%s</screenshot>\n' % one#xml fragment; the tag name is reconstructed, the original was stripped from the patch
+                # save in mysql through ThinkSNS
+                #########################
+                screenshotDictList.append({'screenshots':one,'aid':id})
+
+            ipadscreenshot=''
+            for one in listname.ipadscreenshots:
+                ipadscreenshot+='<screenshot>%s</screenshot>\n' % one#xml fragment; tag name reconstructed
+                # save in mysql through ThinkSNS
+                #############################
+                screenshotDictList.append({'screenshots':one,'aid':id})
+            usernameindex=[]
+            for k in range(len(customer)):
+                if re.findall(r'by\n',customer[k]):
+                    usernameindex.append(k)
+            review=[]
+            reviewDictList=[]
+            for k in range(len(usernameindex)):
+                if k!=len(usernameindex)-1:
+                    t=usernameindex[k+1]-1
+                else:
+                    t=usernameindex[k]+2
+                # review XML template: only the CDATA wrapper survived in the
+                # patch; the surrounding tags/placeholders are reconstructed
+                s='<review>\n<user name="%s"/>\n<rating value="%s"/>\n<title value="%s"/>\n<content><![CDATA[%s]]></content>\n</review>\n' % \
+                    (re.findall(r'.*$',customer[usernameindex[k]])[0].strip(),listname.customerstart[k],customer[usernameindex[k]-1],''.join(customer[usernameindex[k]+1:t]))
+                #save review in mysql through ThinkSNS
+                ##########################
+                reviewDictList.append({'aid':id,'uname':re.findall(r'.*$',customer[usernameindex[k]])[0].strip(),'rating':\
+                    listname.customerstart[k],'title':customer[usernameindex[k]-1],'content':''.join(customer[usernameindex[k]+1:t])})
+
+                review.append(s)
+
+            review=''.join(review)#s = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', str)
+            output=template % (id,icon,url,name, price,category,subcategory,Updated,version,size,languages,seller,copyright,app_rating,reason,requirements,\
+                stars1,rating_counts1,stars2,rating_counts2,description,whats_new,iphonescreenshot,ipadscreenshot,review)
+            #save appdetail in mysql through ThinkSNS
+            pydes=d('.product-review').eq(0)
+            pydes=pydes.html()#Jan 04, 2011
+            Updated=Updated.strip()
+            updateTime=Updated[-4:]+'-'+str(MONTH[Updated[:3]])+'-'+Updated[4:6]
+            appdetailData=urllib.urlencode({'aid':id,'appurl':url,'name':name,'icon_url':icon,'price':price,'category':category,\
+                'subcategory':subcategory,'updated':updateTime,'version':version,'size':size,'languages':\
+                languages,'seller':seller,'copyright':copyright,'des':app_rating,'reason':reason,'requirement':requirements,\
+                'cstars':stars1,'crating_count':rating_counts1,'stars':stars2,'rating_count':rating_counts2,\
+                'description':pydes,'whatsnew':whats_new})
+##            APPDETAIL
+            aid=urllib2.urlopen(saveAppdetailInMysqlByUrl,appdetailData).read()
+            for one in reviewDictList:
+                one['aid']=aid
+                #review
+                urllib2.urlopen(saveReviewsInMysqlByUrl,urllib.urlencode(one))
+
+            for one in screenshotDictList:
+                one['aid']=aid
+                urllib2.urlopen(saveScreenshortsInMysqlByUrl,urllib.urlencode(one))
+
+            print '%s %dth over!' % (threading.currentThread().getName(),i)
+            output = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', output)
+            print >>outputFile,output
+            counts+=1
+            if counts==10:
+                counts=0
+                print >>outputFile,'</apps>'
+                outputFile.close()
+                tempfilename='%s%d.xml' % (filename,i)
+                outputFile=open(tempfilename,'w')
+                print >>outputFile,'<apps>\n'
+        except Exception,e:
+            print str(e)
+            print url
+            print info
+            errorqueue.put((name,url))
+            print >>outputFile,'</apps>'
+            outputFile.close()
+            tempfilename='%s%d.xml' % (filename,i)
+            outputFile=open(tempfilename,'w')
+            print >>outputFile,'<apps>\n'
+    if not counts==0:
+        print >>outputFile,'</apps>'
+    outputFile.close()
+#    outputFile.close()
+if __name__=='__main__':
+    s=['http://itunes.apple.com/us/genre/ios-music/id6011?mt=8']
+    for one in s:
+        main(one)
+    errorfile=open('log%s.txt' % subcategory,'w')
+    tempqueue=[]
+    while not errorqueue.empty():
+        tempqueue.append(errorqueue.get())
+    t=json.dumps(tempqueue)
+    print >>errorfile,t
+    print '***********ok**********'
diff --git a/improvingWebCrawlingByQuery.py b/improvingWebCrawlingByQuery.py
new file mode 100644
index 0000000..790216c
--- /dev/null
+++ b/improvingWebCrawlingByQuery.py
@@ -0,0 +1,230 @@
+#!/usr/bin/python27
+#coding=UTF-8
+#author:gausszh
+#e-mail:gausszh@gmail.com
+
+import urllib2,re,json,os
+import cookielib,urllib
+import getAllUrl
+from Queue import Queue
+import threading
+from pyquery import PyQuery as pq
+import random,time
+
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+
+saveAppdetailInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+user_agents = [
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+    'Opera/9.25 (Windows NT 5.1; U; en)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
+    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
+    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
+]
+queue=Queue()
+errorqueue=Queue()
+def getC(node):
+    # the value follows the label's colon; the cn store uses a full-width
+    # colon (U+FF1A), so both separators are tried
+    try:
+        return node.text().split(':')[1].strip()
+    except Exception,e:
+        return node.text().split(u'\uff1a')[1].strip()
+def main(rootUrl):
+    global folder,subcategory
+    try:
+        homeurl=rootUrl
+#        (a,b)=getAllUrl.main(homeurl)#ios-abc
+        t=re.findall(r'ios-.*/',homeurl)
+
+        folder='cn/'+t[0][4:-1]
+        os.system('mkdir -p '+folder)
+#        subcategory=t[0][4:-1]
+        urlfilename=folder+'appurl.txt'
+        namefilename=folder+'appname.txt'
+#        urlfile=open(urlfilename,'w')
+#        namefile=open(namefilename,'w')
+#        a=json.dumps(a)
+#        b=json.dumps(b)
+#        print >>urlfile,a
+#        print >>namefile,b
+#        urlfile.close()
+#        namefile.close()
+
+        appnamefile=open(namefilename,'r')
+        appname=json.load(appnamefile)
+        appurlfile=open(urlfilename,'r')
+        appurl=json.load(appurlfile)
+        appnamefile.close()
+        appurlfile.close()
+        for i in range(len(appname)):
+            n=appname[i]
+            u=appurl[i]
+            queue.put((n,u))# (name, url) tuple
+        templatefile=open('template.xml','r')
+        template=''.join(templatefile.readlines())
+        templatefile.close()
+    except Exception,e:
+        print str(e)
+    threadlist=[]
+#    while True:
+#        (name,url)=queue.get()
+#        tmpid=re.findall('/id\d*',url)[0][3:]
+#        if tmpid=='476005657':
+#            break
+    for i in range(9):
+        add=folder+'/thread%d__' % i
+        threadlist.append(threading.Thread(target=threaddownload,args=(add,template,)))
+    for i in range(len(threadlist)):
+        threadlist[i].start()
+    for i in range(len(threadlist)):
+        threadlist[i].join()
+#    threaddownload('musicData/123/logF_starthreadfive',template,)
+
+def threaddownload(filename,template):
+    global queue
+    global errorqueue
+    timeoutcounts=0
+    outputFile=open(filename+'.xml','w')
+    print >>outputFile,'<apps>\n'# '<apps>'/'</apps>' reconstructed; the original tags were stripped from the patch
+    cases=-1
+    counts=0
+    while True:
+        cases+=1
+        if queue.empty():
+            break
+        try:
+            (name,url)=queue.get()
+            while True:
+                try:
+#                    req = urllib2.Request(url = "http://itunes.apple.com/us/a5pp/numbers/id361304891?mt=8",headers = headers)
+                    agent = random.choice(user_agents)
+                    opener_normal.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_normal.open(url,timeout=5).read()
+                    timeoutcounts=0
+                    break
+                except Exception,e:
+                    if timeoutcounts<5:
+                        timeoutcounts+=1
+                        time.sleep(timeoutcounts)
+                    else:
+                        break
+            if timeoutcounts>=5:
+                try:
+                    agent = random.choice(user_agents)
+                    opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_proxy.open(url,timeout=5).read()
+                    print 'proxy'
+                    timeoutcounts=0
+                except Exception,e:
+                    print name,url
+                    print str(e)
+                    errorqueue.put((name,url))
+                    timeoutcounts=0
+                    continue
+            d=pq(content)
+            name=d('h1').text()
+            id=re.findall('\d{5,}',url)[0]
+            ls=d('#left-stack')
+            lschildren=ls.children()
+            detail=lschildren.eq(0)
+            price=detail('.price').text()
+            icon=detail('img').attr('src')
+            detail_ul=detail('ul')
+            categoryN=detail_ul('.genre')
+            dateN=categoryN.next()
+            versionN=dateN.next()
+            sizeN=versionN.next()
+            languagesN=sizeN.next()
+            sellerN=languagesN.next()
+            copyrightN=sellerN.next()
+            ratingN=detail_ul.next()
+            requirementsN=ratingN.next()
+            stars1=rating_counts1=stars2=rating_counts2=''
+            list_customer_ratingN=detail.next()
+            customer_ratingN=list_customer_ratingN('.rating')
+            if len(customer_ratingN)==2:
+                aria_label=customer_ratingN.eq(0).attr('aria-label')
+                aria_label=re.findall('[\d\.]+',aria_label)
+                stars1=aria_label[0]
+                rating_counts1=aria_label[1]
+                aria_label=customer_ratingN.eq(1).attr('aria-label')
+                aria_label=re.findall('[\d\.]+',aria_label)
+                stars2=aria_label[0]
+                rating_counts2=aria_label[1]
+            elif len(customer_ratingN)==1:
+                pN=customer_ratingN.eq(0).prev()# previous sibling node
+                aria_label=customer_ratingN.eq(0).attr('aria-label')
+                aria_label=re.findall('[\d\.]+',aria_label)
+                if pN.text().find(u'\u5f53')!=-1:# '当' from '当前版本' (Current Version); find() returns -1 when absent, which is truthy
+                    stars1=aria_label[0]
+                    rating_counts1=aria_label[1]
+                else:
+                    stars2=aria_label[0]
+                    rating_counts2=aria_label[1]
+            description=d('.product-review').eq(0)
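+            # the first .product-review block is the app description; the
+            # second one, taken below, is the "What's New" section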
+            descriptionData=description.html()
+            descriptionXml=description.text()
+            whats_new=d('.product-review').eq(1)
+            whats_newData=whats_new.html()
+            whats_newXml=whats_new.text()
+            date='-'.join(re.findall('\d+',getC(dateN)))
+            output=template % (id,url,name,getC(categoryN),'',descriptionXml,whats_newXml)
+            appdetailData=urllib.urlencode({'aid':id,'appurl':url,'name':name,'icon_url':icon,'price':price,'category':getC(categoryN),\
+                'subcategory':'','updated':date,'version':getC(versionN),'size':getC(sizeN),'languages':\
+                getC(languagesN),'seller':getC(sellerN),'copyright':copyrightN.text(),'des':ratingN('a').text(),\
+                'reason':ratingN('ul').text(),'requirement':getC(requirementsN),'cstars':stars1,'crating_count':rating_counts1,\
+                'stars':stars2,'rating_count':rating_counts2,'description':descriptionData,'whatsnew':whats_newData})
+            aid=urllib2.urlopen(saveAppdetailInMysqlByUrl,appdetailData).read()
+            reviewN=d('.customer-review')
+            for i in range(len(reviewN)):
+                oneN=reviewN.eq(i)
+                s=oneN('.rating').attr('aria-label')
+                c_rating=re.findall('[\d\.]+',s)[0]
+                data={'aid':aid,'uname':getC(oneN('.user-info')),'rating':c_rating,'title':oneN('.customerReviewTitle').text(),'content':oneN('.content').text()}
+                urllib2.urlopen(saveReviewsInMysqlByUrl,urllib.urlencode(data))
+            screenshotsN=d('.swoosh.lockup-container.application.large.screenshots')('img')
+            for i in range(len(screenshotsN)):
+                urllib2.urlopen(saveScreenshortsInMysqlByUrl,urllib.urlencode({'screenshots':screenshotsN.eq(i).attr('src'),'aid':aid}))
+            print '%s %dth over!' % (threading.currentThread().getName(),cases)
+            output = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', output)
+            print >>outputFile,output
+            counts+=1
+            if counts==1000:
+                counts=0
+                print >>outputFile,'</apps>'
+                outputFile.close()
+                tempfilename='%s%d.xml' % (filename,cases)# use the page counter, not the leftover inner-loop index i
+                outputFile=open(tempfilename,'w')
+                print >>outputFile,'<apps>\n'
+        except Exception,e:
+            print str(e)
+            print url
+            errorqueue.put((name,url))
+            print >>outputFile,'</apps>'
+            outputFile.close()
+            tempfilename='%s%d.xml' % (filename,cases)
+            outputFile=open(tempfilename,'w')
+            print >>outputFile,'<apps>\n'
+    if not counts==0:
+        print >>outputFile,'</apps>'
+    outputFile.close()
+#    outputFile.close()
+if __name__=='__main__':
+    s=['http://itunes.apple.com/cn/genre/ios-tu-shu/id6018?mt=8']
+    for one in s:
+        main(one)
+    errorfile=open('%slog.txt' % folder,'w')
+    tempqueue=[]
+    while not errorqueue.empty():
+        tempqueue.append(errorqueue.get())
+    t=json.dumps(tempqueue)
+    print >>errorfile,t
+    print '***********ok**********'
\ No newline at end of file
diff --git a/shiwan_category.py b/shiwan_category.py
new file mode 100644
index 0000000..f0d00c5
--- /dev/null
+++ b/shiwan_category.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+#coding=utf8
+#author:gausszh
+#E-mail:gauss.zh@gmail.com
+#2012-09-25
+from pyquery import PyQuery as pq
+import re,urllib2,cookielib
+
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+MAXINF=999999
+reg=re.compile('id\d{7,}')
+
+def download_one_subcategory(beginUrl):
+    """beginUrl like "http://apple.shiwan.com/list/cat-371" """
+
+    beginUrl+='/pf-0/price-0/age-0/ft-0/ps-0/order-0/p-%d'
+    for i in range(1,MAXINF):
+        theUrl=beginUrl % i
+        try:
+            content=opener_normal.open(theUrl,timeout=5).read()
+        except Exception,e:
+            print str(e),theUrl
+            continue# without this, a failed fetch would re-parse stale (or undefined) content
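+        # stop condition: a page past the last one still renders, but its
+        # first album link carries no long numeric id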
+        d=pq(content)
+        dir_ido=d('div.dir_ido')
+        if not re.findall('\d{3,}',dir_ido.eq(0)('a').attr('href')):
+            break
+        for alink in range(len(dir_ido)):
+            div=dir_ido.eq(alink)
+            s=div.html()
+            regid=reg.findall(s)
+            if regid:
+                id=regid[0][2:]# drop the 'id' prefix, keep the numeric part
+                print id
+            else:
+                print s
+if __name__=='__main__':
+    download_one_subcategory('http://apple.shiwan.com/list/cat-371')
+
\ No newline at end of file
diff --git a/usingRabbitmqAndPyquery.py b/usingRabbitmqAndPyquery.py
new file mode 100644
index 0000000..de256b0
--- /dev/null
+++ b/usingRabbitmqAndPyquery.py
@@ -0,0 +1,119 @@
+#coding=utf8
+
+import pika,urllib2,urllib,cookielib,random,time
+import re
+from pyquery import PyQuery as pq
+from Queue import Queue
+saveAppdetailInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+htmlStringQ=Queue()
+c=pika.ConnectionParameters(host='192.168.1.102')
+conn=pika.BlockingConnection(c)
+channel=conn.channel()
+channel.queue_declare(queue='appurl')
+channel.queue_declare(queue='webstring')
+COUNTS=1
+XML_COUNTS=1
+templatefile=open('template.xml','r')
+template=''.join(templatefile.readlines())
+outputFile=open('samsung.xml','w')
+print >>outputFile,'<apps>\n'# '<apps>'/'</apps>' reconstructed; the original tags were stripped from the patch
+
+def getC(node):
+    # the value follows the label's colon; the cn store uses a full-width
+    # colon (U+FF1A), so both separators are tried
+    try:
+        return node.text().split(':')[1].strip()
+    except Exception,e:
+        return node.text().split(u'\uff1a')[1].strip()
+def convertHtmlToXmlAndDatabase(htmlString):
+    global XML_COUNTS,COUNTS,outputFile# outputFile is rebound below, so it must be declared global
+    d=pq(htmlString)
+    name=d('h1').text()
+    link=d('link')
+    url=link.attr('href')
+    print url
+    id=re.findall('\d{5,}',url)[0]
+    ls=d('#left-stack')
+    lschildren=ls.children()
+    detail=lschildren.eq(0)
+    price=detail('.price').text()
+    icon=detail('img').attr('src')
+    detail_ul=detail('ul')
+    categoryN=detail_ul('.genre')
+    dateN=categoryN.next()
+    versionN=dateN.next()
+    sizeN=versionN.next()
+    languagesN=sizeN.next()
+    sellerN=languagesN.next()
+    copyrightN=sellerN.next()
+    ratingN=detail_ul.next()
+    requirementsN=ratingN.next()
+    stars1=rating_counts1=stars2=rating_counts2=''
+    list_customer_ratingN=detail.next()
+    customer_ratingN=list_customer_ratingN('.rating')
+    if len(customer_ratingN)==2:
+        aria_label=customer_ratingN.eq(0).attr('aria-label')
+        aria_label=re.findall('[\d\.]+',aria_label)
+        stars1=aria_label[0]
+        rating_counts1=aria_label[1]
+        aria_label=customer_ratingN.eq(1).attr('aria-label')
+        aria_label=re.findall('[\d\.]+',aria_label)
+        stars2=aria_label[0]
+        rating_counts2=aria_label[1]
+    elif len(customer_ratingN)==1:
+        pN=customer_ratingN.eq(0).prev()# previous sibling node
+        aria_label=customer_ratingN.eq(0).attr('aria-label')
+        aria_label=re.findall('[\d\.]+',aria_label)
+        if pN.text().find(u'\u5f53')!=-1:# '当' from '当前版本' (Current Version); find() returns -1 when absent, which is truthy
+            stars1=aria_label[0]
+            rating_counts1=aria_label[1]
+        else:
+            stars2=aria_label[0]
+            rating_counts2=aria_label[1]
+    description=d('.product-review').eq(0)
+    descriptionData=description.html()
+    descriptionXml=description.text()
+    whats_new=d('.product-review').eq(1)
+    whats_newData=whats_new.html()
+    whats_newXml=whats_new.text()
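+    # getC(dateN) returns the localized release date; joining its digit runs
+    # normalizes e.g. 2012年10月30日 to 2012-10-30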
+    date='-'.join(re.findall('\d+',getC(dateN)))
+    output=template % (id,url,name,getC(categoryN),'',descriptionXml,whats_newXml)
+    output = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', output)
+    if XML_COUNTS==1000:
+        # rotate the output file every 1000 records
+        XML_COUNTS=0
+        print >>outputFile,'</apps>'
+        outputFile.close()
+        tempfilename='samsung%d.xml' % (COUNTS)
+        outputFile=open(tempfilename,'w')
+        print >>outputFile,'<apps>\n'
+    # the write and counter increment below are missing from the original
+    # file, which would leave the XML output empty and the rotation dead;
+    # restored to mirror the sibling scripts
+    print >>outputFile,output
+    XML_COUNTS+=1
+    appdetailData=urllib.urlencode({'aid':id,'appurl':url,'name':name,'icon_url':icon,'price':price,'category':getC(categoryN),\
+        'subcategory':'','updated':date,'version':getC(versionN),'size':getC(sizeN),'languages':\
+        getC(languagesN),'seller':getC(sellerN),'copyright':copyrightN.text(),'des':ratingN('a').text(),\
+        'reason':ratingN('ul').text(),'requirement':getC(requirementsN),'cstars':stars1,'crating_count':rating_counts1,\
+        'stars':stars2,'rating_count':rating_counts2,'description':descriptionData,'whatsnew':whats_newData})
+    aid=urllib2.urlopen(saveAppdetailInMysqlByUrl,appdetailData).read()
+    reviewN=d('.customer-review')
+    for i in range(len(reviewN)):
+        oneN=reviewN.eq(i)
+        s=oneN('.rating').attr('aria-label')
+        c_rating=re.findall('[\d\.]+',s)[0]
+        data={'aid':aid,'uname':getC(oneN('.user-info')),'rating':c_rating,'title':oneN('.customerReviewTitle').text(),'content':oneN('.content').text()}
+        urllib2.urlopen(saveReviewsInMysqlByUrl,urllib.urlencode(data))
+    screenshotsN=d('.swoosh.lockup-container.application.large.screenshots')('img')
+    for i in range(len(screenshotsN)):
+        urllib2.urlopen(saveScreenshortsInMysqlByUrl,urllib.urlencode({'screenshots':screenshotsN.eq(i).attr('src'),'aid':aid}))
+    print '%dth over!' % (COUNTS)
+    COUNTS+=1
+def callback(cn,method,pro,body):
+    htmlString=str(body)
+    htmlStringQ.put(htmlString)# buffered here but never consumed in this script
+    try:
+        convertHtmlToXmlAndDatabase(htmlString)
+    except Exception,e:
+        print str(e)
+    cn.basic_ack(delivery_tag=method.delivery_tag)# ack even on failure so a bad page is not redelivered forever
+channel.basic_qos(prefetch_count=1)
+channel.basic_consume(callback,queue='webstring')
+print 'start...'
+channel.start_consuming()
\ No newline at end of file