Commit

add files

gausszh committed Oct 30, 2012
0 parents commit a3c2e7d
Showing 9 changed files with 1,110 additions and 0 deletions.
4 changes: 4 additions & 0 deletions download91app.py
@@ -0,0 +1,4 @@
#coding=utf8
from pyquery import PyQuery as pq
import urllib,urllib2,cookielib
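# stub so far: imports only, no download logic in this file yet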

76 changes: 76 additions & 0 deletions downloadWebUsingRabbitmq.py
@@ -0,0 +1,76 @@
#coding=utf8
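# Worker: consumes app-page URLs from the RabbitMQ 'appurl' queue, downloads
# each page (direct connection first, local proxy as a fallback), and publishes
# the raw HTML to the 'webstring' queue for downstream processing.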

import pika,urllib2,urllib,cookielib,random,time
import threading
from Queue import Queue
q=Queue()
webBuff=Queue()
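# q: URLs waiting to be downloaded; webBuff: downloaded pages waiting to be
# published back to RabbitMQ (kept under ~100 entries to bound memory use)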
c=pika.ConnectionParameters(host='192.168.1.102')
conn=pika.BlockingConnection(c)
channel=conn.channel()
channel.queue_declare(queue='appurl')
channel.queue_declare(queue='webstring')
# two openers: a direct one, and one routed through a local proxy
# (127.0.0.1:8087, presumably a local forwarding proxy) as a fallback
proxy_support=urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
cookie_support=urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener_proxy=urllib2.build_opener(proxy_support,cookie_support,urllib2.HTTPHandler)
opener_normal=urllib2.build_opener(cookie_support,urllib2.HTTPHandler)

class downloadWeb(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
    def run(self):
        while True:
            url=q.get()  # blocks until the RabbitMQ callback queues a URL
            retF=''
            print 'Downloading %s' % url
            flag=False
            # up to 3 direct attempts with a growing back-off
            for i in range(3):
                try:
                    retF=opener_normal.open(url,timeout=5).read()
                    flag=True
                    break
                except Exception,e:
                    print 'error',e
                    time.sleep(i)
            if not flag:
                # last resort: one more attempt through the local proxy
                try:
                    retF=opener_proxy.open(url,timeout=5).read()
                    flag=True
                except Exception,e:
                    print 'error',e
            if flag:
                # wait for room in the buffer instead of silently dropping the page
                while webBuff.qsize()>=100:
                    time.sleep(1)
                webBuff.put(retF)


threadList=[]
threadNum=2
def cleanWebBuff():
    # NOTE: pika's BlockingConnection is not thread-safe; publishing from this
    # thread while the main thread consumes on the same channel is fragile
    while True:
        s=webBuff.get()  # blocks until a downloaded page is buffered
        channel.basic_publish(exchange='',routing_key='webstring',body=s)
threading.Thread(target=cleanWebBuff,args=()).start()
for i in range(threadNum):
threadList.append(downloadWeb())
for one in threadList:
one.start()
def callback(cn,method,pro,body):
    url=str(body)
    print 'Received %s' % url
    q.put(url)
    # delay the ack until the local queue drains, so the broker never hands
    # this worker more URLs than the download threads can keep up with
    while q.qsize()>3:
        time.sleep(1)
    cn.basic_ack(delivery_tag=method.delivery_tag)


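# prefetch_count=1: the broker delivers at most one unacknowledged message
# to this consumer at a time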
channel.basic_qos(prefetch_count=1)
channel.basic_consume(callback,queue='appurl')
print 'start...'
try:
    channel.start_consuming()
except Exception,e:
    # retry once if the consumer loop dies (e.g. a dropped connection)
    channel.start_consuming()
95 changes: 95 additions & 0 deletions getAllUrl.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python
#coding=utf8

#E-mail:[email protected]
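# Crawls the iTunes genre index pages (letters A-Z plus '*'), extracts every
# app page URL with an SGMLParser, and publishes each URL to the RabbitMQ
# 'appurl' queue.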
import urllib2
import re
import time
import json,pika
from sgmllib import SGMLParser
connection=pika.BlockingConnection(pika.ConnectionParameters(host='192.168.1.102'))
channel=connection.channel()
channel.queue_declare(queue='appurl')
class ListName(SGMLParser):
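    # collects app-page URLs (appurl) and the link text of each matching
    # <a> tag as the app name (appname); appnameFlag==1 while inside such a link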
def __init__(self):
SGMLParser.__init__(self)
self.appurl=[]
self.appname=[]
self.appnameFlag=''
self.tempname=''
def handle_data(self, text):
if self.appnameFlag==1:
self.tempname+=text

def start_a(self,attrs):
for n,k in attrs:
if n=='href':
if re.findall(r'\.*itunes.apple.com/.*/app.*id.*\d',k):
self.appurl.append(k)
self.appnameFlag=1
def end_a(self):
if self.appnameFlag==1:
self.appname.append(self.tempname)
self.tempname=''
self.appnameFlag=''
def geturl(homeurl,letter,page):
t='&letter=%s&page=%d' % (letter,page)
oneappurl=homeurl+t
print oneappurl
#oneappurl='http://itunes.apple.com/us/genre/ios-music/id6011?mt=8&letter=A&page=1'
returl=[]
retname=[]
    while True:
        try:
            returnfile=urllib2.urlopen(oneappurl)
            content = returnfile.read()
            #print content
            returnfile.close()
            listname = ListName()
            listname.feed(content)
            retname=listname.appname
            returl=listname.appurl
        except Exception,e:
            # retry on errno 10054 (connection reset by peer), give up otherwise
            if getattr(getattr(e,'reason',None),'errno',None)==10054:
                time.sleep(1)
                continue
            break
        break
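    # hand every discovered app URL to the download workers via RabbitMQ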
for one in returl:
channel.basic_publish(exchange='',routing_key='appurl',body=one)
return (returl,retname)
def main(homeurl):
returl=[]#http://itunes.apple.com/us/genre/ios-games/id6014?mt=8&letter=A&page=26
retname=[]
#homeurl='http://itunes.apple.com/us/genre/ios-games/id6014?mt=8'
    for i in range(65,91):# chr(65)..chr(90) == 'A'..'Z'; the '*' index is crawled in the loop below
        page=1
letter=chr(i)
while True:
(appurl,appname)=geturl(homeurl,letter,page)
if len(appurl)<=1:
break
page+=1
print 'page%s ok' % page
returl+=appurl
retname+=appname
page=1
while True:
(appurl,appname)=geturl(homeurl,'*',page)
if len(appurl)<=1:
break
page+=1
returl+=appurl
retname+=appname
return (returl,retname)
if __name__=='__main__':
(a,b)=main('http://itunes.apple.com/cn/genre/ios-xiao-lu/id6007?mt=8')

# urlfilename='cn/'+'tu-shuappurl.txt'
# namefilename='cn/'+'tu-shuappname.txt'
# urlfile=open(urlfilename,'w')
# namefile=open(namefilename,'w')
# a=json.dumps(a)
# b=json.dumps(b)
# print >>urlfile,a
# print >>namefile,b
30 changes: 30 additions & 0 deletions getAllUrl_91.py
@@ -0,0 +1,30 @@
#coding=utf8
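# Pages through a 91.com album listing until an empty page is reached,
# collecting each app's detail-page URL and dumping the list to appurl.txt as JSON.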
from pyquery import PyQuery as pq
import urllib,urllib2,cookielib,json,os
from Queue import Queue
urlQueue=[]# NOTE: a plain list despite the Queue import; only one thread appends to it
def getUrl(homeUrl):
    '''homeUrl looks like http://app.91.com/Soft/iPhone/album/旅游/2690_%d_4, where %d is the page number'''
global urlQueue
i=1
while True:
url=homeUrl % i
i+=1
d=pq(url)
table=d('#AlbumList')
td=table('td')
if len(td)==0:
break
for j in range(len(td)):
onetd=td.eq(j)
aNode=onetd('a')
urlQueue.append(aNode.attr('href'))
print url
return urlQueue
if __name__=='__main__':
    getUrl('http://app.91.com/soft/iPhone/album/摄影/4886_%d_5')# other categories (e.g. 旅游 travel, 导航 navigation) are crawled the same way
folder='91app/摄影'
os.system('mkdir -p '+folder)
jsonstruct=json.dumps(urlQueue)
urlfile=open(folder+'/appurl.txt','w')
print >>urlfile,jsonstruct
105 changes: 105 additions & 0 deletions getDownloadUrl_91.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python
#coding=utf8
#collect by category; multi-threaded; more robust error handling (network connection errors)
#E-mail:[email protected]
import urllib2,re,json
import cookielib,urllib
import getAllUrl_91
from sgmllib import SGMLParser
from Queue import Queue
import threading
from pyquery import PyQuery as pq
import random,time
proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)

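# ThinkSNS API endpoints (oauth tokens embedded in the query string) used to
# persist scraped app details, reviews, screenshots and download info to MySQL
# on 192.168.1.104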
saveAppdetailInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
saveReviewsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
saveScreenshortsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
saveDownloadAppInfo='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveDownloadAppInfo&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
isExistTheId_91='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=isExistTheId_91&id_91=%s&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'

urlQueue=Queue()
logQueue=Queue()
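# a pool of User-Agent strings; one is picked at random per request so the
# scraper's traffic looks less uniform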
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
]
rootUrl='http://app.91.com'
L='摄影'# category being crawled: 摄影 (photography)
folder='91app/'+L
def main(homeUrl):
    # NOTE: the homeUrl argument is currently unused; URLs are read from appurl.txt
    urlfile=open(folder+'/appurl.txt','r')
appurlList=json.load(urlfile)
for one in appurlList:
urlQueue.put(one)
threadList=[]
threadNum=5
for i in range(threadNum):
threadList.append(threading.Thread(target=parserHtmlGetDownloadUrl,args=()))
for i in range(threadNum):
threadList[i].start()
for i in range(threadNum):
threadList[i].join()
def parserHtmlGetDownloadUrl():
while not urlQueue.empty():
url=urlQueue.get()
agent = random.choice(user_agents)
download_url=id_91=url_91=url_apple=id_apple=version=name=content=''
try:
opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
content = opener_proxy.open(url,timeout=5).read()
        except Exception,e:
            # one more try through the proxy with a longer timeout
            content = opener_proxy.open(url,timeout=10).read()
if not content:
continue
d=pq(content)
try:
detail=d('.soft_detail_h3')
name=detail('h3').text()
version=re.findall('[\d\.]+',detail('span').text())[0]
download=d('div.soft_detail_btn')
link=download('a')
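            # the detail page has several buttons; pick out the iTunes link
            # and the direct "download to computer" link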
for j in range(len(link)):
onea=link.eq(j)
                if onea.attr('title')==u'iTunes \u4e0b\u8f7d':# u'iTunes 下载' = "iTunes download"
url_apple=onea.attr('href')
id_apple=re.findall('\d{5,}',url_apple)[0]
                if onea.text()==u'\u4e0b\u8f7d\u5230\u7535\u8111':# u'下载到电脑' = "download to the computer"
download_url=rootUrl+onea.attr('href')
id_91=re.findall('\d{5,10}',download_url)[0]
            if not download_url:
                # fallback: some pages only expose the .ipa/.zip link inside inline JavaScript
                script=d('script').text()
                download_url=re.findall(r'http://app.91.com/soft/download/.+?\.[ipaz]{3}',script)[0]
                id_91=re.findall('\d{5,10}',download_url)[0]
count=urllib2.urlopen(isExistTheId_91 % id_91).read()
if count == '0':
returnid=urllib2.urlopen(saveDownloadAppInfo,urllib.urlencode({'url_apple':url_apple,'id_apple':id_apple,'id_91':id_91,\
'url_91':url,'download_url':download_url,'version':version,'name':name,'category':L,'download_link':"<a href='%s'>%s</a>" % (download_url,download_url)})).read()
print url,returnid
            else:
                print 'already exists:',name,url
        except Exception,e:
            # log the failing URL for a later retry ('link' may be unbound here,
            # so it is not safe to inspect)
            print str(e),url
            logQueue.put(url)

if __name__=='__main__':
main('34')
t=[]
while not logQueue.empty():
t.append(logQueue.get())
jsonstruct=json.dumps(t)
urlfile=open(folder+'/log.txt','w')
print >>urlfile,jsonstruct