Commit

add files

gausszh committed Oct 30, 2012
0 parents commit a3c2e7d
Showing 9 changed files with 1,110 additions and 0 deletions.
4 changes: 4 additions & 0 deletions download91app.py
@@ -0,0 +1,4 @@
#coding=utf8
from pyquery import PyQuery as pq
import urllib,urllib2,cookielib
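# stub so far: imports only, no download logic in this file yet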

76 changes: 76 additions & 0 deletions downloadWebUsingRabbitmq.py
@@ -0,0 +1,76 @@
#coding=utf8
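# Worker: consumes app-page URLs from the RabbitMQ 'appurl' queue, downloads
# each page (direct connection first, local proxy as a fallback), and publishes
# the raw HTML to the 'webstring' queue for downstream processing.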

import pika,urllib2,urllib,cookielib,random,time
import threading
from Queue import Queue
q=Queue()
webBuff=Queue()
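# q: URLs waiting to be downloaded; webBuff: downloaded pages waiting to be
# published back to RabbitMQ (kept under ~100 entries to bound memory use)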
c=pika.ConnectionParameters(host='192.168.1.102')
conn=pika.BlockingConnection(c)
channel=conn.channel()
channel.queue_declare(queue='appurl')
channel.queue_declare(queue='webstring')
# two openers: a direct one, and one routed through a local proxy
# (127.0.0.1:8087, presumably a local forwarding proxy) as a fallback
proxy_support=urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
cookie_support=urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener_proxy=urllib2.build_opener(proxy_support,cookie_support,urllib2.HTTPHandler)
opener_normal=urllib2.build_opener(cookie_support,urllib2.HTTPHandler)

class downloadWeb(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
    def run(self):
        while True:
            url=q.get()  # blocks until the RabbitMQ callback queues a URL
            retF=''
            print 'Downloading %s' % url
            flag=False
            # up to 3 direct attempts with a growing back-off
            for i in range(3):
                try:
                    retF=opener_normal.open(url,timeout=5).read()
                    flag=True
                    break
                except Exception,e:
                    print 'error',e
                    time.sleep(i)
            if not flag:
                # last resort: one more attempt through the local proxy
                try:
                    retF=opener_proxy.open(url,timeout=5).read()
                    flag=True
                except Exception,e:
                    print 'error',e
            if flag:
                # wait for room in the buffer instead of silently dropping the page
                while webBuff.qsize()>=100:
                    time.sleep(1)
                webBuff.put(retF)


threadList=[]
threadNum=2
def cleanWebBuff():
    # NOTE: pika's BlockingConnection is not thread-safe; publishing from this
    # thread while the main thread consumes on the same channel is fragile
    while True:
        s=webBuff.get()  # blocks until a downloaded page is buffered
        channel.basic_publish(exchange='',routing_key='webstring',body=s)
threading.Thread(target=cleanWebBuff,args=()).start()
for i in range(threadNum):
threadList.append(downloadWeb())
for one in threadList:
one.start()
def callback(cn,method,pro,body):
    url=str(body)
    print 'Received %s' % url
    q.put(url)
    # delay the ack until the local queue drains, so the broker never hands
    # this worker more URLs than the download threads can keep up with
    while q.qsize()>3:
        time.sleep(1)
    cn.basic_ack(delivery_tag=method.delivery_tag)


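# prefetch_count=1: the broker delivers at most one unacknowledged message
# to this consumer at a time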
channel.basic_qos(prefetch_count=1)
channel.basic_consume(callback,queue='appurl')
print 'start...'
try:
    channel.start_consuming()
except Exception,e:
    # retry once if the consumer loop dies (e.g. a dropped connection)
    channel.start_consuming()
95 changes: 95 additions & 0 deletions getAllUrl.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python
#coding=utf8

#E-mail:[email protected]
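# Crawls the iTunes genre index pages (letters A-Z plus '*'), extracts every
# app page URL with an SGMLParser, and publishes each URL to the RabbitMQ
# 'appurl' queue.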
import urllib2
import re
import time
import json,pika
from sgmllib import SGMLParser
connection=pika.BlockingConnection(pika.ConnectionParameters(host='192.168.1.102'))
channel=connection.channel()
channel.queue_declare(queue='appurl')
class ListName(SGMLParser):
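    # collects app-page URLs (appurl) and the link text of each matching
    # <a> tag as the app name (appname); appnameFlag==1 while inside such a link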
def __init__(self):
SGMLParser.__init__(self)
self.appurl=[]
self.appname=[]
self.appnameFlag=''
self.tempname=''
def handle_data(self, text):
if self.appnameFlag==1:
self.tempname+=text

def start_a(self,attrs):
for n,k in attrs:
if n=='href':
if re.findall(r'\.*itunes.apple.com/.*/app.*id.*\d',k):
self.appurl.append(k)
self.appnameFlag=1
def end_a(self):
if self.appnameFlag==1:
self.appname.append(self.tempname)
self.tempname=''
self.appnameFlag=''
def geturl(homeurl,letter,page):
t='&letter=%s&page=%d' % (letter,page)
oneappurl=homeurl+t
print oneappurl
#oneappurl='http://itunes.apple.com/us/genre/ios-music/id6011?mt=8&letter=A&page=1'
returl=[]
retname=[]
    while True:
        try:
            returnfile=urllib2.urlopen(oneappurl)
            content = returnfile.read()
            #print content
            returnfile.close()
            listname = ListName()
            listname.feed(content)
            retname=listname.appname
            returl=listname.appurl
        except Exception,e:
            # retry on errno 10054 (connection reset by peer), give up otherwise
            if getattr(getattr(e,'reason',None),'errno',None)==10054:
                time.sleep(1)
                continue
            break
        break
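    # hand every discovered app URL to the download workers via RabbitMQ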
for one in returl:
channel.basic_publish(exchange='',routing_key='appurl',body=one)
return (returl,retname)
def main(homeurl):
returl=[]#http://itunes.apple.com/us/genre/ios-games/id6014?mt=8&letter=A&page=26
retname=[]
#homeurl='http://itunes.apple.com/us/genre/ios-games/id6014?mt=8'
    for i in range(65,91):# chr(65)..chr(90) == 'A'..'Z'; the '*' index is crawled in the loop below
        page=1
letter=chr(i)
while True:
(appurl,appname)=geturl(homeurl,letter,page)
if len(appurl)<=1:
break
page+=1
print 'page%s ok' % page
returl+=appurl
retname+=appname
page=1
while True:
(appurl,appname)=geturl(homeurl,'*',page)
if len(appurl)<=1:
break
page+=1
returl+=appurl
retname+=appname
return (returl,retname)
if __name__=='__main__':
(a,b)=main('http://itunes.apple.com/cn/genre/ios-xiao-lu/id6007?mt=8')

# urlfilename='cn/'+'tu-shuappurl.txt'
# namefilename='cn/'+'tu-shuappname.txt'
# urlfile=open(urlfilename,'w')
# namefile=open(namefilename,'w')
# a=json.dumps(a)
# b=json.dumps(b)
# print >>urlfile,a
# print >>namefile,b
30 changes: 30 additions & 0 deletions getAllUrl_91.py
@@ -0,0 +1,30 @@
#coding=utf8
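# Pages through a 91.com album listing until an empty page is reached,
# collecting each app's detail-page URL and dumping the list to appurl.txt as JSON.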
from pyquery import PyQuery as pq
import urllib,urllib2,cookielib,json,os
from Queue import Queue
urlQueue=[]# NOTE: a plain list despite the Queue import; only one thread appends to it
def getUrl(homeUrl):
    '''homeUrl looks like http://app.91.com/Soft/iPhone/album/旅游/2690_%d_4, where %d is the page number'''
global urlQueue
i=1
while True:
url=homeUrl % i
i+=1
d=pq(url)
table=d('#AlbumList')
td=table('td')
if len(td)==0:
break
for j in range(len(td)):
onetd=td.eq(j)
aNode=onetd('a')
urlQueue.append(aNode.attr('href'))
print url
return urlQueue
if __name__=='__main__':
    getUrl('http://app.91.com/soft/iPhone/album/摄影/4886_%d_5')# other categories (e.g. 旅游 travel, 导航 navigation) are crawled the same way
folder='91app/摄影'
os.system('mkdir -p '+folder)
jsonstruct=json.dumps(urlQueue)
urlfile=open(folder+'/appurl.txt','w')
print >>urlfile,jsonstruct
105 changes: 105 additions & 0 deletions getDownloadUrl_91.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python
#coding=utf8
#collect by category; multi-threaded; more robust error handling (network connection errors)
#E-mail:[email protected]
import urllib2,re,json
import cookielib,urllib
import getAllUrl_91
from sgmllib import SGMLParser
from Queue import Queue
import threading
from pyquery import PyQuery as pq
import random,time
proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)

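# ThinkSNS API endpoints (oauth tokens embedded in the query string) used to
# persist scraped app details, reviews, screenshots and download info to MySQL
# on 192.168.1.104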
saveAppdetailInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
saveReviewsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
saveScreenshortsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
saveDownloadAppInfo='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveDownloadAppInfo&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
isExistTheId_91='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=isExistTheId_91&id_91=%s&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'

urlQueue=Queue()
logQueue=Queue()
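# a pool of User-Agent strings; one is picked at random per request so the
# scraper's traffic looks less uniform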
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
]
rootUrl='http://app.91.com'
L='摄影'# category being crawled: 摄影 (photography)
folder='91app/'+L
def main(homeUrl):
    # NOTE: the homeUrl argument is currently unused; URLs are read from appurl.txt
    urlfile=open(folder+'/appurl.txt','r')
appurlList=json.load(urlfile)
for one in appurlList:
urlQueue.put(one)
threadList=[]
threadNum=5
for i in range(threadNum):
threadList.append(threading.Thread(target=parserHtmlGetDownloadUrl,args=()))
for i in range(threadNum):
threadList[i].start()
for i in range(threadNum):
threadList[i].join()
def parserHtmlGetDownloadUrl():
while not urlQueue.empty():
url=urlQueue.get()
agent = random.choice(user_agents)
download_url=id_91=url_91=url_apple=id_apple=version=name=content=''
try:
opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
content = opener_proxy.open(url,timeout=5).read()
        except Exception,e:
            # one more try through the proxy with a longer timeout
            content = opener_proxy.open(url,timeout=10).read()
if not content:
continue
d=pq(content)
try:
detail=d('.soft_detail_h3')
name=detail('h3').text()
version=re.findall('[\d\.]+',detail('span').text())[0]
download=d('div.soft_detail_btn')
link=download('a')
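            # the detail page has several buttons; pick out the iTunes link
            # and the direct "download to computer" link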
for j in range(len(link)):
onea=link.eq(j)
                if onea.attr('title')==u'iTunes \u4e0b\u8f7d':# u'iTunes 下载' = "iTunes download"
url_apple=onea.attr('href')
id_apple=re.findall('\d{5,}',url_apple)[0]
                if onea.text()==u'\u4e0b\u8f7d\u5230\u7535\u8111':# u'下载到电脑' = "download to the computer"
download_url=rootUrl+onea.attr('href')
id_91=re.findall('\d{5,10}',download_url)[0]
            if not download_url:
                # fallback: some pages only expose the .ipa/.zip link inside inline JavaScript
                script=d('script').text()
                download_url=re.findall(r'http://app.91.com/soft/download/.+?\.[ipaz]{3}',script)[0]
                id_91=re.findall('\d{5,10}',download_url)[0]
count=urllib2.urlopen(isExistTheId_91 % id_91).read()
if count == '0':
returnid=urllib2.urlopen(saveDownloadAppInfo,urllib.urlencode({'url_apple':url_apple,'id_apple':id_apple,'id_91':id_91,\
'url_91':url,'download_url':download_url,'version':version,'name':name,'category':L,'download_link':"<a href='%s'>%s</a>" % (download_url,download_url)})).read()
print url,returnid
            else:
                print 'already exists:',name,url
        except Exception,e:
            # log the failing URL for a later retry ('link' may be unbound here,
            # so it is not safe to inspect)
            print str(e),url
            logQueue.put(url)

if __name__=='__main__':
main('34')
t=[]
while not logQueue.empty():
t.append(logQueue.get())
jsonstruct=json.dumps(t)
urlfile=open(folder+'/log.txt','w')
print >>urlfile,jsonstruct