From a3c2e7d123bfa73344bb84989ecd8d86fce19f68 Mon Sep 17 00:00:00 2001
From: gausszh
Date: Tue, 30 Oct 2012 17:19:27 +0800
Subject: [PATCH] add files

---
 download91app.py               |   4 +
 downloadWebUsingRabbitmq.py    |  76 ++++++
 getAllUrl.py                   |  95 ++++++++
 getAllUrl_91.py                |  30 +++
 getDownloadUrl_91.py           | 105 +++++++++
 html_parser.py                 | 410 +++++++++++++++++++++++++++++++++
 improvingWebCrawlingByQuery.py | 230 ++++++++++++++++++
 shiwan_category.py             |  41 ++++
 usingRabbitmqAndPyquery.py     | 119 ++++++++++
 9 files changed, 1110 insertions(+)
 create mode 100644 download91app.py
 create mode 100644 downloadWebUsingRabbitmq.py
 create mode 100644 getAllUrl.py
 create mode 100644 getAllUrl_91.py
 create mode 100644 getDownloadUrl_91.py
 create mode 100755 html_parser.py
 create mode 100644 improvingWebCrawlingByQuery.py
 create mode 100644 shiwan_category.py
 create mode 100644 usingRabbitmqAndPyquery.py

diff --git a/download91app.py b/download91app.py
new file mode 100644
index 0000000..5c95818
--- /dev/null
+++ b/download91app.py
@@ -0,0 +1,4 @@
+#coding=utf8
+from pyquery import PyQuery as pq
+import urllib,urllib2,cookielib
+
diff --git a/downloadWebUsingRabbitmq.py b/downloadWebUsingRabbitmq.py
new file mode 100644
index 0000000..09b9272
--- /dev/null
+++ b/downloadWebUsingRabbitmq.py
@@ -0,0 +1,76 @@
+#coding=utf8
+
+import pika,urllib2,urllib,cookielib,random,time
+import threading
+from Queue import Queue
+q=Queue()
+webBuff=Queue()
+c=pika.ConnectionParameters(host='192.168.1.102')
+conn=pika.BlockingConnection(c)
+channel=conn.channel()
+channel.queue_declare(queue='appurl')
+channel.queue_declare(queue='webstring')
+proxy_suport=urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_suport=urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy=urllib2.build_opener(proxy_suport,cookie_suport,urllib2.HTTPHandler)
+opener_normal=urllib2.build_opener(cookie_suport,urllib2.HTTPHandler)
+
+class downloadWeb(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+    def run(self):
+        global q
+        while True:
+            while not q.empty():
+                url=q.get()
+                retF=''
+                print 'Downloading %s' % url
+                flag=False
+                # three direct attempts with a growing back-off, then one
+                # final attempt through the local proxy
+                for i in range(3):
+                    try:
+                        retF=opener_normal.open(url,timeout=5).read()
+                        flag=True
+                        break
+                    except Exception,e:
+                        print 'error',e
+                        time.sleep(i)
+                if not flag:
+                    try:
+                        retF=opener_proxy.open(url,timeout=5).read()
+                        flag=True
+                    except Exception,e:
+                        print 'error',e
+                if flag:
+                    # buffer the page even when the buffer is full; the wait
+                    # below applies back-pressure instead of dropping the fetch
+                    webBuff.put(retF)
+                while webBuff.qsize()>=100:
+                    time.sleep(1)
+            time.sleep(1)# avoid a busy spin while the URL queue is empty
+
+
+threadList=[]
+threadNum=2
+def cleanWebBuff():
+    # pika connections are not thread-safe, so this publisher thread opens
+    # its own connection instead of sharing `channel` with the consumer below
+    pubchannel=pika.BlockingConnection(pika.ConnectionParameters(host='192.168.1.102')).channel()
+    while True:
+        while not webBuff.empty():
+            s=webBuff.get()
+            pubchannel.basic_publish(exchange='',routing_key='webstring',body=s)
+        time.sleep(1)
+threading.Thread(target=cleanWebBuff,args=()).start()
+for i in range(threadNum):
+    threadList.append(downloadWeb())
+for one in threadList:
+    one.start()
+def callback(cn,method,pro,body):
+    url=str(body)
+    print 'Received %s' % url
+    q.put(url)
+    while q.qsize()>3:
+        time.sleep(1)
+    cn.basic_ack(delivery_tag=method.delivery_tag)
+
+
+channel.basic_qos(prefetch_count=1)
+channel.basic_consume(callback,queue='appurl')
+print 'start...'
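+# start_consuming() blocks this thread; the single retry below assumes a
+# transient broker hiccup (pika raises once the underlying socket drops)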
+try:
+    channel.start_consuming()
+except Exception,e:
+    channel.start_consuming()
\ No newline at end of file
diff --git a/getAllUrl.py b/getAllUrl.py
new file mode 100644
index 0000000..ee9e532
--- /dev/null
+++ b/getAllUrl.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+#coding=utf8
+
+#E-mail:gauss.zh@gmail.com
+import urllib2
+import re
+import time
+import json,pika
+from sgmllib import SGMLParser
+connection=pika.BlockingConnection(pika.ConnectionParameters(host='192.168.1.102'))
+channel=connection.channel()
+channel.queue_declare(queue='appurl')
+class ListName(SGMLParser):
+    '''Collects app detail-page links and their anchor text (the app name)
+    from an iTunes genre index page.'''
+    def __init__(self):
+        SGMLParser.__init__(self)
+        self.appurl=[]
+        self.appname=[]
+        self.appnameFlag=''
+        self.tempname=''
+    def handle_data(self, text):
+        if self.appnameFlag==1:
+            self.tempname+=text
+    def start_a(self,attrs):
+        for n,k in attrs:
+            if n=='href':
+                if re.findall(r'\.*itunes.apple.com/.*/app.*id.*\d',k):
+                    self.appurl.append(k)
+                    self.appnameFlag=1
+    def end_a(self):
+        if self.appnameFlag==1:
+            self.appname.append(self.tempname)
+            self.tempname=''
+            self.appnameFlag=''
+def geturl(homeurl,letter,page):
+    t='&letter=%s&page=%d' % (letter,page)
+    oneappurl=homeurl+t
+    print oneappurl
+    #oneappurl='http://itunes.apple.com/us/genre/ios-music/id6011?mt=8&letter=A&page=1'
+    returl=[]
+    retname=[]
+    while True:
+        try:
+            returnfile=urllib2.urlopen(oneappurl)
+            content = returnfile.read()
+            #print content
+            returnfile.close()
+            listname = ListName()
+            listname.feed(content)
+            retname=listname.appname
+            returl=listname.appurl
+        except Exception,e:
+            # errno 10054 is "connection reset by peer": wait and retry;
+            # give up on anything else (or on non-URLError exceptions)
+            if getattr(getattr(e,'reason',None),'errno',None)==10054:
+                time.sleep(1)
+                continue
+            break
+        break
+    for one in returl:
+        channel.basic_publish(exchange='',routing_key='appurl',body=one)
+    return (returl,retname)
+def main(homeurl):
+    returl=[]#http://itunes.apple.com/us/genre/ios-games/id6014?mt=8&letter=A&page=26
+    retname=[]
+    #homeurl='http://itunes.apple.com/us/genre/ios-games/id6014?mt=8'
+    for i in range(65,91):#A-Z; the '*' index is handled afterwards
+        page=1#65 66 67 68 69 70
+        letter=chr(i)
+        while True:
+            (appurl,appname)=geturl(homeurl,letter,page)
+            if len(appurl)<=1:
+                break
+            page+=1
+            print 'page%s ok' % page
+            returl+=appurl
+            retname+=appname
+    page=1
+    while True:
+        (appurl,appname)=geturl(homeurl,'*',page)
+        if len(appurl)<=1:
+            break
+        page+=1
+        returl+=appurl
+        retname+=appname
+    return (returl,retname)
+if __name__=='__main__':
+    (a,b)=main('http://itunes.apple.com/cn/genre/ios-xiao-lu/id6007?mt=8')
+
+#    urlfilename='cn/'+'tu-shuappurl.txt'
+#    namefilename='cn/'+'tu-shuappname.txt'
+#    urlfile=open(urlfilename,'w')
+#    namefile=open(namefilename,'w')
+#    a=json.dumps(a)
+#    b=json.dumps(b)
+#    print >>urlfile,a
+#    print >>namefile,b
\ No newline at end of file
diff --git a/getAllUrl_91.py b/getAllUrl_91.py
new file mode 100644
index 0000000..4fcb73f
--- /dev/null
+++ b/getAllUrl_91.py
@@ -0,0 +1,30 @@
+#coding=utf8
+from pyquery import PyQuery as pq
+import urllib,urllib2,cookielib,json,os
+from Queue import Queue
+urlQueue=[]
+def getUrl(homeUrl):
+    '''homeUrl is a page-number pattern like http://app.91.com/Soft/iPhone/album/旅游/2690_%d_4 (%d is the page index)'''
+    global urlQueue
+    i=1
+    while True:
+        url=homeUrl % i
+        i+=1
+        d=pq(url)# pyquery fetches the URL itself
+        table=d('#AlbumList')
+        td=table('td')
+        if len(td)==0:# an empty album table means we ran past the last page
+            break
+        for j in range(len(td)):
+            onetd=td.eq(j)
+            aNode=onetd('a')
+            urlQueue.append(aNode.attr('href'))
+        print url
+    return urlQueue
+if __name__=='__main__':
+    getUrl('http://app.91.com/soft/iPhone/album/摄影/4886_%d_5')# also used for the 旅游 (travel) and 导航 (navigation) albums
+    folder='91app/摄影'
+    os.system('mkdir -p '+folder)
+    jsonstruct=json.dumps(urlQueue)
+    urlfile=open(folder+'/appurl.txt','w')
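+    # persist the collected detail-page URLs as JSON; getDownloadUrl_91.py
+    # reads this file back from the same folder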
+    print >>urlfile,jsonstruct
\ No newline at end of file
diff --git a/getDownloadUrl_91.py b/getDownloadUrl_91.py
new file mode 100644
index 0000000..588ec3a
--- /dev/null
+++ b/getDownloadUrl_91.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#coding=utf8
+#Collects apps by category; multi-threaded, with fuller error handling
+#(network connection failures)
+#E-mail:gauss.zh@gmail.com
+import urllib2,re,json
+import cookielib,urllib
+import getAllUrl_91
+from sgmllib import SGMLParser
+from Queue import Queue
+import threading
+from pyquery import PyQuery as pq
+import random,time
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+
+saveAppdetailInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveDownloadAppInfo='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveDownloadAppInfo&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+isExistTheId_91='http://192.168.1.104/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=isExistTheId_91&id_91=%s&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+urlQueue=Queue()
+logQueue=Queue()
+user_agents = [
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+    'Opera/9.25 (Windows NT 5.1; U; en)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
+    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
+    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
+]
+rootUrl='http://app.91.com'
+L='摄影'# category label: "photography"
+folder='91app/'+L
+def main(homeUrl):
+
+    urlfile=open(folder+'/appurl.txt','r')
+    appurlList=json.load(urlfile)
+    for one in appurlList:
+        urlQueue.put(one)
+    threadList=[]
+    threadNum=5
+    for i in range(threadNum):
+        threadList.append(threading.Thread(target=parserHtmlGetDownloadUrl,args=()))
+    for i in range(threadNum):
+        threadList[i].start()
+    for i in range(threadNum):
+        threadList[i].join()
+def parserHtmlGetDownloadUrl():
+
+    aflag=True
+    while not urlQueue.empty():
+        url=urlQueue.get()
+        agent = random.choice(user_agents)
+        download_url=id_91=url_91=url_apple=id_apple=version=name=content=''
+        try:
+            opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+            content = opener_proxy.open(url,timeout=5).read()
+        except Exception,e:
+            # one retry with a longer timeout; skip this URL if it fails again
+            try:
+                content = opener_proxy.open(url,timeout=10).read()
+            except Exception,e:
+                logQueue.put(url)
+                continue
+        if not content:
+            continue
+        d=pq(content)
+        try:
+            detail=d('.soft_detail_h3')
+            name=detail('h3').text()
+            version=re.findall('[\d\.]+',detail('span').text())[0]
+            download=d('div.soft_detail_btn')
+            link=download('a')
+            for j in range(len(link)):
+                onea=link.eq(j)
+                if onea.attr('title')==u'iTunes \u4e0b\u8f7d':#"iTunes download"
+                    url_apple=onea.attr('href')
+                    id_apple=re.findall('\d{5,}',url_apple)[0]
+                if onea.text()==u'\u4e0b\u8f7d\u5230\u7535\u8111':#"download to PC"
+                    download_url=rootUrl+onea.attr('href')
+                    id_91=re.findall('\d{5,10}',download_url)[0]
+            if not download_url:
+                # fall back to the download link embedded in the page scripts
+                script=d('script').text()
+                download_url=re.findall(r'http://app.91.com/soft/download/.+?\.[ipaz]{3}',script)[0]
+                id_91=re.findall('\d{5,10}',download_url)[0]
+            count=urllib2.urlopen(isExistTheId_91 % id_91).read()
+            if count == '0':
+                # the anchor markup in download_link was stripped from this
+                # patch in transit; the tag below is a reconstruction
+                returnid=urllib2.urlopen(saveDownloadAppInfo,urllib.urlencode({'url_apple':url_apple,'id_apple':id_apple,'id_91':id_91,\
+                    'url_91':url,'download_url':download_url,'version':version,'name':name,'category':L,\
+                    'download_link':"<a href='%s'>%s</a>" % (download_url,download_url)})).read()
+                print url,returnid
+            else:
+                print 'Already exists:',name,url_91
+        except Exception,e:
+            print str(e),url
+            print link.text()
+            logQueue.put(url)
+
+if __name__=='__main__':
+    main('34')
+    t=[]
+    while not logQueue.empty():
+        t.append(logQueue.get())
+    jsonstruct=json.dumps(t)
+    urlfile=open(folder+'/log.txt','w')
+    print >>urlfile,jsonstruct
\ No newline at end of file
diff --git a/html_parser.py b/html_parser.py
new file mode 100755
index 0000000..8af2250
--- /dev/null
+++ b/html_parser.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python
+#coding=utf8
+#Collects apps by category; multi-threaded, with fuller error handling
+#(network connection failures)
+#E-mail:gauss.zh@gmail.com
+import urllib2,re,json
+import cookielib,urllib
+import getAllUrl
+from sgmllib import SGMLParser
+from Queue import Queue
+import threading
+from pyquery import PyQuery as pq
+import random,time
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+
+MONTH={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
+
+saveAppdetailInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+user_agents = [
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+    'Opera/9.25 (Windows NT 5.1; U; en)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
+    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
+    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
+]
+
+urlpat=re.compile(r'\.*itunes.apple.com/us/app.*id.*\d')
+category=''
+folder=''
+subcategory=''
+queue=Queue()
+errorqueue=Queue()#Mozilla/5.0 (X11; Linux x86_64; rv:10.0.5) Gecko/20120606 Firefox/10.0.5
+headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'}
+class ListName(SGMLParser):
+    def __init__(self):
+        SGMLParser.__init__(self)
+        self.infoFlag = ""
+        self.appurl=[]
+        self.appname=[]
+        self.descriptionFlag=''
+        self.descriptionPFlag=''
+        self.description=[]
+        self.appnameFlag=''
+        self.info=[]
+        self.iphonescreenshots=[]
+        self.ipadscreenshots=[]
+        self.whatisnewFlag=''
+        self.whatisnew=[]
+        self.allstartFlag=''
+        self.allstart=[]
+        self.customerstartFlag=''
+        self.customerstart=[]
+        self.customerFlag=''
+        self.customer=[]
+        self.iphonescreenshotsFlag=''
+        self.ipadscreenshotsFlag=''
+        self.nameFlag=''
+        self.name=''
+        self.icon=''
+    def handle_data(self, text):
+        if self.infoFlag == 1:
+            self.info.append(text)
+        if self.appnameFlag==1:
+            self.appname.append(text)
+            self.appnameFlag=''
+        if self.descriptionFlag==1 and self.descriptionPFlag==1:
+            self.description.append(text)
+        if self.customerFlag==1:
+            self.customer.append(text)
+        if self.whatisnewFlag==1:
+            self.whatisnew.append(text)
+        if self.nameFlag==1:
+            self.name+=text
+    def start_p(self,attrs):
+        if self.descriptionFlag==1:
+            self.descriptionPFlag=1
+    def end_p(self):
+        self.descriptionFlag=''
+        self.descriptionPFlag=''
+    def start_img(self,attrs):
+        for n,k in attrs:
+            if (k=='landscape' or k=='portrait'):
+                if self.ipadscreenshotsFlag==1:
+                    self.ipadscreenshots.append(attrs[2][1])
+                if self.iphonescreenshotsFlag==1:
+                    self.iphonescreenshots.append(attrs[2][1])
+            if n=='height' and int(k)>=150:
+                self.icon=attrs[4][1]
+    def start_a(self,attrs):
+        for n,k in attrs:
+            if n=='href':
+                if re.findall(r'\.*itunes.apple.com/us/app.*id.*\d',k):
+                    self.appurl.append(k)
+                    self.appnameFlag=1
+    def start_div(self,attrs):
+        for n,k in attrs:
+            if n=='metrics-loc' and k=='Titledbox_Description':#http://itunes.apple.com/us/genre/ios-games/id6014?mt=8
+                self.descriptionFlag=1#http://itunes.apple.com/us/app/angry-birds/id343200656?mt=8
+            if n=='id' and k=='left-stack':
+                self.infoFlag=1
+                self.customerFlag=''
+            if n=='metrics-loc':
+                if re.findall(r"What's New",k):
+                    self.whatisnewFlag=1
+            if n=='class' and k=='rating' and self.infoFlag==1:
+                self.allstart.append(attrs[3][1])
+                self.customerstartFlag=1
+                self.allstartFlag=1
+            if n=='class' and k=='customer-reviews':
+                self.customerFlag=1
+            if n=='class' and k=='rating' and self.customerFlag==1:
+                self.customerstart.append(attrs[3][1])
+            if n=='class':
+                if re.findall(r'iphone-screen-shots',k):
+                    self.iphonescreenshotsFlag=1
+                    self.ipadscreenshotsFlag=''
+            if n=='class':
+                if re.findall(r'ipad-screen-shots',k):
+                    self.ipadscreenshotsFlag=1
+                    self.iphonescreenshotsFlag=''
+    def start_h1(self,attrs):
+        self.nameFlag=1
+    def end_h1(self):
+        self.nameFlag=''
+    def end_div(self):
+        self.whatisnewFlag=''
+def simplify(lt):
+    """Drop junk entries such as '\n' and ' ' from a list."""
+    ret=[]
+    for one in lt:
+        temp=one.strip()
+        if temp:
+            ret.append(temp)
+    return ret
+
+def main(rootUrl):
+    global folder,subcategory
+    try:
+        homeurl=rootUrl
+        (a,b)=getAllUrl.main(homeurl)#ios-abc
+        t=re.findall(r'ios-.*/',homeurl)
+
+        folder=t[0][4:-1]
+#        subcategory=t[0][4:-1]
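+        # the output folder is the genre slug minus its 'ios-' prefix,
+        # e.g. 'ios-music/' yields 'music'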
+        urlfilename=folder+'appurl.txt'
+        namefilename=folder+'appname.txt'
+        urlfile=open(urlfilename,'w')
+        namefile=open(namefilename,'w')
+        a=json.dumps(a)
+        b=json.dumps(b)
+        print >>urlfile,a
+        print >>namefile,b
+        urlfile.close()
+        namefile.close()
+
+#        global queue
+#        logfile=open('F_starerrornameandurl.txt','r')
+#        log=logfile.readlines()
+#        logfile.close()
+#        for one in log:
+#            url=re.findall(r'http.*mt=8',one)[0]
+#            n='null'
+#            queue.put((n,url))
+
+        appnamefile=open(namefilename,'r')
+        appname=json.load(appnamefile)
+        appurlfile=open(urlfilename,'r')
+        appurl=json.load(appurlfile)
+        appnamefile.close()
+        appurlfile.close()
+        for i in range(len(appname)):
+            n=appname[i]
+            u=appurl[i]
+            queue.put((n,u))# (name, url) tuple
+        templatefile=open('template.xml','r')
+        template=''.join(templatefile.readlines())
+        templatefile.close()
+    except Exception,e:
+        print str(e)
+    threadlist=[]
+#    while True:
+#        (name,url)=queue.get()
+#        tmpid=re.findall('/id\d*',url)[0][3:]
+#        if tmpid=='476005657':
+#            break
+    for i in range(10):
+        add=folder+'/thread%d__' % i
+        threadlist.append(threading.Thread(target=threaddownload,args=(add,template,)))
+    for i in range(len(threadlist)):
+        threadlist[i].start()
+    for i in range(len(threadlist)):
+        threadlist[i].join()
+#    threaddownload('musicData/123/logF_starthreadfive',template,)
+
+def threaddownload(filename,template):
+    global queue
+    global errorqueue
+    timeoutcounts=0
+    outputFile=open(filename+'.xml','w')
+    # NOTE: the XML open/close tags written to the output files were stripped
+    # from this patch; '<apps>'/'</apps>' below are reconstructed placeholders
+    print >>outputFile,'<apps>\n'
+    i=-1
+    counts=0
+    while True:
+        i+=1
+        if queue.empty():
+            break
+        try:
+            (name,url)=queue.get()
+            while True:
+                try:
+#                    req = urllib2.Request(url = "http://itunes.apple.com/us/a5pp/numbers/id361304891?mt=8",headers = headers)
+                    agent = random.choice(user_agents)
+                    opener_normal.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_normal.open(url,timeout=5).read()
+                    #returnfile=urllib2.urlopen(url=url,timeout=5)
+#                    content=returnfile.read()
+#                    returnfile.close()
+                    timeoutcounts=0
+                    break
+                except Exception,e:
+                    if timeoutcounts<5:
+                        timeoutcounts+=1
+                        time.sleep(timeoutcounts)
+                    else:
+                        print name,url
+                        print str(e)
+                        errorqueue.put((name,url))
+                        break
+            if timeoutcounts>=5:
+                try:
+                    agent = random.choice(user_agents)
+                    opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_proxy.open(url,timeout=5).read()
+                    print 'proxy'
+                except Exception,e:
+                    timeoutcounts=0
+                    continue
+            listname=ListName()
+            listname.feed(content)
+            d=pq(content)
+#            name=re.findall(r'app/.*/id',appurl[i])[0]
+#            name=name[4:-3]
+            name=listname.name
+            info={}
+            j=0
+            tmpinfo=simplify(listname.info[0:50])
+
+            info['price']='Free'
+            for one in tmpinfo:
+                if re.findall(r'\$\d*',one):
+                    info['price']=one
+                if one in ['Category:','Released','Updated:','Version:','Size:','Languages:','Language:',\
+                    'Seller:','Requirements:','Released:','All Versions:','Current Version:']:
+                    info[one]=tmpinfo[j+1]
+                if one=='Seller:':
+                    info['copyright:']=tmpinfo[j+2]
+                    info['app_rating:']=tmpinfo[j+3]
+                    info['reasons:']=tmpinfo[j+4]
+                j+=1
+#            if locals().has_key('category'):
+#                if category!=info['Category:']:
+#                    continue
+#            else:
+            category=info['Category:']
+            icon=listname.icon
+            id=str(re.findall(r'[\d]{5,}',url)[0])
+            customer=simplify(listname.customer)
+            price=info['price']
+            if info.has_key('Released:'):
+                Updated=info['Released:']
+            elif info.has_key('Updated:'):
+                Updated=info['Updated:']
+            version=info['Version:']
+            size=info['Size:']
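+            # label spelling varies per page ('Language:' vs 'Languages:'),
+            # so both keys are probed below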
+            languages=''
+            if info.has_key('Languages:'):
+                languages=info['Languages:']
+            elif info.has_key('Language:'):
+                languages=info['Language:']
+
+            seller=info['Seller:']
+            copyright=info['copyright:']
+            app_rating=info['app_rating:']
+            reason=info['reasons:']
+            requirements=info['Requirements:']
+            stars1=rating_counts1=''
+            tmpi=0
+            if info.has_key('Current Version:'):
+                t=listname.allstart[tmpi].split(',')
+                tmpi+=1
+                stars1=t[0]
+                rating_counts1=info['Current Version:']
+            stars2=rating_counts2=''
+            if info.has_key('All Versions:'):
+                t=listname.allstart[tmpi].split(',')
+                stars2=t[0]
+                rating_counts2=info['All Versions:']
+            description=''.join(listname.description)
+
+            t=simplify(listname.whatisnew[3:])
+            whats_new=''.join(t)
+            iphonescreenshot=''
+            screenshotDictList=[]
+            for one in listname.iphonescreenshots:
+                iphonescreenshot+='<screenshot>%s</screenshot>\n' % one#xml fragment; the tag name is reconstructed, the original was stripped from the patch
+                # save in mysql through ThinkSNS
+                #########################
+                screenshotDictList.append({'screenshots':one,'aid':id})
+
+            ipadscreenshot=''
+            for one in listname.ipadscreenshots:
+                ipadscreenshot+='<screenshot>%s</screenshot>\n' % one#xml fragment; tag name reconstructed
+                # save in mysql through ThinkSNS
+                #############################
+                screenshotDictList.append({'screenshots':one,'aid':id})
+            usernameindex=[]
+            for k in range(len(customer)):
+                if re.findall(r'by\n',customer[k]):
+                    usernameindex.append(k)
+            review=[]
+            reviewDictList=[]
+            for k in range(len(usernameindex)):
+                if k!=len(usernameindex)-1:
+                    t=usernameindex[k+1]-1
+                else:
+                    t=usernameindex[k]+2
+                # review XML template: only the CDATA wrapper survived in the
+                # patch; the surrounding tags/placeholders are reconstructed
+                s='<review>\n<user name="%s"/>\n<rating value="%s"/>\n<title value="%s"/>\n<content><![CDATA[%s]]></content>\n</review>\n' % \
+                    (re.findall(r'.*$',customer[usernameindex[k]])[0].strip(),listname.customerstart[k],customer[usernameindex[k]-1],''.join(customer[usernameindex[k]+1:t]))
+                #save review in mysql through ThinkSNS
+                ##########################
+                reviewDictList.append({'aid':id,'uname':re.findall(r'.*$',customer[usernameindex[k]])[0].strip(),'rating':\
+                    listname.customerstart[k],'title':customer[usernameindex[k]-1],'content':''.join(customer[usernameindex[k]+1:t])})
+
+                review.append(s)
+
+            review=''.join(review)#s = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', str)
+            output=template % (id,icon,url,name, price,category,subcategory,Updated,version,size,languages,seller,copyright,app_rating,reason,requirements,\
+                stars1,rating_counts1,stars2,rating_counts2,description,whats_new,iphonescreenshot,ipadscreenshot,review)
+            #save appdetail in mysql through ThinkSNS
+            pydes=d('.product-review').eq(0)
+            pydes=pydes.html()#Jan 04, 2011
+            Updated=Updated.strip()
+            updateTime=Updated[-4:]+'-'+str(MONTH[Updated[:3]])+'-'+Updated[4:6]
+            appdetailData=urllib.urlencode({'aid':id,'appurl':url,'name':name,'icon_url':icon,'price':price,'category':category,\
+                'subcategory':subcategory,'updated':updateTime,'version':version,'size':size,'languages':\
+                languages,'seller':seller,'copyright':copyright,'des':app_rating,'reason':reason,'requirement':requirements,\
+                'cstars':stars1,'crating_count':rating_counts1,'stars':stars2,'rating_count':rating_counts2,\
+                'description':pydes,'whatsnew':whats_new})
+##            APPDETAIL
+            aid=urllib2.urlopen(saveAppdetailInMysqlByUrl,appdetailData).read()
+            for one in reviewDictList:
+                one['aid']=aid
+                #review
+                urllib2.urlopen(saveReviewsInMysqlByUrl,urllib.urlencode(one))
+
+            for one in screenshotDictList:
+                one['aid']=aid
+                urllib2.urlopen(saveScreenshortsInMysqlByUrl,urllib.urlencode(one))
+
+            print '%s %dth over!' % (threading.currentThread().getName(),i)
+            output = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', output)
+            print >>outputFile,output
+            counts+=1
+            if counts==10:
+                counts=0
+                print >>outputFile,'</apps>'
+                outputFile.close()
+                tempfilename='%s%d.xml' % (filename,i)
+                outputFile=open(tempfilename,'w')
+                print >>outputFile,'<apps>\n'
+        except Exception,e:
+            print str(e)
+            print url
+            print info
+            errorqueue.put((name,url))
+            print >>outputFile,'</apps>'
+            outputFile.close()
+            tempfilename='%s%d.xml' % (filename,i)
+            outputFile=open(tempfilename,'w')
+            print >>outputFile,'<apps>\n'
+    if not counts==0:
+        print >>outputFile,'</apps>'
+    outputFile.close()
+#    outputFile.close()
+if __name__=='__main__':
+    s=['http://itunes.apple.com/us/genre/ios-music/id6011?mt=8']
+    for one in s:
+        main(one)
+    errorfile=open('log%s.txt' % subcategory,'w')
+    tempqueue=[]
+    while not errorqueue.empty():
+        tempqueue.append(errorqueue.get())
+    t=json.dumps(tempqueue)
+    print >>errorfile,t
+    print '***********ok**********'
diff --git a/improvingWebCrawlingByQuery.py b/improvingWebCrawlingByQuery.py
new file mode 100644
index 0000000..790216c
--- /dev/null
+++ b/improvingWebCrawlingByQuery.py
@@ -0,0 +1,230 @@
+#!/usr/bin/python27
+#coding=UTF-8
+#author:gausszh
+#e-mail:gausszh@gmail.com
+
+import urllib2,re,json,os
+import cookielib,urllib
+import getAllUrl
+from Queue import Queue
+import threading
+from pyquery import PyQuery as pq
+import random,time
+
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+
+saveAppdetailInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+user_agents = [
+    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+    'Opera/9.25 (Windows NT 5.1; U; en)',
+    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
+    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
+    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
+]
+queue=Queue()
+errorqueue=Queue()
+def getC(node):
+    # the value follows the label's colon; the cn store uses a full-width
+    # colon (U+FF1A), so both separators are tried
+    try:
+        return node.text().split(':')[1].strip()
+    except Exception,e:
+        return node.text().split(u'\uff1a')[1].strip()
+def main(rootUrl):
+    global folder,subcategory
+    try:
+        homeurl=rootUrl
+#        (a,b)=getAllUrl.main(homeurl)#ios-abc
+        t=re.findall(r'ios-.*/',homeurl)
+
+        folder='cn/'+t[0][4:-1]
+        os.system('mkdir -p '+folder)
+#        subcategory=t[0][4:-1]
+        urlfilename=folder+'appurl.txt'
+        namefilename=folder+'appname.txt'
+#        urlfile=open(urlfilename,'w')
+#        namefile=open(namefilename,'w')
+#        a=json.dumps(a)
+#        b=json.dumps(b)
+#        print >>urlfile,a
+#        print >>namefile,b
+#        urlfile.close()
+#        namefile.close()
+
+        appnamefile=open(namefilename,'r')
+        appname=json.load(appnamefile)
+        appurlfile=open(urlfilename,'r')
+        appurl=json.load(appurlfile)
+        appnamefile.close()
+        appurlfile.close()
+        for i in range(len(appname)):
+            n=appname[i]
+            u=appurl[i]
+            queue.put((n,u))# (name, url) tuple
+        templatefile=open('template.xml','r')
+        template=''.join(templatefile.readlines())
+        templatefile.close()
+    except Exception,e:
+        print str(e)
+    threadlist=[]
+#    while True:
+#        (name,url)=queue.get()
+#        tmpid=re.findall('/id\d*',url)[0][3:]
+#        if tmpid=='476005657':
+#            break
+    for i in range(9):
+        add=folder+'/thread%d__' % i
+        threadlist.append(threading.Thread(target=threaddownload,args=(add,template,)))
+    for i in range(len(threadlist)):
+        threadlist[i].start()
+    for i in range(len(threadlist)):
+        threadlist[i].join()
+#    threaddownload('musicData/123/logF_starthreadfive',template,)
+
+def threaddownload(filename,template):
+    global queue
+    global errorqueue
+    timeoutcounts=0
+    outputFile=open(filename+'.xml','w')
+    print >>outputFile,'<apps>\n'# '<apps>'/'</apps>' reconstructed; the original tags were stripped from the patch
+    cases=-1
+    counts=0
+    while True:
+        cases+=1
+        if queue.empty():
+            break
+        try:
+            (name,url)=queue.get()
+            while True:
+                try:
+#                    req = urllib2.Request(url = "http://itunes.apple.com/us/a5pp/numbers/id361304891?mt=8",headers = headers)
+                    agent = random.choice(user_agents)
+                    opener_normal.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_normal.open(url,timeout=5).read()
+                    timeoutcounts=0
+                    break
+                except Exception,e:
+                    if timeoutcounts<5:
+                        timeoutcounts+=1
+                        time.sleep(timeoutcounts)
+                    else:
+                        break
+            if timeoutcounts>=5:
+                try:
+                    agent = random.choice(user_agents)
+                    opener_proxy.addheaders = [("User-agent",agent),("Accept","*/*"),('Referer','http://www.google.com.hk')]
+                    content = opener_proxy.open(url,timeout=5).read()
+                    print 'proxy'
+                    timeoutcounts=0
+                except Exception,e:
+                    print name,url
+                    print str(e)
+                    errorqueue.put((name,url))
+                    timeoutcounts=0
+                    continue
+            d=pq(content)
+            name=d('h1').text()
+            id=re.findall('\d{5,}',url)[0]
+            ls=d('#left-stack')
+            lschildren=ls.children()
+            detail=lschildren.eq(0)
+            price=detail('.price').text()
+            icon=detail('img').attr('src')
+            detail_ul=detail('ul')
+            categoryN=detail_ul('.genre')
+            dateN=categoryN.next()
+            versionN=dateN.next()
+            sizeN=versionN.next()
+            languagesN=sizeN.next()
+            sellerN=languagesN.next()
+            copyrightN=sellerN.next()
+            ratingN=detail_ul.next()
+            requirementsN=ratingN.next()
+            stars1=rating_counts1=stars2=rating_counts2=''
+            list_customer_ratingN=detail.next()
+            customer_ratingN=list_customer_ratingN('.rating')
+            if len(customer_ratingN)==2:
+                aria_label=customer_ratingN.eq(0).attr('aria-label')
+                aria_label=re.findall('[\d\.]+',aria_label)
+                stars1=aria_label[0]
+                rating_counts1=aria_label[1]
+                aria_label=customer_ratingN.eq(1).attr('aria-label')
+                aria_label=re.findall('[\d\.]+',aria_label)
+                stars2=aria_label[0]
+                rating_counts2=aria_label[1]
+            elif len(customer_ratingN)==1:
+                pN=customer_ratingN.eq(0).prev()# previous sibling node
+                aria_label=customer_ratingN.eq(0).attr('aria-label')
+                aria_label=re.findall('[\d\.]+',aria_label)
+                if pN.text().find(u'\u5f53')!=-1:# '当' from '当前版本' (Current Version); find() returns -1 when absent, which is truthy
+                    stars1=aria_label[0]
+                    rating_counts1=aria_label[1]
+                else:
+                    stars2=aria_label[0]
+                    rating_counts2=aria_label[1]
+            description=d('.product-review').eq(0)
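+            # the first .product-review block is the app description; the
+            # second one, taken below, is the "What's New" section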
+            descriptionData=description.html()
+            descriptionXml=description.text()
+            whats_new=d('.product-review').eq(1)
+            whats_newData=whats_new.html()
+            whats_newXml=whats_new.text()
+            date='-'.join(re.findall('\d+',getC(dateN)))
+            output=template % (id,url,name,getC(categoryN),'',descriptionXml,whats_newXml)
+            appdetailData=urllib.urlencode({'aid':id,'appurl':url,'name':name,'icon_url':icon,'price':price,'category':getC(categoryN),\
+                'subcategory':'','updated':date,'version':getC(versionN),'size':getC(sizeN),'languages':\
+                getC(languagesN),'seller':getC(sellerN),'copyright':copyrightN.text(),'des':ratingN('a').text(),\
+                'reason':ratingN('ul').text(),'requirement':getC(requirementsN),'cstars':stars1,'crating_count':rating_counts1,\
+                'stars':stars2,'rating_count':rating_counts2,'description':descriptionData,'whatsnew':whats_newData})
+            aid=urllib2.urlopen(saveAppdetailInMysqlByUrl,appdetailData).read()
+            reviewN=d('.customer-review')
+            for i in range(len(reviewN)):
+                oneN=reviewN.eq(i)
+                s=oneN('.rating').attr('aria-label')
+                c_rating=re.findall('[\d\.]+',s)[0]
+                data={'aid':aid,'uname':getC(oneN('.user-info')),'rating':c_rating,'title':oneN('.customerReviewTitle').text(),'content':oneN('.content').text()}
+                urllib2.urlopen(saveReviewsInMysqlByUrl,urllib.urlencode(data))
+            screenshotsN=d('.swoosh.lockup-container.application.large.screenshots')('img')
+            for i in range(len(screenshotsN)):
+                urllib2.urlopen(saveScreenshortsInMysqlByUrl,urllib.urlencode({'screenshots':screenshotsN.eq(i).attr('src'),'aid':aid}))
+            print '%s %dth over!' % (threading.currentThread().getName(),cases)
+            output = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', output)
+            print >>outputFile,output
+            counts+=1
+            if counts==1000:
+                counts=0
+                print >>outputFile,'</apps>'
+                outputFile.close()
+                tempfilename='%s%d.xml' % (filename,cases)# use the page counter, not the leftover inner-loop index i
+                outputFile=open(tempfilename,'w')
+                print >>outputFile,'<apps>\n'
+        except Exception,e:
+            print str(e)
+            print url
+            errorqueue.put((name,url))
+            print >>outputFile,'</apps>'
+            outputFile.close()
+            tempfilename='%s%d.xml' % (filename,cases)
+            outputFile=open(tempfilename,'w')
+            print >>outputFile,'<apps>\n'
+    if not counts==0:
+        print >>outputFile,'</apps>'
+    outputFile.close()
+#    outputFile.close()
+if __name__=='__main__':
+    s=['http://itunes.apple.com/cn/genre/ios-tu-shu/id6018?mt=8']
+    for one in s:
+        main(one)
+    errorfile=open('%slog.txt' % folder,'w')
+    tempqueue=[]
+    while not errorqueue.empty():
+        tempqueue.append(errorqueue.get())
+    t=json.dumps(tempqueue)
+    print >>errorfile,t
+    print '***********ok**********'
\ No newline at end of file
diff --git a/shiwan_category.py b/shiwan_category.py
new file mode 100644
index 0000000..f0d00c5
--- /dev/null
+++ b/shiwan_category.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+#coding=utf8
+#author:gausszh
+#E-mail:gauss.zh@gmail.com
+#2012-09-25
+from pyquery import PyQuery as pq
+import re,urllib2,cookielib
+
+proxy_support = urllib2.ProxyHandler({'http':'http://127.0.0.1:8087'})
+cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
+opener_proxy = urllib2.build_opener(proxy_support,cookie_support, urllib2.HTTPHandler)
+opener_normal= urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
+MAXINF=999999
+reg=re.compile('id\d{7,}')
+
+def download_one_subcategory(beginUrl):
+    """beginUrl like "http://apple.shiwan.com/list/cat-371" """
+
+    beginUrl+='/pf-0/price-0/age-0/ft-0/ps-0/order-0/p-%d'
+    for i in range(1,MAXINF):
+        theUrl=beginUrl % i
+        try:
+            content=opener_normal.open(theUrl,timeout=5).read()
+        except Exception,e:
+            print str(e),theUrl
+            continue# without this, a failed fetch would re-parse stale (or undefined) content
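+        # stop condition: a page past the last one still renders, but its
+        # first album link carries no long numeric id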
+        d=pq(content)
+        dir_ido=d('div.dir_ido')
+        if not re.findall('\d{3,}',dir_ido.eq(0)('a').attr('href')):
+            break
+        for alink in range(len(dir_ido)):
+            div=dir_ido.eq(alink)
+            s=div.html()
+            regid=reg.findall(s)
+            if regid:
+                id=regid[0][2:]# drop the 'id' prefix, keep the numeric part
+                print id
+            else:
+                print s
+if __name__=='__main__':
+    download_one_subcategory('http://apple.shiwan.com/list/cat-371')
+
\ No newline at end of file
diff --git a/usingRabbitmqAndPyquery.py b/usingRabbitmqAndPyquery.py
new file mode 100644
index 0000000..de256b0
--- /dev/null
+++ b/usingRabbitmqAndPyquery.py
@@ -0,0 +1,119 @@
+#coding=utf8
+
+import pika,urllib2,urllib,cookielib,random,time
+import re
+from pyquery import PyQuery as pq
+from Queue import Queue
+saveAppdetailInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveAppdetailInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveReviewsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveReviewsInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+saveScreenshortsInMysqlByUrl='http://192.168.1.125/ThinkSNS_2_5_forlove/index.php?app=api&mod=apps&act=saveScreenshortInMysql&oauth_token=cba6e111235ed535957be29d6436087d&oauth_token_secret=7cec9ee898ca22743eb2e1b32203304e'
+
+htmlStringQ=Queue()
+c=pika.ConnectionParameters(host='192.168.1.102')
+conn=pika.BlockingConnection(c)
+channel=conn.channel()
+channel.queue_declare(queue='appurl')
+channel.queue_declare(queue='webstring')
+COUNTS=1
+XML_COUNTS=1
+templatefile=open('template.xml','r')
+template=''.join(templatefile.readlines())
+outputFile=open('samsung.xml','w')
+print >>outputFile,'<apps>\n'# '<apps>'/'</apps>' reconstructed; the original tags were stripped from the patch
+
+def getC(node):
+    # the value follows the label's colon; the cn store uses a full-width
+    # colon (U+FF1A), so both separators are tried
+    try:
+        return node.text().split(':')[1].strip()
+    except Exception,e:
+        return node.text().split(u'\uff1a')[1].strip()
+def convertHtmlToXmlAndDatabase(htmlString):
+    global XML_COUNTS,COUNTS,outputFile# outputFile is rebound below, so it must be declared global
+    d=pq(htmlString)
+    name=d('h1').text()
+    link=d('link')
+    url=link.attr('href')
+    print url
+    id=re.findall('\d{5,}',url)[0]
+    ls=d('#left-stack')
+    lschildren=ls.children()
+    detail=lschildren.eq(0)
+    price=detail('.price').text()
+    icon=detail('img').attr('src')
+    detail_ul=detail('ul')
+    categoryN=detail_ul('.genre')
+    dateN=categoryN.next()
+    versionN=dateN.next()
+    sizeN=versionN.next()
+    languagesN=sizeN.next()
+    sellerN=languagesN.next()
+    copyrightN=sellerN.next()
+    ratingN=detail_ul.next()
+    requirementsN=ratingN.next()
+    stars1=rating_counts1=stars2=rating_counts2=''
+    list_customer_ratingN=detail.next()
+    customer_ratingN=list_customer_ratingN('.rating')
+    if len(customer_ratingN)==2:
+        aria_label=customer_ratingN.eq(0).attr('aria-label')
+        aria_label=re.findall('[\d\.]+',aria_label)
+        stars1=aria_label[0]
+        rating_counts1=aria_label[1]
+        aria_label=customer_ratingN.eq(1).attr('aria-label')
+        aria_label=re.findall('[\d\.]+',aria_label)
+        stars2=aria_label[0]
+        rating_counts2=aria_label[1]
+    elif len(customer_ratingN)==1:
+        pN=customer_ratingN.eq(0).prev()# previous sibling node
+        aria_label=customer_ratingN.eq(0).attr('aria-label')
+        aria_label=re.findall('[\d\.]+',aria_label)
+        if pN.text().find(u'\u5f53')!=-1:# '当' from '当前版本' (Current Version); find() returns -1 when absent, which is truthy
+            stars1=aria_label[0]
+            rating_counts1=aria_label[1]
+        else:
+            stars2=aria_label[0]
+            rating_counts2=aria_label[1]
+    description=d('.product-review').eq(0)
+    descriptionData=description.html()
+    descriptionXml=description.text()
+    whats_new=d('.product-review').eq(1)
+    whats_newData=whats_new.html()
+    whats_newXml=whats_new.text()
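+    # getC(dateN) returns the localized release date; joining its digit runs
+    # normalizes e.g. 2012年10月30日 to 2012-10-30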
+    date='-'.join(re.findall('\d+',getC(dateN)))
+    output=template % (id,url,name,getC(categoryN),'',descriptionXml,whats_newXml)
+    output = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', output)
+    if XML_COUNTS==1000:
+        # rotate the output file every 1000 records
+        XML_COUNTS=0
+        print >>outputFile,'</apps>'
+        outputFile.close()
+        tempfilename='samsung%d.xml' % (COUNTS)
+        outputFile=open(tempfilename,'w')
+        print >>outputFile,'<apps>\n'
+    # the write and counter increment below are missing from the original
+    # file, which would leave the XML output empty and the rotation dead;
+    # restored to mirror the sibling scripts
+    print >>outputFile,output
+    XML_COUNTS+=1
+    appdetailData=urllib.urlencode({'aid':id,'appurl':url,'name':name,'icon_url':icon,'price':price,'category':getC(categoryN),\
+        'subcategory':'','updated':date,'version':getC(versionN),'size':getC(sizeN),'languages':\
+        getC(languagesN),'seller':getC(sellerN),'copyright':copyrightN.text(),'des':ratingN('a').text(),\
+        'reason':ratingN('ul').text(),'requirement':getC(requirementsN),'cstars':stars1,'crating_count':rating_counts1,\
+        'stars':stars2,'rating_count':rating_counts2,'description':descriptionData,'whatsnew':whats_newData})
+    aid=urllib2.urlopen(saveAppdetailInMysqlByUrl,appdetailData).read()
+    reviewN=d('.customer-review')
+    for i in range(len(reviewN)):
+        oneN=reviewN.eq(i)
+        s=oneN('.rating').attr('aria-label')
+        c_rating=re.findall('[\d\.]+',s)[0]
+        data={'aid':aid,'uname':getC(oneN('.user-info')),'rating':c_rating,'title':oneN('.customerReviewTitle').text(),'content':oneN('.content').text()}
+        urllib2.urlopen(saveReviewsInMysqlByUrl,urllib.urlencode(data))
+    screenshotsN=d('.swoosh.lockup-container.application.large.screenshots')('img')
+    for i in range(len(screenshotsN)):
+        urllib2.urlopen(saveScreenshortsInMysqlByUrl,urllib.urlencode({'screenshots':screenshotsN.eq(i).attr('src'),'aid':aid}))
+    print '%dth over!' % (COUNTS)
+    COUNTS+=1
+def callback(cn,method,pro,body):
+    htmlString=str(body)
+    htmlStringQ.put(htmlString)# buffered here but never consumed in this script
+    try:
+        convertHtmlToXmlAndDatabase(htmlString)
+    except Exception,e:
+        print str(e)
+    cn.basic_ack(delivery_tag=method.delivery_tag)# ack even on failure so a bad page is not redelivered forever
+channel.basic_qos(prefetch_count=1)
+channel.basic_consume(callback,queue='webstring')
+print 'start...'
+channel.start_consuming()
\ No newline at end of file