Skip to content
This repository has been archived by the owner on Dec 17, 2018. It is now read-only.

Commit

Permalink
开放了两个检查
Browse files Browse the repository at this point in the history
  • Loading branch information
will4906 committed May 13, 2017
1 parent ce927d0 commit 72b2a5a
Show file tree
Hide file tree
Showing 23 changed files with 48 additions and 9,199 deletions.
20 changes: 17 additions & 3 deletions PatentCrawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
class PatentcrawlerPipeline(object):

LINE_INDEX = 1

def process_item(self, item, spider):
    """Export the item to Excel when it passes both name checks.

    The item is printed and written exactly once, and only when both
    checkForInventor and checkForProposer accept it. The item itself is
    always returned so that later pipeline stages still receive it.

    :param item: the scraped patent item
    :param spider: the spider that produced the item (not used here)
    :return: the same item that was passed in
    """
    # Both checks must pass; a single guarded write avoids exporting the
    # same row twice or exporting it when only one check succeeded.
    if self.checkForInventor(item) and self.checkForProposer(item):
        print(item.items())
        self.writeToExcel(item)
    return item

def writeWithNotNone(self, sh, i, strData):
Expand Down Expand Up @@ -47,9 +49,21 @@ def writeToExcel(self, item):
print("写excel报错")

def checkForInventor(self, item):
    """Tell whether the item's inventor list contains the target inventor.

    When BaseConfig.CHECK_INVENTOR is False the check is disabled and every
    item is accepted. Otherwise the item's 'inventorName' value is split on
    ';' and each entry, stripped of surrounding whitespace, is compared with
    the item's 'targetInventor' value.

    :param item: mapping-like item providing 'inventorName' and
        'targetInventor' through ``get``
    :return: True if the check is disabled or a name matches, else False
    """
    if BaseConfig.CHECK_INVENTOR is False:
        return True
    inventorNames = item.get('inventorName')
    # A missing field cannot match anything; previously this raised
    # AttributeError on None.split.
    if inventorNames is None:
        return False
    targetInventor = item.get('targetInventor')
    return any(targetInventor == name.strip() for name in inventorNames.split(";"))

def checkForProposer(self, item):
    """Tell whether the item's proposer list contains the target proposer.

    When BaseConfig.CHECK_PROPOSER is False the check is disabled and every
    item is accepted. Otherwise the item's 'proposerName' value is split on
    ';' and each entry, stripped of surrounding whitespace, is compared with
    the item's 'targetProposer' value.

    :param item: mapping-like item providing 'proposerName' and
        'targetProposer' through ``get``
    :return: True if the check is disabled or a name matches, else False
    """
    if BaseConfig.CHECK_PROPOSER is False:
        return True
    proposerNames = item.get('proposerName')
    # A missing field cannot match anything; previously this raised
    # AttributeError on None.split.
    if proposerNames is None:
        return False
    targetProposer = item.get('targetProposer')
    return any(targetProposer == name.strip() for name in proposerNames.split(";"))
2 changes: 1 addition & 1 deletion PatentCrawler/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
Expand Down
7 changes: 4 additions & 3 deletions PatentCrawler/spiders/Patent.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def parsePatentList(self, response):
soup = BeautifulSoup(response.body_as_unicode(), "lxml")
type = response.meta['inventionType']
pageTop = soup.find(attrs={"class": "page_top"})
patentSum = int(pageTop.get_text(strip=True)[8:-3])
strSum = pageTop.get_text(strip=True)
patentSum = int(strSum[strSum[2:].find("页") + 3:strSum.find("条")])
if (patentSum % int(BaseConfig.CRAWLER_SPEED)) == 0:
scale = 0
else:
Expand All @@ -108,7 +109,8 @@ def parsePatentList(self, response):
lawStateBn = footer.find(attrs={"role": "lawState"})
yield self.requestLawState(lawStateBn, pi)
except Exception as e:
print("此人在此类型没有专利")
print("申请人-" + response.meta.get('proposer') + "发明人-" + response.meta.get('inventor') + "在" + QueryInfo.inventionTypeToString(type) + "没有专利")
print(e)

# 解析翻页后的专利数据
def parseNextPatentList(self, response):
Expand Down Expand Up @@ -165,7 +167,6 @@ def requestLawState(self, lawStateBn, pi):
meta=pi
)

# "VDB:((PD>='" + startDate + "' AND PAVIEW='" + proposer + "' AND INVIEW='" + inventor + "' AND DOC_TYPE='" + type + "' AND (CC='HK' OR CC='MO' OR CC='TW' OR CC='CN')))"
# 生成接下来专利信息的请求
def requestNextPage(self, searchExp, index, nSum, startDate, proposer, inventor, type):
formData = {
Expand Down
7 changes: 6 additions & 1 deletion config/BaseConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,9 @@ class BaseConfig:
# 爬取专利速度,每个请求80个,范围是(0,89]
CRAWLER_SPEED = "80"
FILE_NAME = "output\专利.xls"
LOG_FILE_NAME = "log\PatentCrawler{0}.log".format(TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
LOG_FILE_NAME = "log\PatentCrawler{0}.log".format(TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))

# 发明人名称检查
CHECK_INVENTOR = False
# 申请人名称检查
CHECK_PROPOSER = True
14 changes: 8 additions & 6 deletions config/QueryInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
class QueryInfo:
def __init__(self):
self.__inventionTypeList = ["I", "U", "D"]
self.__inventorList = ["陈思平", "陈昕", "汪天富", "谭力海", "彭珏", "但果", "叶继伦", "覃正笛",
"张旭", "张会生", "钱建庭", "丁惠君", "刁现芬", "沈圆圆", "周永进", "孔湉湉",
"陆敏华", "张新宇", "孙怡雯", "李乔亮", "齐素文", "徐海华", "倪东", "刘维湘",
"李抱朴", "黄炳升", "徐敏", "雷柏英", "胡亚欣", "何前军", "郑介志", "常春起",
"陈雯雯", "罗永祥", "黄鹏", "林静", "王倪传", "刘立", "张治国", "董磊"]
self.__proposer = "深圳大学"
# self.__inventorList = ["陈思平", "陈昕", "汪天富", "谭力海", "彭珏", "但果", "叶继伦", "覃正笛",
# "张旭", "张会生", "钱建庭", "丁惠君", "刁现芬", "沈圆圆", "周永进", "孔湉湉",
# "陆敏华", "张新宇", "孙怡雯", "李乔亮", "齐素文", "徐海华", "倪东", "刘维湘",
# "李抱朴", "黄炳升", "徐敏", "雷柏英", "胡亚欣", "何前军", "郑介志", "常春起",
# "陈雯雯", "罗永祥", "黄鹏", "林静", "王倪传", "刘立", "张治国", "董磊"]
# self.__proposer = "深圳大学"
self.__inventorList = [""]
self.__proposer = "北京科技大学"
self.__startDate = "2001-01-01"
return

Expand Down
231 changes: 0 additions & 231 deletions controller/ProgressController.py

This file was deleted.

6 changes: 0 additions & 6 deletions controller/__init__.py

This file was deleted.

Loading

0 comments on commit 72b2a5a

Please sign in to comment.