-
Notifications
You must be signed in to change notification settings - Fork 4
/
amazon_review.py
118 lines (112 loc) · 6.06 KB
/
amazon_review.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
import codecs
import csv
import datetime
import fnmatch
import os
import re
import sys
import time

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3 renamed the module to html.parser; bind it under the old name
    # so that importing this script does not fail there.
    import html.parser as HTMLParser
# Patterns used to scrape a saved Amazon review page.  They are written as raw
# strings so that regex escapes such as "\-" or "\<" are not (invalid) string
# escapes; the compiled pattern text is the same as before.

# Product ASIN, taken from the "product-reviews/<ASIN>/ref=..." link of the page.
idre = re.compile(r'product\-reviews/([A-Z0-9]+)/ref\=cm_cr_arp_d_hist', re.MULTILINE | re.S)
# The part of the page that holds the reviews; everything else is discarded.
contentre = re.compile(r'cm_cr-review_list.*?>(.*?)(?:askReviewsPageAskWidget|a-form-actions a-spacing-top-extra-large|/html)', re.MULTILINE | re.S)
# One review block (normally ten per page); the per-review patterns below are
# applied to each block.
blockre = re.compile(r'a-section review">(.*?)report-abuse-link', re.MULTILINE | re.S)
# Star rating (a single character).
ratingre = re.compile(r'star-(.) review-rating', re.MULTILINE | re.S)
# Review title.
titlere = re.compile(r'review-title.*?>(.*?)</a>', re.MULTILINE | re.S)
# Link to the review (the href of the title anchor).
links = re.compile(r'review-title.*?href="(.*?)">', re.MULTILINE | re.S)
# Review date text.
datere = re.compile(r'review-date">(.*?)</span>', re.MULTILINE | re.S)
# "Verified Purchase" badge text (may be absent from a review block).
vpre = re.compile(r'data-hook="avp-badge.*?>(.*?)</span>', re.MULTILINE | re.S)
# Product variant, for pages that merge several models (currently disabled).
#formatre = re.compile(r'data-hook="format-strip.*?>(.*?)</a>', re.MULTILINE | re.S)
# Review body.
reviewre = re.compile(r'base review-text">(.*?)</span', re.MULTILINE | re.S)
# Reviewer profile id.
userre = re.compile(r'profile\/(.*?)["/].*?\<\/div\>.*?\<\/div\>.', re.MULTILINE | re.S)
# Number of comments on the review (may be absent from a review block).
comnumre = re.compile(r'review-comment-total.*?>([0-9]+)</span>', re.MULTILINE | re.S)
# Number of "helpful" votes: "1,234", "12" or the word "One".
helpfulre = re.compile(r'review-votes.*?([0-9]+,[0-9]+|[0-9]+|One).*?</span>', re.MULTILINE | re.S)
# Total number of reviews shown on the page.
totalre = re.compile(r'data-hook="total-review-count.*?>(.*?)</span>')
# Average star rating shown on the page.
averagere = re.compile(r'data-hook="rating-out-of-text.*?>(.*?) out of 5 stars</span>')
def get_review_filesnames(input_dir):
    """Yield the path of every ``*.html`` file found under *input_dir*.

    The directory tree is walked recursively with ``os.walk``.  Directories
    and file names are visited in sorted order so that repeated runs over the
    same tree produce the paths in the same sequence.

    Args:
        input_dir: Root directory to search.

    Yields:
        ``os.path.join(root, filename)`` for each matching file.
    """
    for root, dirnames, filenames in os.walk(input_dir):
        # os.walk (top-down) descends into `dirnames` in list order, so
        # sorting it in place fixes the traversal order.
        dirnames.sort()
        for filename in sorted(fnmatch.filter(filenames, '*.html')):
            yield os.path.join(root, filename)
# Matches any HTML tag.  Compiled once here rather than once per review block.
_TAG_RE = re.compile('<.*?>', re.MULTILINE | re.S)


def _unescape_html(text):
    """Return *text* with HTML character references (``&amp;`` etc.) decoded."""
    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # the supported replacement on Python 3.
        from html import unescape
    except ImportError:
        # Python 2 has no html module: use the method the script relied on.
        unescape = HTMLParser.HTMLParser().unescape
    return unescape(text)


def _clean_markup(fragment):
    """Replace every HTML tag in *fragment* with a space and decode entities."""
    return _unescape_html(_TAG_RE.sub(' ', fragment))


def main(input_dir=r"F:\project",
         outfile=r"C:\Users\1111\Desktop\nameOfOutput.csv"):
    """Extract the reviews of saved Amazon review pages into one CSV file.

    Every file returned by ``get_review_filesnames(input_dir)`` is read as
    UTF-8.  Pages in which ``idre`` finds no product ASIN are skipped.  For
    each remaining page the ASIN and file path are printed and one CSV row is
    written per review block found by ``blockre``.

    Args:
        input_dir: Directory searched for saved review pages.  Defaults to
            the location that used to be hard-coded.
        outfile: Path of the CSV file to create; it is opened in ``'w'`` mode
            with UTF-8 encoding.  Defaults to the path that used to be
            hard-coded.

    Raises:
        IndexError: If a page that has an ASIN lacks the total review count,
            the average rating or the review list, or if a review block lacks
            its link, title, text, rating or date.
        ValueError: If the rating is not an integer or the date text does not
            match ``'%B%d,%Y'`` once its first word and spaces are removed.
    """
    # One timestamp for the whole run, written on every row.
    record_date = time.strftime("%Y/%m/%d/%H:%M:%S")

    if sys.version_info[0] >= 3:
        def as_cell(text):
            return text
    else:
        # Python 2's csv module cannot write non-ASCII unicode, so such
        # characters are dropped from the text columns.  Calling the method
        # (rather than unicode.encode) also accepts plain byte strings.
        def as_cell(text):
            return text.encode('ascii', 'ignore')

    # Column names, in the order the values are written for each review.
    header = ['Product_ASIN', 'review_date', 'date_format', 'total_review', 'average_star', 'title', 'review_content', 'star', 'star_class', 'reply_num', 'helpfulVotes', 'user', 'VP', 'link_id', 'record_date']

    with codecs.open(outfile, 'w', encoding='utf8') as out:
        writer = csv.writer(out, lineterminator='\n')
        writer.writerow(header)
        for filepath in get_review_filesnames(input_dir):
            with codecs.open(filepath, mode='r', encoding='utf8') as page_file:
                htmlpage = page_file.read()

            id_match = idre.search(htmlpage)
            if not id_match:
                # Not a review page (no ASIN link): nothing to extract.
                continue
            id_ = id_match.group(1)
            total_review = totalre.findall(htmlpage)[0]
            average_star = averagere.findall(htmlpage)[0]
            print(id_, filepath)

            # Keep only the part of the page that contains the reviews.
            htmlpage = contentre.findall(htmlpage)[0]
            for block in blockre.findall(htmlpage):
                link_id = 'https://www.amazon.com' + links.findall(block)[0]
                # Title and body contain tags, runs of spaces and HTML
                # entities; strip the tags and decode the entities.
                title = _clean_markup(titlere.findall(block)[0])
                reviewtext = _clean_markup(reviewre.findall(block)[0])

                vpmatch = vpre.findall(block)
                vp = vpmatch[0] if vpmatch else u'Unverified'

                rating = int(ratingre.findall(block)[0])

                # Drop the first space-separated word of the date text (e.g. a
                # leading "on") and the remaining spaces: "January5,2018".
                date = ''.join(datere.findall(block)[0].split(' ')[1:])
                date_format = datetime.datetime.strptime(date, '%B%d,%Y')

                usermatch = userre.findall(block)
                user = usermatch[0] if usermatch else u'ANONYMOUS'

                helpmatch = helpfulre.findall(block)
                helptot = 0
                if helpmatch:
                    # The vote count is "1,234", "12" or the word "One".
                    helptot = int(helpmatch[0].replace(',', '').replace('One', '1'))

                commentsmatch = comnumre.findall(block)
                comments = int(commentsmatch[0]) if commentsmatch else 0

                # Four and five stars count as a positive review.
                binaryrating = 'positive' if rating >= 4 else 'negative'

                writer.writerow([
                    id_, as_cell(date), date_format, total_review,
                    average_star, as_cell(title), as_cell(reviewtext), rating,
                    binaryrating, comments, helptot, as_cell(user),
                    as_cell(vp), as_cell(link_id), record_date,
                ])
# Run the extraction only when the file is executed as a script.
if __name__ == '__main__':
    main()