# -*- coding: utf-8 -*-
"""
Created on Sat Apr 18 20:11:02 2020
@author: Administrator
"""
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import os
import random
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection
from matplotlib import cm

# SimHei renders the Chinese labels; unicode_minus keeps minus signs readable.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
class SXSAnalyser:
    """Analyse scraped job postings: tokenise descriptions, compute word
    frequencies, and render word-cloud, bar, 2-D and 3-D charts."""

    def __init__(self, path, userDict='./stopwords/dict.txt', stopWords='./stopwords/sum.txt'):
        if not os.path.exists('./image'):
            os.makedirs('./image')
        self.__path = path
        self._df = pd.read_csv(filepath_or_buffer=path, engine='python')
        self.FilteredDf = self._df
        jieba.load_userdict(userDict)
        stopWords = np.array(pd.read_csv(filepath_or_buffer=stopWords, header=None)).T.tolist()[0]
        self.__DeRedundancy()
        self.__RemoveStopWords(stopWords)
        self.__getWordFrequency()
        self.prepareData()

    def __saveImage(self, name):
        # Derive the image name from the CSV name ('./xxx.csv' -> './image/xxxNAME.jpg').
        path = './image/' + self.__path[2:-4] + name + '.jpg'
        print(path)
        plt.savefig(path, bbox_inches='tight')
    def prepareData(self, select=100, minCount=10):
        self.__showWords = self.__wordsSelector(select, minCount)
        self.__wordVector = self.__getwordVec(self.__showWords)
    # De-duplicate the data
    def __DeRedundancy(self):
        self._df.dropna(subset=['jobUrl'], inplace=True)
        self._df.fillna(value='', inplace=True)
        self._df.drop_duplicates(subset=['title', 'companyName', 'jobDescrib'], inplace=True)
        self._df.index = range(len(self._df))
    # Preprocess: strip noise glyphs and punctuation, tokenise, drop stop words
    def __RemoveStopWords(self, stopWord):
        self._df['jobDescrib'] = self._df['jobDescrib'].apply(lambda x: re.sub('�0|�1|�6|�2|[【】◆]', '', x.lower()))
        self._df['title'] = self._df['title'].apply(lambda x: re.sub('�0|�1|�6|�2|[【】◆]', '', x.lower()))
        self._df['titleWord'] = self._df['title'].apply(lambda x: [i for i in jieba.lcut(x) if len(i) >= 2])
        self._df['jobDesWord'] = self._df['jobDescrib'].apply(lambda x: [i for i in jieba.lcut(x) if len(i) >= 2 and i not in stopWord])
    # Compute word frequencies
    def __getWordFrequency(self):
        wordDf = pd.DataFrame({'Word': np.concatenate(self._df.jobDesWord)})
        wordStat = wordDf.groupby('Word').size().reset_index(name='number')
        self._wordStat = wordStat.sort_values(by='number', ascending=False)
    def __wordsSelector(self, select, minCount):
        # Keep words seen at least minCount times, then take the top `select`
        # (a fraction of the vocabulary if select < 1, otherwise a fixed count).
        wordStat2 = self._wordStat.loc[self._wordStat['number'] >= minCount]
        if select < 1:
            num = int(len(wordStat2) * select)
        else:
            num = select
        wordStat3 = wordStat2.head(num)
        showWord = np.array(wordStat3['Word']).tolist()
        return showWord
    def __getwordVec(self, showWord):
        # Train Word2Vec on the tokenised descriptions and look up the
        # vectors of the selected words.
        sentences = self._df.jobDesWord.tolist()
        self._model = Word2Vec(sentences, min_count=10)
        wordVec = self._model.wv[showWord]
        return wordVec
    def draw2D(self):
        # Project word vectors to 2-D with t-SNE and colour points by KMeans cluster.
        model = TSNE(n_components=2)
        result = model.fit_transform(self.__wordVector)
        model = KMeans(5)
        label = model.fit_predict(result)
        cm_subsection = np.linspace(0, 1, 5)
        colors = [cm.rainbow(x) for x in cm_subsection]
        random.shuffle(colors)
        fig = plt.figure(figsize=(20, 12))
        for i, word in enumerate(self.__showWords):
            plt.scatter(result[i, 0], result[i, 1], color=colors[label[i]])
            plt.annotate(word, xy=(result[i, 0], result[i, 1]))
        fig.show()
        self.__saveImage('2D')
    def draw3D(self):
        # Project word vectors to 3-D with PCA and label each point with its word.
        model = PCA(n_components=3)
        result = model.fit_transform(self.__wordVector)
        fig = plt.figure(figsize=(20, 12))
        ax = fig.add_subplot(projection='3d')
        for i, word in enumerate(self.__showWords):
            ax.scatter3D(result[i, 0], result[i, 1], result[i, 2])
            ax.text(result[i, 0], result[i, 1], result[i, 2], word)
        self.__saveImage('3D')
    def drawCloud(self):
        # Repeat each word by its frequency so WordCloud sizes it accordingly.
        words = np.array(self._wordStat['Word']).tolist()
        nums = np.array(self._wordStat['number']).tolist()
        wordList = ''
        for i in range(len(nums)):
            wordList += (words[i] + ' ') * nums[i]
        fig = plt.figure(figsize=(20, 12))
        wordcloud = WordCloud(collocations=False, scale=8, font_path='simhei.ttf',
                              background_color='white').generate(wordList)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        fig.show()
        self.__saveImage('wordCloud')
    def drawBar(self, nums=35):
        # Bar chart of the top `nums` keywords by frequency.
        pltData = self._wordStat.head(nums)
        fig = plt.figure(figsize=(20, 12))
        plt.xlabel('关键字')          # "keyword"
        plt.ylabel('出现次数')        # "occurrences"
        plt.title('招聘关键字分析')   # "job-posting keyword analysis"
        plt.xticks(rotation=45)
        plt.bar(pltData['Word'], pltData['number'])
        fig.show()
        self.__saveImage('BarChart')
    def strongFilter(self, keywords=None, save=False):
        # Keep only the postings whose title contains any of the given keywords.
        if not keywords:
            print('Input KeyWords')
            return None
        Filter = '|'.join(keywords)
        newDF = self._df.loc[self._df['title'].str.contains(Filter)]
        if save:
            newDF.to_csv('./' + Filter + '.csv', encoding='GBK', index=False)
        return newDF
if __name__ == "__main__":
    ana = SXSAnalyser(path="./算法intern全国45.csv")
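    # Minimal usage sketch (assumes the CSV above and the default ./stopwords
    # files exist); each draw* call writes its figure into ./image/.
    ana.drawBar()
    ana.drawCloud()
    ana.draw2D()
    ana.draw3D()
    # strongFilter keeps postings whose title matches any keyword; 'python'
    # below is illustrative, not a keyword from the original script.
    # ana.strongFilter(keywords=['python'], save=True)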