-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdepends.py
332 lines (285 loc) · 10.7 KB
/
depends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 18:00:24 2019
@author: autol
"""
#%%
#import time
import zhon.hanzi,zhon.cedict,requests
import os,re,json,string,subprocess,configparser
from bs4 import BeautifulSoup
# File suffixes that are treated as PowerPoint documents.
docs_ppt = [
    '.ppt',
    '.pptx',
    '.pptm',
]
#%%
class Var(object):
    """Plain container for the module's runtime settings.

    Every setting defaults to 0 and is stored unchanged as an attribute
    of the same name.

    Parameters:
        target_file: directory whose files are renamed.
        tika_url_local: base URL of the local Tika server.
        tika_url_remote: base URL of the remote Tika server.
        txt_n: length of the generated name.
        img_correct: flag, whether images are corrected.
        img_clean: flag, whether images are cleaned (for blurry images).
        img_cut: flag, whether images are cropped.
        img_thumbnail: flag, whether images are shrunk.
    """

    def __init__(self,
                 target_file=0,
                 tika_url_local=0,
                 tika_url_remote=0,
                 txt_n=0,
                 img_correct=0,
                 img_clean=0,
                 img_cut=0,
                 img_thumbnail=0,
                 ):
        self.target_file = target_file
        self.tika_url_local = tika_url_local
        self.tika_url_remote = tika_url_remote
        self.txt_n = txt_n
        self.img_correct = img_correct
        self.img_clean = img_clean
        # The config writer reads these two attributes, so they must exist
        # even when the caller never passes them. They are appended after the
        # original parameters so positional callers keep working.
        self.img_cut = img_cut
        self.img_thumbnail = img_thumbnail
# Module-wide settings object. read_config() overwrites its attributes with
# the values found in the config file.
var = Var(
target_file = '.',
tika_url_local = 'http://127.0.0.1:3232/',
tika_url_remote = 'http://45.78.19.198:3232/',
txt_n=60,
img_correct=1,
img_clean=1,
)
# Name of the config file read and written by the config helpers.
cfgfile = 'conf.txt'
# ASCII punctuation followed by zhon's CJK punctuation; txt_clean() splits on it.
punc_all = string.punctuation + zhon.hanzi.punctuation
# Concatenation of string.printable and zhon.cedict.all.
char_all = string.printable + zhon.cedict.all
def txt_clean(t,tag=','):
    """Split ``t`` on punctuation and newlines and re-join the pieces with ``tag``.

    All whitespace inside each piece is removed, and pieces that end up
    empty are dropped so the result never contains doubled separators.

    Parameters:
        t: text to clean.
        tag: separator placed between the surviving pieces.

    Returns:
        The cleaned string.
    """
    # Escape the punctuation before placing it in a character class.
    # Unescaped, the ``\`` in string.punctuation escapes the following ``]``
    # and is itself never treated as a separator.
    splitter = r'[\n%s]' % re.escape(punc_all)
    stripped = (re.sub(r'\s+', '', piece) for piece in re.split(splitter, t))
    # Filter after stripping so whitespace-only pieces are discarded too.
    return tag.join(piece for piece in stripped if piece)
def txt_clean_type(t,suffix,is_cn=0):
    """Remove format-specific noise from ``t`` and normalise it with txt_clean.

    Parameters:
        t: extracted text.
        suffix: file suffix (including the dot) the text came from.
        is_cn: when truthy, ASCII letters and digits are removed as well.

    Returns:
        The text with its pieces joined by ``'_'``.
    """
    # (applies?, pattern to delete), evaluated in the original order.
    rules = (
        (suffix in docs_ppt, r'Presentation1|演示文稿|幻灯片'),  # PowerPoint boilerplate
        (suffix == '.html', r'\<.*?\>'),                          # html tags
        (suffix in ['.doc','.docx'], r'\[.*?\]'),                 # docx bracket tags
        (is_cn, r'[a-zA-Z0-9]+'),                                 # keep Chinese only
    )
    for applies, pattern in rules:
        if applies:
            t = re.sub(pattern, '', t)
    return txt_clean(t, tag='_')
def get_tika_put(file,url,headers,t=''):
    """PUT the bytes of ``file`` to ``url`` and return the response text.

    Parameters:
        file: path of the file to upload; opened in binary mode.
        url: target URL.
        headers: request headers.
        t: not read; present only in the signature.

    Returns:
        The response body decoded with the encoding detected from its content.
    """
    with open(file, 'rb') as payload:
        response = requests.put(url=url, data=payload, headers=headers)
        # Use the detected encoding so Chinese text is not garbled.
        response.encoding = response.apparent_encoding
        return response.text
# Uses the remote server
def get_tika_ocr(file):
    """Send ``file`` to the ``tika`` endpoint under ``tika_url_ocr`` with OCR headers.

    Returns the response text produced by get_tika_put.
    """
    endpoint = tika_url_ocr+'tika'
    ocr_headers = {
        'X-Tika-OCRLanguage':'chi_sim',
        'X-Tika-OCRpageSegMode':'6'
    }
    return get_tika_put(file, endpoint, ocr_headers)
# Uses the local server
def get_tika_rmeta(file):
    """Send ``file`` to the ``rmeta/text`` endpoint under ``tika_url_txt``, asking for JSON.

    Returns the response text produced by get_tika_put.
    """
    endpoint = tika_url_txt+'rmeta/text'
    json_headers = {'Accept': 'application/json',}
    return get_tika_put(file, endpoint, json_headers)
def get_tika_meta(file):
    """Send ``file`` to the ``meta`` endpoint under ``tika_url_txt``, asking for JSON.

    Returns the response text produced by get_tika_put.
    """
    endpoint = tika_url_txt+'meta'
    json_headers = {'Accept': 'application/json',}
    return get_tika_put(file, endpoint, json_headers)
def get_tika_pdf(file):
    """Send ``file`` to the ``tika`` endpoint under ``tika_url_txt`` with PDF/OCR headers.

    The headers request inline-image extraction and simplified-Chinese OCR.
    Returns the response text produced by get_tika_put.
    """
    endpoint = tika_url_txt+'tika'
    pdf_headers = {
        'X-Tika-PDFextractInlineImages':'true',
        'X-Tika-OCRLanguage':'chi_sim',
        'X-Tika-OCRpageSegMode':'6'
    }
    return get_tika_put(file, endpoint, pdf_headers)
def get_curl_ocr(file):
    """Build the curl command that uploads ``file`` to ``tika_url_ocr`` + 'tika' for OCR.

    ``file`` is inserted into the command without any quoting.
    Returns the command as a string.
    """
    # Headers that are not sent here:
    #   -H "X-Tika-PDFOcrStrategy:ocr_only"
    #   -H "X-Tika-PDFextractInlineImages:true"
    template = '''curl -T %s %s -H "X-Tika-OCRLanguage: chi_sim" -H "X-Tika-OCRpageSegMode:6" '''
    endpoint = tika_url_ocr+'tika'
    return template % (file, endpoint)
def get_tesseract(file):
    """Build the tesseract command that OCRs ``file`` to stdout (chi_sim+eng).

    ``file`` is inserted into the command without any quoting.
    Returns the command as a string.
    """
    template = 'tesseract %s stdout -l chi_sim+eng'
    return template % (file,)
# return 'curl -T %s %s -H "Accept: application/json"'%(file,var.tika_url_local+'meta') # return type:list
def get_curl_pdf(file):
    """Build the curl command that uploads ``file`` to ``tika_url_ocr`` + 'tika' with PDF/OCR headers.

    ``file`` is inserted into the command without any quoting.
    Returns the command as a string.
    """
    template = '''curl -T %s %s -H "X-Tika-PDFextractInlineImages:true" -H "X-Tika-OCRLanguage: chi_sim" -H "X-Tika-OCRpageSegMode:6" '''
    endpoint = tika_url_ocr+'tika'
    return template % (file, endpoint)
def get_curl_meta(file):
    """Build the curl command that uploads ``file`` to ``tika_url_txt`` + 'meta', asking for JSON.

    ``file`` is inserted into the command without any quoting.
    Returns the command as a string.
    """
    # Header that is not sent here: -H "X-Tika-OCRLanguage: chi_sim+eng"
    template = 'curl -T %s %s -H "Accept: application/json"'
    endpoint = tika_url_txt+'meta'
    return template % (file, endpoint)
def get_curl_rmeta(file):
    """Build the curl command that uploads ``file`` to ``tika_url_txt`` + 'rmeta/text', asking for JSON.

    ``file`` is inserted into the command without any quoting.
    Returns the command as a string.
    """
    # Header that is not sent here: -H "X-Tika-OCRLanguage: chi_sim+eng"
    template = 'curl -T %s %s -H "Accept: application/json"'
    endpoint = tika_url_txt+'rmeta/text'
    return template % (file, endpoint)
def subprocess_cmd(func,file):
    """Run the shell command built by ``func(file)`` and return its output.

    Parameters:
        func: callable that takes ``file`` and returns a shell command string.
        file: argument handed to ``func``.

    Returns:
        The command's stdout decoded as UTF-8, or ``''`` when the command
        exits with a non-zero status (the error is printed).
    """
    # Build the command once so the printed and the executed command match.
    cmd = func(file)
    print('cmd:', cmd)
    try:
        return subprocess.check_output(cmd, shell=True).decode('utf-8')
    except subprocess.CalledProcessError as e:
        print(e)
        # Previously the result variable was unbound on this path and the
        # return raised UnboundLocalError; an empty string is a safe
        # "no output" value.
        return ''
def subprocess_Popen(cmd):
    """Start ``cmd`` in a shell and return an iterator over its output lines.

    stderr is merged into stdout. Each item is one line as ``bytes``;
    iteration stops when ``readline`` returns ``b''``.
    """
    proc = subprocess.Popen(
        cmd,
        shell=1,
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE,
    )
    read_line = proc.stdout.readline
    return iter(read_line, b'')
def get_tika_version(url):
    """Probe the ``version`` endpoint under ``url`` with a 0.5 s timeout.

    Prints the response text on success, or the error on failure.

    Returns:
        ``url`` when the request succeeded, otherwise ``None``.
    """
    try:
        reply = requests.get(url=url+'version', timeout=0.5)
        print(reply.text)
    except Exception as err:
        print('Tika Error', err)
        return None
    else:
        return url
def setup_local_tika():
    """Choose which Tika server URLs to use, starting a local server if needed.

    Returns:
        A ``(url, url_r)`` pair. Either element may be ``None`` when the
        corresponding probe fails.
    """
    # NOTE(review): the source lost its indentation; the nesting below is a
    # reconstruction. It assumes the jar start-up only runs when the local
    # probe failed - confirm against the original file.
    print('Now Start Tika Server...')
    url,url_r = var.tika_url_local,var.tika_url_remote
    if get_tika_version(var.tika_url_local):
        # Local server answers: keep it, and drop the remote one if unreachable.
        if not get_tika_version(var.tika_url_remote):
            url_r = None
    else:
        # Local server is down: fall back to the remote URL (or None) for both.
        url = url_r = get_tika_version(var.tika_url_remote)
        if os.path.exists('tika-server.jar'):
            try:
                # `start /B` is presumably the Windows cmd builtin, which would
                # make this start-up Windows-only - TODO confirm.
                cmd = 'start /B java -Djava.awt.headless=true -jar tika-server.jar --config=tika-config.xml --host=127.0.0.1 --port=3232'
                # Echo the server output until a line containing 'Started' appears.
                for output_line in subprocess_Popen(cmd):
                    print(output_line)
                    if 'Started' in str(output_line):
                        break
#                if(subprocess.check_call(cmd,shell=1)==0):
#                    print('Start Ok!')
#                    time.sleep(1.5)
                # Probe the local server again; url becomes None if it still fails.
                url = get_tika_version(var.tika_url_local)
            except Exception as e:
                print(e,'Select Remote Server...')
    return url,url_r
# Runs at import time: probes the servers (and may start a local one).
# tika_url_txt is the first returned URL, tika_url_ocr the second.
tika_url_txt,tika_url_ocr = setup_local_tika()
def get_tika_version1():
    """Query the local Tika ``version`` endpoint through curl.

    Prints the raw curl output, or the error when curl exits non-zero.

    Returns:
        ``True`` when curl exited with status 0, ``False`` otherwise.
    """
    version_url = var.tika_url_local+'version'
    try:
        raw = subprocess.check_output('curl %s ' % version_url, shell=1)
        print(raw)
    except subprocess.CalledProcessError as err:
        print(err)
        return False
    return True
def get_curl_ocr_txt(file):
    """Run the curl OCR command for ``file`` and parse its output with get_tika_txt."""
    raw_output = subprocess_cmd(get_curl_ocr, file)
    return get_tika_txt(raw_output)
def get_tika_txt(txt,jtype=1):
    """Convert a raw Tika response into a dict or plain text.

    Parameters:
        txt: response text.
        jtype: for JSON input, ``1`` converts the whole document to a dict;
            any other value converts its first element.

    Returns:
        - a ``dict`` for JSON input;
        - for input containing ``'xmlns'``, the text of the first
          ``<div class="ocr">``, or ``None`` when there is no such element;
        - ``txt`` unchanged otherwise (including empty or ``None`` input).
    """
    # Nothing to parse; this also keeps None from reaching json.loads.
    if not txt:
        return txt
    if is_json(txt):  # json type
        data = json.loads(txt)
        return dict(data) if jtype == 1 else dict(data[0])
    if 'xmlns' in txt:  # html type
        # Name the parser explicitly: without it bs4 picks whichever parser
        # is installed, so results can differ between machines.
        soup = BeautifulSoup(txt, 'html.parser')
        node = soup.find("div", class_="ocr")
        return node.text if node else node
    return txt
def get_sizes_human(file):
    """Return the size of ``file`` as a human-readable string.

    The byte count is divided by 1024 until it drops below 1024 and is
    formatted with one decimal and its unit, e.g. ``'2.0 KB'``.

    Parameters:
        file: path of an existing file.

    Returns:
        The formatted size; sizes of 1024 TB or more are reported in PB.
    """
    num = os.path.getsize(file)
    for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Past TB the loop used to fall through and return None.
    return "%3.1f %s" % (num, 'PB')
def print_log(*args, **kwargs):
    """Print to stdout and append the same output to ``log.txt`` (UTF-8).

    Accepts the same arguments as ``print``.
    """
    print(*args, **kwargs)
    with open('log.txt', mode="a", encoding='utf-8') as log_handle:
        print(*args, file=log_handle, **kwargs)
def rename_file(file,t):
    """Log the text found for ``file`` and rename the file after it.

    Every ``'_mod'`` occurrence is removed from ``file`` before it is
    logged and passed to os_rename.
    """
    # str.replace is a no-op when '_mod' is absent, so no membership test is needed.
    source = file.replace('_mod','')
    print_log('>>> 找到内容: 【%s】=>【%s】'%(source,t))
    os_rename(source,t)
def os_rename(file,t):
    """Rename ``file`` to ``<t>_vs_<old name><suffix>`` in the same directory.

    Nothing happens when ``file`` is empty, or when the old name already
    starts with ``t`` followed by ``'_vs_'``. If the target name is taken,
    ``'_copy'`` is appended to the stem until a free name is found.

    Parameters:
        file: path of the file to rename.
        t: text placed in front of the old name.
    """
    if not file:
        return
    dirname, basename = os.path.split(file)
    oldname, suffix = os.path.splitext(basename)
    # Already renamed with this text: leave the file alone.
    if t == oldname.split('_vs_')[0]:
        return
    stem = t + '_vs_' + oldname
    # os.path.join instead of a hard-coded '\\' keeps this portable.
    nfile = os.path.join(dirname, stem + suffix)
    # Check for collisions explicitly: os.rename raises FileExistsError only
    # on Windows and silently overwrites elsewhere. The old fallback passed a
    # full path back in as ``t`` and produced a nonsensical target.
    while os.path.exists(nfile):
        stem += '_copy'
        nfile = os.path.join(dirname, stem + suffix)
    print_log('>>> 重命名: 【%s】=>【%s】'%(file,nfile))
    os.rename(file,nfile)
def mkdir_file(path):
    """Create the directory ``path`` unless something already exists there."""
    if os.path.exists(path):
        return
    os.mkdir(path)
def parse_subpath(path,file):
    """Return ``file`` placed under ``path``, creating ``path`` when missing.

    When ``path`` is falsy, ``file`` is returned unchanged and nothing is created.
    """
    if path:
        if not os.path.exists(path):
            os.mkdir(path)
        return os.path.join(path, file)
    return file
def remove_file(file):
    """Delete ``file`` if it is non-empty and exists; print any error instead of raising."""
    if not file or not os.path.exists(file):
        return
    print('>>> del', file)
    try:
        os.remove(file)
    except Exception as err:
        print(err)
def check_size(file,is_img=0):
    """Tell whether ``file`` is larger than the size limit.

    The limit is 500 KB when ``is_img`` is truthy and 30 MB otherwise.
    """
    limit = 500 * 1024 if is_img else 30 * 1024 ** 2
    return os.path.getsize(file) > limit
def get_file_name(file):
    """Return ``file`` without its final suffix (the directory part is kept)."""
    stem, _ext = os.path.splitext(file)
    return stem
def get_file_suffix(file):
    """Return the final suffix of ``file`` including the dot, or ``''`` when there is none."""
    _stem, ext = os.path.splitext(file)
    return ext
def get_mod_name(file,tag='_mod',suffix=0):
    """Derive the name of the modified copy of ``file``.

    Parameters:
        file: original path.
        tag: marker inserted before the suffix; skipped when ``file``
            already contains it.
        suffix: suffix of the result; a falsy value keeps ``file``'s own suffix.

    Returns:
        ``<file without suffix><tag><suffix>``.
    """
    stem, own_suffix = os.path.splitext(file)
    if tag in file:
        tag = ''
    if not suffix:
        suffix = own_suffix
    return stem + tag + suffix
def is_json(s):
    """Tell whether ``s`` can be parsed as a JSON document.

    Returns:
        ``True`` when ``json.loads(s)`` succeeds, ``False`` otherwise.
        Values that are not str/bytes (for example ``None``) also give ``False``.
    """
    try:
        json.loads(s)
    # json.loads raises TypeError (not ValueError) for non-string input.
    except (ValueError, TypeError):
        return False
    return True
#%% optional config
def process_config():
    """Load the settings from the config file, regenerating the file once on failure.

    A missing file is created with write_config first. If loading fails
    for any reason, the file is deleted, rewritten and read once more;
    a second failure is only printed.

    Returns:
        The module-level ``var`` object.
    """
    try:
        # Generate the default config when none exists yet.
        config_missing = not os.path.exists(cfgfile)
        if config_missing:
            write_config()
        read_config()
    except Exception as first_err:
        print('>>> 配置文件出错 %s ,删除...'%first_err)
        if os.path.exists(cfgfile):
            os.remove(cfgfile)
        try:
            write_config()
            read_config()
        except Exception as second_err:
            # Pre-checks for known config problems could be added here.
            print('>>> 配置文件再次生成失败 %s ...'%second_err)
    return var
def write_config():
    """Write the current settings in ``var`` to the config file.

    Each value is stored with a trailing ``# ...`` inline comment, which
    read_config's parser strips again. The file is written as UTF-8 with a
    BOM, and a message is printed afterwards.
    """
    cfg = configparser.ConfigParser(allow_no_value=1,
                                    inline_comment_prefixes=('#', ';'))
    # These two attributes may be missing from the settings object; fall back
    # to 0 instead of failing with AttributeError before a file is written.
    img_cut = getattr(var, 'img_cut', 0)
    img_thumbnail = getattr(var, 'img_thumbnail', 0)
    cfg['config'] = {
        'target_file': var.target_file+' # 重命名目录,留空就是当前目录',
        'tika_url_local': var.tika_url_local+' # tika本地',
        'tika_url_remote': var.tika_url_remote+' # tika远程',
        'img_cut': str(img_cut)+' # 是否裁剪',
        'img_thumbnail':str(img_thumbnail)+' # 是否缩小,推荐',
        'img_correct':str(var.img_correct)+' # 是否修正',
        'img_clean':str(var.img_clean)+' # 是否清理,模糊图片用',
        'txt_n':str(var.txt_n)+' # 名字长度',
    }
    with open(cfgfile, 'w',encoding='utf-8-sig') as configfile:
        cfg.write(configfile)
    print('>>> 重新生成配置 %s ...'%cfgfile)
def read_config():
    """Read the config file and copy its ``[config]`` values onto ``var``.

    The three string settings are stored as read; the remaining settings
    are converted with ``int``. A missing section or key, or a non-integer
    value, raises.

    Returns:
        The ``[config]`` section as a dict.
    """
    cfg = configparser.ConfigParser(
        allow_no_value=1,
        inline_comment_prefixes=('#', ';'),
    )
    cfg.read(cfgfile, encoding='utf-8-sig')
    section = cfg['config']
    # Same assignment order as before, so a failure part-way leaves the
    # same attributes updated.
    for key in ('target_file', 'tika_url_local', 'tika_url_remote'):
        setattr(var, key, section[key])
    for key in ('img_cut', 'img_thumbnail', 'img_correct', 'img_clean', 'txt_n'):
        setattr(var, key, int(section[key]))
    return dict(cfg.items('config'))