-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPatentHubSearchCriteriaFormatter.py
234 lines (204 loc) · 9.81 KB
/
PatentHubSearchCriteriaFormatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
from ply import lex
from wcwidth import wcswidth
KEYWORD_regex_string = r'(t|ti|ts|s|tsc|desc|tscd|eti|cti|ab|eab|cab|cl|ecl|d|eds|ap|fap|addr|aee|caee|in|ag|at|n|dn|an|pr|dd|ddy|ddm|ad|ady|adm|pctDate|ipc|ipc-section|ipc-class|ipc-subclass|ipc-group|ipc-subgroup|ipc-main|ipcm-section|ipcm-class|ipcm-subclass|ipcm-group|ipcm-subgroup|uspc|uspcc|fi|ft|loc|cpc|ls|currentStatus|type|cc|acc|kc|lang|apt|ap-zip|country|province|city|aeet|caeet|agc|legalTag|legalEvent|maintainYears|citedCount|citingCount|level|judgment\.date|judgment\.title|judgment\.caseId|judgment\.court|judgment\.province|judgment\.city|judgment\.accuser|judgment\.defendant)(?=\s*:)'
regex = re.compile(KEYWORD_regex_string, flags = re.IGNORECASE)
reserved = {
'and' : 'AND'
,'or' : 'OR'
,'not' : 'NOT'
,'to' : 'TO'
}
# List of token names. This is always required
tokens = (
'KEYWORD'
,'EQUAL'
,'VALUE'
,'CONNECT'
,'LPARENTHESIS'
,'RPARENTHESIS'
,'LBRACKET'
,'RBRACKET'
) + tuple(reserved.values())
# Regular expression rules for simple tokens
t_EQUAL = r':'
# t_CONNECT = r'\(\s*[1-9]?[wWnN]\s*\)'
t_LPARENTHESIS = r'\('
t_RPARENTHESIS = r'\)'
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
def t_KEYWORD(t):
r'(t|ti|ts|s|tsc|desc|tscd|eti|cti|ab|eab|cab|cl|ecl|d|eds|ap|fap|addr|aee|caee|in|ag|at|n|dn|an|pr|dd|ddy|ddm|ad|ady|adm|pctDate|ipc|ipc-section|ipc-class|ipc-subclass|ipc-group|ipc-subgroup|ipc-main|ipcm-section|ipcm-class|ipcm-subclass|ipcm-group|ipcm-subgroup|uspc|uspcc|fi|ft|loc|cpc|ls|currentStatus|type|cc|acc|kc|lang|apt|ap-zip|country|province|city|aeet|caeet|agc|legalTag|legalEvent|maintainYears|citedCount|citingCount|level|judgment\.date|judgment\.title|judgment\.caseId|judgment\.court|judgment\.province|judgment\.city|judgment\.accuser|judgment\.defendant)(?=\s*:)'
t.type = 'KEYWORD'
t.value = t.value.lower()
return t
# A regular expression rule with some action code
def t_VALUE(t):
# r'(?P<quote>")?[^\(\)\[\]\s=]+(?(quote)")'
r'"[^"]+"|[^\s\(\)\[\]]+'
t.type = reserved.get(t.value.lower(), 'VALUE')
t.value = re.sub(' +', ' ', t.value) #将多个连续的空格替换为单个空格
t.value = re.sub('(?<=")[\s\n\t ]+|[\s\n\t]+(?=")', '', t.value) #将紧靠双引号内侧的空格删除
return t
# Define a rule so we can track line numbers
def t_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
# A string containing ignored characters (spaces and tabs)
t_ignore = ' \t'
# Error handling rule
def t_error(t):
print("Illegal character '%s'" % t.value[0])
t.lexer.skip(1)
def scFormatter(string):
string = regex.sub(lambda i: i.group().lower(), string)
# Build the lexer
lexer = lex.lex()
if string.strip() == '':
return
# Give the lexer some input
lexer.input(f'{string}')
# Tokenize
result = []
newline = ''
step = ' ' * 8
align = ' ' * 4
indent = ''
for tok in lexer:
if not tok: break # No more input
# print(tok.type, tok.value, tok.lexpos, sep = '\t'*2)
if tok.type in ("LPARENTHESIS", "LBRACKET"):
######测试代码{
#如果一行代码以(or|and|not) (结尾且不仅仅只包括or|and|not,那么就在or|and|not前换行
match = re.match('^(?P<head>.*?)\s+(?P<tail>(?:or|and|not))\s*$', newline)
if match:
if match.group('head'):
result.append(match.group('head'))
newline = indent[:-4] + match.group('tail').ljust(4, ' ')
# if match.group('tail') == 'or':
#or 在行首,额外增加一个空格,以使其能够与and对齐
# newline += ' '
# result.append(newline)
######测试代码}
elif newline.endswith(' : '):
pass
# elif re.search(t_CONNECT + '\s*$', newline):
# pass
elif newline.strip() != '':
newline = newline.rstrip()
newline += tok.value
result.append(newline)
indent += step
newline = indent
elif tok.type in ("RPARENTHESIS", "RBRACKET"):
indent = indent[:-8]
if 0 < len(newline.strip()) < 40 and result[-1].endswith('('):
newline = result.pop().rstrip() + newline.strip()
else:
result.append(newline.rstrip())
newline = ''
newline += indent
newline += tok.value
result.append(newline)
newline = indent
# elif tok.type == "LBRACKET":
# newline += tok.value + ''
# elif tok.type == "RBRACKET":
# newline = newline.rstrip() + tok.value
# result.append(newline)
# newline += indent
elif tok.type == "KEYWORD":
match = re.match('(?P<head>\S+?)\s+(?P<tail>or|and|not)\s*$', newline)
if match:
# if match.group('head'):
result.append(match.group('head'))
newline = indent[:-4] + match.group('tail').ljust(4, ' ')
newline += tok.value + ' '
elif tok.type == "EQUAL":
newline += tok.value + ' '
elif tok.type == "VALUE":
if newline.strip() == '' and result[-1].endswith(')'):
#主要用来处理公司名称中出现的括号
#当公司名称中有括号时,不要换行
newline = result.pop()
newline += tok.value + ' '
elif tok.type == "CONNECT":
tok.value = re.sub('\(\s*([1-9]?)\s*([wn])\s*\)', '(\g<1>\g<2>)', tok.value.lower())
if newline.strip() == '' and result[-1].endswith(')'):
if result[-1].strip() != ')':
#连字符紧跟在右括号后边时,不换行
newline = result.pop() + ' '
newline += tok.value + ' '
else:
result.append(newline.rstrip())
newline = indent
newline = newline[:-4]
newline += tok.value.ljust(4, ' ')
else:
newline += tok.value + ' '
elif tok.type == "TO":
newline += tok.value.lower() + ' '
elif tok.type == "AND":
if newline.strip() == '':
if len(newline) < 8:
return scFormatter(f'({string})')
newline = newline[:-4]
newline += tok.value.lower() + ' '
elif tok.type == "OR":
if wcswidth(newline) > 100:
result.append(newline.rstrip())
newline = '' + indent
if newline.strip() == '':
if len(newline) < 8:
return scFormatter(f'({string})')
newline = newline[:-4]
newline += tok.value.lower() + ' '
else:
newline += tok.value.lower() + ' '
elif tok.type == "NOT":
if newline.strip() == '':
if len(newline) < 8:
return scFormatter(f'({string})')
newline = newline[:-4]
newline += tok.value.lower() + ' '
else:
if newline.strip():
result.append(newline.rstrip())
while indent:
indent = indent[:-8]
result.append(indent + ')')
for i in range(1, len(result) - 1):
if result[i] == '':
continue
else:
l = i - 1
while(l and result[l] == ''):
l -= 1
n = i + 1
# if result[l].endswith('(') and len(result[i].strip()) < 50 and result[n].strip() == ')':
# result[l] += result[i].strip() + result[n].strip()
# result[i] = ''
# result[n] = ''
# elif result[i].lstrip().startswith('(w)') or result[i].lstrip().startswith('(n)'):
# result[l] += ' ' + result[i].strip()
# result[i] = ''
else:
result = '\n'.join(filter(None, result))
result = re.sub('\([\n\s]*\)', '', result)#删除空括号对(自动补全括号时,有可能会产生空括号对。
result = re.sub('^\s*(and|or|not)*\s*$', '', result, flags = re.M)#删除只有and or not的行以及空行。
result = re.sub(r'[A-Z]([A-Za-z-]*?[A-Za-z])?(?=\s*=\s*)', lambda matched : matched.group(0).lower(), result)
result = re.sub(r'(?P<quote>")?(?P<ipc>\b(?P<section>[A-H])(?(section)(?P<class>\d{2})?)(?(class)(?P<subclass>[A-Z])?)(?(subclass)(?P<group>\d{1,})?)(?(group)(?:/(?P<subgroup>\d{1,}))?)\b)(?(quote)")',
lambda matched : matched.group('ipc').upper(), result, flags = re.IGNORECASE)
#将所有ipc分类号转为大写
# result = re.sub(r'\b(?P<country>am|ap|ar|at|au|ba|be|bg|br|by|ca|ch|cl|cn|co|cr|cs|cu|cy|cz|dd|de|dk|do|dz|ea|ec|ee|eg|ep|es|fi|fr|gb|gc|ge|gr|gt|hk|hn|hr|hu|id|ie|il|in|is|it|jo|jp|ke|kr|kz|lt|lu|lv|ma|mc|md|me|mn|mo|mt|mw|mx|my|ni|nl|no|nz|oa|pa|pe|ph|pl|pt|ro|rs|ru|se|sg|si|sk|sm|su|sv|th|tj|tr|tt|tw|ua|us|uy|uz|vn|wo|yu|za|zm|zw|py|bo|ve|eu|sa|kg|tn|ae|bh|bn|lb)\b(?!\s*=)', lambda matched : matched.group('country').upper(), result, flags = re.IGNORECASE)
# 将所有国别代码转为大写
result = re.sub('(\n\))+\n*$', '\n)\n', result)
result = re.sub('\[[\n\s]*(?P<start>\d{1,4}-?\d{1,2}-?\d{1,2})\s+to\s+(?P<end>\d{1,4}-?\d{1,2}-?\d{1,2})[\n\s]*\]', '[\g<start> to \g<end>]', result) #避免方括号内的日期占用三行空间
return result.strip()
if __name__ == "__main__":
import sys
data = "".join(sys.stdin.readlines()) #接受管道输入的其它参数
print(scFormatter(data))