# preprocess_utils.py: text preprocessing functions
import os
import re
import unicodedata

# Third-party dependencies
import contractions
# numpy is only used to round the character / stopword ratios below
import numpy as np
# Load the English stopword list from NLTK
from nltk.corpus import stopwords as sw

swEng = set(sw.words("english"))

# Thresholds used when cleaning text (ratios relative to snippet length)
THRESHOLD_SPECIAL_CHARACTERS = .15
THRESHOLD_STOPWORDS_UPPER = .16
THRESHOLD_STOPWORDS_LOWER = .01
THRESHOLD_DIGITS = .18

# Remove everything between parentheses
def clear_parentheses(string):
    """Remove everything between the first '(' and the last ')' (greedy match)"""
    return re.sub(r'\(.*\)', '', string)

def remove_citations(string):
    """Remove citation markers such as [1] or [123]"""
    return re.sub(r'\[[0-9]{1,3}\]', '', string)
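
# Illustrative sketch of the two cleaners above; the example sentence is made up.
# Note that the parentheses pattern is greedy: if a string contains several
# "(...)" groups, everything between the first "(" and the last ")" is dropped.
#
#   remove_citations(clear_parentheses("Paris (French pronunciation) is the capital of France.[3]"))
#   # -> "Paris  is the capital of France."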

# For each text, check the ratio of stopwords to special characters
def categorize_characters(string):
    """Count the special characters and digits in a string"""
    alpha = digit = special = 0
    for char in string:
        if char == " ":
            continue
        # Classify the character
        if char.isalpha():
            alpha += 1
        elif char.isdigit():
            digit += 1
        else:
            special += 1
    # Only the special-character and digit counts are used downstream
    return special, digit

def count_stopwords(string):
    """Count the English stopwords in a string"""
    tokens = string.split(" ")
    stopword_count = 0
    # Check each token against the stopword list
    for token in tokens:
        if token.strip() in swEng:
            stopword_count += 1
    return stopword_count

def filter_criterion(string, minimum_characters=10):
    """Return the ratios of special characters, stopwords and digits to snippet length"""
    if len(string) < minimum_characters:
        return (None, None, None)
    # Snippet length
    sl = len(string)
    # Count stopwords
    swcount = count_stopwords(string)
    # Count special characters and digits
    sccount, dgcount = categorize_characters(string)
    # Return the three ratios, rounded to five decimals
    return (np.round(sccount / sl, 5), np.round(swcount / sl, 5), np.round(dgcount / sl, 5))
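
# Illustrative sketch of the returned tuple; the sentence is made up and the exact
# values depend on the installed NLTK stopword list (matching is case-sensitive).
#
#   filter_criterion("The quick brown fox jumps over the lazy dog.")
#   # -> (special_ratio, stopword_ratio, digit_ratio), roughly (0.02, 0.05, 0.0)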

def tokens_lower(tokens):
    """Lower-case every token in a list"""
    tokens_rep = []
    for token in tokens:
        tokens_rep.append(token.lower())
    return tokens_rep

def remove_punctuation(tokens):
    """Remove punctuation from every token in a list"""
    tokens_rep = []
    for token in tokens:
        tokens_rep.append(re.sub(r'[^\w\s]', '', token))
    return tokens_rep

def replace_contractions(text):
    """Replace contractions in a string of text"""
    text = contractions.fix(text)
    # Also strip possessive "'s", which the contractions library leaves untouched
    return text.replace("'s", "")

def remove_non_ascii(tokens):
    """Remove non-ASCII characters from a list of tokens"""
    tokens_rep = []
    for token in tokens:
        tokens_rep.append(unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
    return tokens_rep

def remove_digits(tokens):
    """Replace every digit with '#' in a list of tokens"""
    tokens_rep = []
    for token in tokens:
        tokens_rep.append(re.sub(r"\d", "#", token))
    return tokens_rep
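
# Illustrative sketch of the token-level helpers above, on made-up tokens:
#
#   remove_digits(tokens_lower(remove_non_ascii(["Café", "Opened", "1995"])))
#   # -> ["cafe", "opened", "####"]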

# Define tokenizer function
def tokenize_text(input, lower_tokens=True, remove_digits_token=True):
    """Parse & tokenize a piece of text; return None if it fails the quality filters"""
    # Replace contractions
    try:
        input = replace_contractions(input)
    except IndexError:
        print("Document threw index error ... continuing without removing contractions")
    # Remove parentheses and citations
    input = clear_parentheses(input)
    input = remove_citations(input)
    # Check the quality of the input string
    filter_text = filter_criterion(input)
    # If any ratio is None, the string is too short
    if None in filter_text:
        print("String contains None or is too short ...")
        return None
    # Tokenize on whitespace
    tokens = input.split()
    input = " ".join(tokens)
    # Otherwise, check the ratios against the thresholds
    if filter_text[0] > THRESHOLD_SPECIAL_CHARACTERS:
        return None
    if (filter_text[1] > THRESHOLD_STOPWORDS_UPPER) or (filter_text[1] < THRESHOLD_STOPWORDS_LOWER):
        return None
    if filter_text[2] > THRESHOLD_DIGITS:
        return None
    # Replace non-ASCII characters
    tokens = remove_non_ascii(tokens)
    # Remove punctuation
    # (disabled here because the Keras tokenizer also strips punctuation)
    # tokens = remove_punctuation(tokens)
    # Replace digits
    if remove_digits_token:
        tokens = remove_digits(tokens)
    # To lower case
    if lower_tokens:
        tokens = tokens_lower(tokens)
    # Discard very short sentences
    if len(tokens) <= 4:
        return None
    # Return the cleaned text as a single string
    return " ".join(tokens).strip()
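
# Illustrative sketch of the full pipeline; the sentence is made up and the exact
# output depends on the stopword list and the contractions package version.
#
#   tokenize_text("The cat (a small mammal) sat on the mat in 1999 [1].")
#   # -> something like "the cat sat on the mat in #### ."
#   # (parentheses and the citation marker are stripped, digits become "#",
#   #  and the tokens are lower-cased; trailing punctuation is kept because
#   #  remove_punctuation is disabled above)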

# Load wikipedia data (only the first paragraph of each page)
def load_snippets(file):
    """Load the wikipedia snippets pre-processed by Cortical"""
    assert os.path.exists(file), "File '{}' does not exist ...".format(file)
    count = 0
    pages = dict()
    with open(file, "r") as inFile:
        for line in inFile:
            # Each line holds meta information and the snippet text, separated by tabs
            unjoin = line.split("\t")
            # Split the meta information on "|"
            meta = unjoin[0].split("|")
            # The last meta field is the snippet number; 0 marks the start of a new page
            snippet_number = int(meta[-1])
            # Keep only the first snippet of each page
            if snippet_number == 0:
                pages[meta[0]] = unjoin[-1]
                count += 1
    return pages
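
# The snippet file is assumed to look roughly like the line below (tab-separated,
# with "|"-separated meta information in the first field; this example is made up):
#
#   "Amsterdam|0\tAmsterdam is the capital of the Netherlands.\n"
#
# meta[0] ("Amsterdam") becomes the dictionary key, unjoin[-1] (the snippet text)
# the value, and only snippet number 0 (the first paragraph of a page) is kept.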