-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
81 lines (49 loc) · 1.47 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re
import json
# Load the lookup table used by cont_to_exp() once, at import time.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) instead of
# relying on the platform default, which is not UTF-8 on every system and
# would mis-decode any non-ASCII characters stored in the file.
# NOTE(review): the path is relative to the current working directory --
# confirm that callers always run from the directory holding the file.
with open('contractions.json', encoding='utf-8') as f:
    contractions_dict = json.load(f)
# Necessary helper: "’" (typographic apostrophe) and "'" (ASCII apostrophe) are hard to tell apart, and "’" is not even available on a standard keyboard.
def text_cleaning_apos(text):
    """Return *text* as a lower-cased string with "’" replaced by "'".

    The argument is passed through ``str()`` first, so non-string values
    are accepted and converted.
    """
    lowered = str(text).lower()
    # Literal replacement of the typographic apostrophe with the ASCII one.
    return lowered.replace("’", "'")
# FUNCTIONS TO EXPAND CONTRACTIONS
def cont_to_exp(x):
    """Expand contractions in *x* using the module-level ``contractions_dict``.

    The argument is converted to ``str``, lower-cased and split on
    whitespace. Every token that is a key of ``contractions_dict`` is
    replaced by the mapped value; all other tokens are kept unchanged.
    Tokens are re-joined with single spaces, so runs of whitespace collapse
    and leading/trailing whitespace is dropped.

    Lookup is by exact token: a token that differs from a key in any
    character (for example because punctuation is attached) is left as is.
    """
    tokens = str(x).lower().split()
    # dict.get(tok, tok) does the membership test and the lookup in one step.
    return ' '.join(contractions_dict.get(tok, tok) for tok in tokens)
def text_cleaning(text):
    """Reduce *text* to lower-case ASCII words separated by single spaces.

    The argument is converted to ``str`` and lower-cased, then processed in
    this order:

    1. Tokens starting at ``http`` (URLs) are removed.
    2. A trailing ``'s`` on a word is removed.
    3. Every character that is not ``a``-``z`` becomes a space; this covers
       punctuation, digits and symbols such as ``#``, ``@`` and ``&``.
    4. Every ``www`` sequence is removed.
    5. Stand-alone single letters are removed.
    6. Runs of whitespace are collapsed to one space.

    Steps 1 and 2 must run before step 3: once punctuation has been turned
    into spaces, a URL is no longer a single token and no apostrophe is left
    to match. Leading/trailing whitespace is collapsed to a single space but
    not stripped.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    # URLs first, while "://", "." and "/" still hold the token together.
    text = re.sub(r"http\S+", " ", text)
    # Possessive/clitic 's, while the apostrophe is still present.
    text = re.sub(r"'s\b", "", text)
    # Anything that is not a lower-case ASCII letter becomes a separator.
    text = re.sub(r"[^a-z]", " ", text)
    # Drop "www" wherever it occurs (e.g. the prefix left over from a host name).
    text = re.sub(r"w{3}", "", text)
    # \b also matches at the string edges and between adjacent single letters,
    # which a pattern requiring whitespace on both sides would miss.
    text = re.sub(r"\b[a-z]\b", " ", text)
    # Collapse the separators introduced above.
    text = re.sub(r"\s+", " ", text)
    return text
def clean(x):
    """Run the full cleaning pipeline on *x* and return the result.

    Stages, in order: ``text_cleaning_apos`` -> ``cont_to_exp`` ->
    ``text_cleaning``. The argument is converted to ``str`` before the
    first stage.
    """
    normalized = text_cleaning_apos(str(x))
    expanded = cont_to_exp(normalized)
    return text_cleaning(expanded)
def main():
    """Script entry point; currently a placeholder that does nothing."""


# Only run main() when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()