# Preprocessing.py
from textblob import TextBlob
import pandas as pd
import re
import html
import string
import spacy
from spacymoji import Emoji
from nltk.corpus import stopwords

# NLTK's stopword corpus must be fetched once via nltk.download("stopwords").
# Bind it to a new name so the imported module is not shadowed.
STOPWORDS = set(stopwords.words("english"))
# Input/output CSV paths and the columns to preprocess.
path_from = "/home/daanish/Desktop/Project/MachineLearning/Twitter/Testfiles/in1.csv"
path_to = "/home/daanish/Desktop/Project/MachineLearning/Twitter/Testfiles/out1.csv"
column_names = ["SNo", "Time", "Location", "Text"]
text_columns = ("Location", "Text")  # renamed from `dict`, which shadowed the builtin
# Alternative configuration for the labelled training files:
# path_from = "/home/daanish/Desktop/Project/MachineLearning/Twitter/Testfiles/trainin1.csv"
# path_to = "/home/daanish/Desktop/Project/MachineLearning/Twitter/Testfiles/trainout1.csv"
# column_names = ["Label", "Text"]
# text_columns = ["Text"]
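# Illustrative input row for the column layout above (hypothetical data, not
# taken from the source files):
#   0,2019-04-01 10:30:00,London,"RT @user: soooo goood!! http://t.co/x #sun"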
class Preprocessing:
    def __init__(self, language="en"):
        df = pd.read_csv(path_from, usecols=[0, 1, 2, 3], names=column_names, header=None)
        # df = pd.read_csv(path_from, names=column_names, encoding="ISO-8859-1", header=None)
        self.language = language
        # Load spaCy without the dependency parser; attach the spacymoji
        # detector first so every token carries an `is_emoji` flag
        # (spaCy 2.x pipeline API, as used throughout this script).
        self.nlp = spacy.load("en_core_web_sm", disable=["parser"])
        self.nlp.add_pipe(Emoji(self.nlp), first=True)
        for key in text_columns:
            # print("Starting Language Conversion to {0} for {1}".format(language, key))
            # self.lang(df[key])
            # print("Done Language Conversion to {0} for {1}".format(language, key))
            print("Starting Noise Removal for {0}".format(key))
            self.remove_noise(df[key])
            print("Done Noise Removal for {0}".format(key))
            if key == "Text":
                # print("Starting Coreference Resolution for {0}".format(key))
                # self.coref(df[key])
                # print("Done Coreference Resolution for {0}".format(key))
                print("Starting Spellchecking for {0}".format(key))
                self.spellcheck(df[key])
                print("Done Spellchecking for {0}".format(key))
                print("Starting Lemmatisation for {0}".format(key))
                self.lemmatisation(df[key])
                print("Done Lemmatisation for {0}".format(key))
        df.to_csv(path_to, index=False, encoding="utf-8")
    # Language conversion using TextBlob, which is built on NLTK and Pattern
    # and so suits both translation and spellchecking.
    def lang(self, texts):
        for i in range(len(texts)):
            text = TextBlob(str(texts[i]))
            try:
                # translate() calls the Google Translate web API, so it needs
                # network access and can raise on failure.
                text = text.translate(to="en")
                texts.at[i] = str(text)
            except Exception as e:
                print(e)
    # Spellchecking using TextBlob's pattern-based correct().
    def spellcheck(self, texts):
        for i in range(len(texts)):
            print("Spellcheck:", i)
            text = TextBlob(str(texts[i]))
            try:
                text = text.correct()
                texts.at[i] = str(text)
            except Exception as e:
                print(e)
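    # Illustrative TextBlob usage (assumes the TextBlob corpora are installed
    # via `python -m textblob.download_corpora`; note that translate() needs
    # network access and was removed from newer TextBlob releases):
    #   >>> str(TextBlob("I havv goood speling").correct())
    #   'I have good spelling'
    #   >>> str(TextBlob("bonjour le monde").translate(to="en"))
    #   'hello world'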
    # Noise removal using regexes.
    def remove_noise(self, texts):
        # Collapse runs of a repeated character to a single occurrence.
        # Note this also shortens legitimate doubles ("good" -> "god").
        rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
        # Strip every ASCII punctuation character; re.escape keeps the
        # character class well formed.
        punct_regex = re.compile("[" + re.escape(string.punctuation) + "]")
        for i in range(len(texts)):
            print("Noise:", i)
            # Hashtags
            texts.at[i] = re.sub(r"#\w*", "", str(texts[i]))
            # Handles
            texts.at[i] = re.sub(r"@\w*", "", str(texts[i]))
            # URLs
            texts.at[i] = re.sub(r"((www\.[^\s]+)|((http|https|ftp)://[^\s]+))", "", str(texts[i]))
            # HTML entities (&amp; -> &, etc.)
            texts.at[i] = html.unescape(str(texts[i]))
            # Repeated characters in a word
            texts.at[i] = rpt_regex.sub(r"\1", str(texts[i]))
            # Retweet marker (word boundaries so words like "ALERT" survive)
            texts.at[i] = re.sub(r"\bRT\b", "", str(texts[i]))
            # Punctuation
            texts.at[i] = punct_regex.sub("", str(texts[i]))
            # Backslash escape sequences
            texts.at[i] = re.sub(r"\\\w*", "", str(texts[i]))
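    # Illustrative trace of remove_noise on a hypothetical tweet:
    #   "RT @user: soooo goood!! http://t.co/x #sun"
    # hashtag, handle, URL and RT marker stripped; character runs collapsed
    # ("soooo" -> "so", "goood" -> "god"); punctuation removed, leaving
    # roughly "  so god  " (surplus whitespace survives, since spacing is
    # never normalised here).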
    # Lemmatisation using spaCy.
    def lemmatisation(self, texts):
        for i in range(len(texts)):
            print("Lemma:", i)
            doc = self.nlp(str(texts[i]))
            tokens = []
            ents = [ent.text for ent in doc.ents]
            for tok in doc:
                # Drop emoji tokens flagged by the spacymoji pipe.
                if tok._.is_emoji:
                    continue
                # Keep named entities and pronouns verbatim; spaCy 2.x
                # lemmatises every pronoun to the placeholder "-PRON-".
                if tok.lemma_ != "-PRON-" and tok.text not in ents:
                    word = tok.lemma_.lower().strip()
                else:
                    word = tok.lower_
                if word not in STOPWORDS:
                    tokens.append(word)
            texts.at[i] = " ".join(tokens)
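    # Illustrative result for one document (spaCy 2.x, en_core_web_sm; output
    # is approximate): "The cats were running 😂" -> "cat run"
    # "The" and "were" (lemma "be") fall to the stopword filter, the emoji
    # token is skipped, and "cats"/"running" lemmatise to "cat"/"run".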
if __name__ == "__main__":
    Preprocessing()