-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutility.py
55 lines (46 loc) · 1.64 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
class Utility(object):
    """Namespace of stateless helpers for normalising raw text.

    Every method is a ``@staticmethod`` that takes a string and returns a new
    value; nothing is stored on the class.
    """

    @staticmethod
    def clean_text(text):
        """Return a normalised copy of ``text``.

        The pipeline lowercases the text, turns every run of whitespace
        (spaces, newlines, tabs, carriage returns) into a single space,
        drops digit characters, replaces runs of non-word characters with a
        space, collapses the resulting whitespace and strips both ends.
        """
        text = Utility.lowercase(text)
        # Line breaks and tabs separate words, so they are turned into a
        # space rather than deleted. Deleting them fused neighbouring words
        # ("hello\nworld" -> "helloworld") and gave different results for
        # "\n" and "\r\n" line endings.
        text = Utility.remove_multiple_whitespaces(text)
        text = Utility.remove_digits(text)
        text = Utility.replace_special_chars(text)
        text = Utility.remove_multiple_whitespaces(text)
        text = Utility.remove_trailing_whitespaces(text)
        return text

    @staticmethod
    def lowercase(text):
        """Return ``text`` converted to lowercase."""
        return text.lower()

    @staticmethod
    def remove_newlines(text):
        """Return ``text`` with every ``\\n`` character deleted.

        Nothing is inserted in place of the newline, so words on adjacent
        lines end up joined.
        """
        return text.replace("\n", "")

    @staticmethod
    def remove_tabs(text):
        """Return ``text`` with every tab character deleted.

        Nothing is inserted in place of the tab.
        """
        return text.replace("\t", "")

    @staticmethod
    def remove_digits(text):
        """Return ``text`` without the characters for which ``isdigit()`` is true."""
        return ''.join(char for char in text if not char.isdigit())

    @staticmethod
    def replace_special_chars(text):
        """Replace each run of non-word characters in ``text`` with one space.

        "Non-word" is the regex class ``\\W``, so letters, digits and the
        underscore are kept as they are.
        """
        return re.sub(r'\W+', ' ', text)

    @staticmethod
    def remove_multiple_whitespaces(text):
        """Collapse each run of whitespace in ``text`` into a single space."""
        return re.sub(r'\s+', ' ', text)

    @staticmethod
    def remove_trailing_whitespaces(text):
        """Strip whitespace from the beginning and the end of ``text``."""
        return text.strip()

    @staticmethod
    def get_doc_length(text):
        """Return the number of words (``\\w+`` matches) found in ``text``."""
        return len(re.findall(r'\w+', text))