# preprocess_arabert.py (forked from aub-mind/arabert)
import re
import pyarabic.araby as araby
# Arabic prefixes and suffixes used to mark Farasa segmentation output.
# Note: the escaped entries (\uXXXX) decode to the same strings as the literal Arabic
# entries, so the lists contain duplicates; this is harmless for the membership checks below.
prefix_list = ["ال", "و", "ف", "ب", "ك", "ل", "لل", "\u0627\u0644", "\u0648", "\u0641", "\u0628", "\u0643", "\u0644", "\u0644\u0644", "س"]
suffix_list = ["ه", "ها", "ك", "ي", "هما", "كما", "نا", "كم", "هم", "هن", "كن",
               "ا", "ان", "ين", "ون", "وا", "ات", "ت", "ن", "ة",
               "\u0647", "\u0647\u0627", "\u0643", "\u064a", "\u0647\u0645\u0627", "\u0643\u0645\u0627", "\u0646\u0627", "\u0643\u0645", "\u0647\u0645", "\u0647\u0646", "\u0643\u0646",
               "\u0627", "\u0627\u0646", "\u064a\u0646", "\u0648\u0646", "\u0648\u0627", "\u0627\u062a", "\u062a", "\u0646", "\u0629"]
# the never_split list is used with the transformers library so that the segmentation
# markers (e.g. "ال+", "+ها") are kept as single tokens
prefix_symbols = [x + "+" for x in prefix_list]
suffix_symbols = ["+" + x for x in suffix_list]
never_split_tokens = list(set(prefix_symbols + suffix_symbols))
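# Illustrative sketch (added; not part of the original script): never_split_tokens is
# meant to be passed to a transformers BertTokenizer so that the segmentation markers
# (e.g. "ال+", "+ها") are never broken into word pieces. The helper name and the
# checkpoint name below are assumptions; substitute whichever AraBERT checkpoint you use.
def build_arabert_tokenizer(model_name="aubmindlab/bert-base-arabert"):
    from transformers import BertTokenizer  # imported lazily so this module stays import-light
    return BertTokenizer.from_pretrained(model_name, never_split=never_split_tokens)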
# The first two URL patterns catch any non-whitespace run starting with "http" or "www";
# the third catches bare domains such as "example.com".
regex_url_step1 = r'(?=http)[^\s]+'
regex_url_step2 = r'(?=www)[^\s]+'
regex_url = r'(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'
regex_mention = r'@[\w\d]+'
regex_email = r'\S+@\S+'
# matches any run of two or more punctuation/whitespace characters
redundant_punct_pattern = r'([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})'
def remove_elongation(word):
    """
    :param word: the input word to remove elongation from
    :return: the word with elongation (repeated characters) collapsed
    """
    # despite the name, this pattern targets a character repeated 3+ times
    # (letter elongation); the tatweel character itself is removed in preprocess()
    regex_tatweel = r'(\w)\1{2,}'
    # loop over the number of times the regex matched the word
    for index_ in range(len(re.findall(regex_tatweel, word))):
        if re.search(regex_tatweel, word):
            elongation_found = re.search(regex_tatweel, word)
            elongation_replacement = elongation_found.group()[0]
            elongation_pattern = elongation_found.group()
            word = re.sub(elongation_pattern, elongation_replacement, word, flags=re.MULTILINE)
        else:
            break
    return word
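# Illustrative behaviour (added comment, not in the original file):
#   remove_elongation("جمييييل") returns "جميل"
# a character repeated three or more times is collapsed to a single occurrence,
# while a doubled character is left untouched.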
def tokenize_arabic_words_farasa(line_input, farasa):
    """
    Segment a line with Farasa and re-attach the '+' segmentation marker to known
    prefixes and suffixes, keeping the [رابط], [بريد] and [مستخدم] placeholders intact.
    """
    segmented_line = []
    line_farasa = farasa.segmentLine(line_input)
    for index, word in enumerate(line_farasa):
        if word in ['[', ']']:
            continue
        if word in ['رابط', 'بريد', 'مستخدم'] and line_farasa[index - 1] in ['[', ']']:
            segmented_line.append('[' + word + ']')
            continue
        segmented_word = []
        for token in word.split('+'):
            if token in prefix_list:
                segmented_word.append(token + '+')
            elif token in suffix_list:
                segmented_word.append('+' + token)
            else:
                segmented_word.append(token)
        segmented_line.extend(segmented_word)
    return ' '.join(segmented_line)
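# Hypothetical sketch (added; not in the original file): tokenize_arabic_words_farasa
# only needs an object exposing segmentLine(str) -> list of '+'-separated segments,
# so its behaviour can be illustrated with a stub instead of the real Farasa jar.
# The segmented output below is an assumed Farasa-style segmentation, for demonstration only.
class _FakeFarasaSegmenter:
    def segmentLine(self, line):
        # pretend Farasa split "الكتاب جديد" into a prefix+stem and a plain word
        return ["ال+كتاب", "جديد"]
# Example: tokenize_arabic_words_farasa("الكتاب جديد", _FakeFarasaSegmenter())
# would return "ال+ كتاب جديد"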
def remove_redundant_punct(text):
    """
    Collapse every run of two or more punctuation/whitespace characters into its
    distinct characters (order preserved), set off by single spaces.
    """
    text_ = text
    result = re.search(redundant_punct_pattern, text)
    dif = 0
    while result:
        sub = result.group()
        sub = sorted(set(sub), key=sub.index)
        sub = ' ' + ''.join(list(sub)) + ' '
        text = ''.join((text[:result.span()[0] + dif], sub, text[result.span()[1] + dif:]))
        text_ = ''.join((text_[:result.span()[0]], text_[result.span()[1]:])).strip()
        dif = abs(len(text) - len(text_))
        result = re.search(redundant_punct_pattern, text_)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
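# Illustrative behaviour (added comment): a run of repeated punctuation is reduced to
# its distinct characters and set off with spaces, e.g.
#   remove_redundant_punct("wow!!!...") returns "wow !."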
def preprocess(text, do_farasa_tokenization=True, farasa=None):
    """
    Preprocess takes an input text line and applies the same preprocessing used in
    AraBERT pretraining.
    Args:
        text (string): input text string
        do_farasa_tokenization (boolean): whether to segment the text with Farasa
        farasa: the Farasa segmenter object obtained through a py4j JavaGateway to the
            FarasaSegmenter.jar file (see Example)
    Example:
        from py4j.java_gateway import JavaGateway
        gateway = JavaGateway.launch_gateway(classpath='./FarasaSegmenterJar.jar')
        farasa = gateway.jvm.com.qcri.farasa.segmenter.Farasa()
        processed_text = preprocess("Some_Text", do_farasa_tokenization=True, farasa=farasa)
    """
    text = str(text)
    processing_tweet = araby.strip_tashkeel(text)
    processing_tweet = re.sub(r'\d+\/[ء-ي]+\/\d+\]', '', processing_tweet)
    processing_tweet = re.sub('ـ', '', processing_tweet)  # remove tatweel
    processing_tweet = re.sub('[«»]', ' " ', processing_tweet)
    # replace [رابط] with a space below instead, if you want to remove links entirely
    processing_tweet = re.sub(regex_url_step1, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_url_step2, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_url, '[رابط]', processing_tweet)
    processing_tweet = re.sub(regex_email, '[بريد]', processing_tweet)
    processing_tweet = re.sub(regex_mention, '[مستخدم]', processing_tweet)
    processing_tweet = re.sub('…', r'\.', processing_tweet).strip()
    processing_tweet = remove_redundant_punct(processing_tweet)
    processing_tweet = re.sub(r'\[ رابط \]|\[ رابط\]|\[رابط \]', ' [رابط] ', processing_tweet)
    processing_tweet = re.sub(r'\[ بريد \]|\[ بريد\]|\[بريد \]', ' [بريد] ', processing_tweet)
    processing_tweet = re.sub(r'\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]', ' [مستخدم] ', processing_tweet)
    processing_tweet = remove_elongation(processing_tweet)
    if do_farasa_tokenization and farasa is not None:
        processing_tweet = tokenize_arabic_words_farasa(processing_tweet, farasa)
    return processing_tweet.strip()
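# Minimal smoke test (added for illustration; not part of the original script). It runs
# the pipeline without Farasa, so it only needs `re` and `pyarabic`. The sample sentence
# is an assumption used purely for demonstration.
if __name__ == "__main__":
    sample = "وصلني الخبر مممممتأخرا من http://example.com ومن @user_1 !!!!"
    print(preprocess(sample, do_farasa_tokenization=False))
    # expected: the URL becomes [رابط], the mention becomes [مستخدم], and the letter
    # elongation and the repeated '!' are collapsed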