-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathbw2ar.py
executable file
·126 lines (111 loc) · 4.37 KB
/
bw2ar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Functions to convert Arabic words/text into Buckwalter encoding and vice versa.
"""
import sys
import re
import utils
# Buckwalter (ASCII) symbol -> Arabic Unicode character.
buck2uni = {
    "'": u"\u0621",  # hamza-on-the-line
    "|": u"\u0622",  # madda
    ">": u"\u0623",  # hamza-on-'alif
    "&": u"\u0624",  # hamza-on-waaw
    "<": u"\u0625",  # hamza-under-'alif
    "}": u"\u0626",  # hamza-on-yaa'
    "A": u"\u0627",  # bare 'alif
    "b": u"\u0628",  # baa'
    "p": u"\u0629",  # taa' marbuuTa
    "t": u"\u062A",  # taa'
    "v": u"\u062B",  # thaa'
    "j": u"\u062C",  # jiim
    "H": u"\u062D",  # Haa'
    "x": u"\u062E",  # khaa'
    "d": u"\u062F",  # daal
    "*": u"\u0630",  # dhaal
    "r": u"\u0631",  # raa'
    "z": u"\u0632",  # zaay
    "s": u"\u0633",  # siin
    "$": u"\u0634",  # shiin
    "S": u"\u0635",  # Saad
    "D": u"\u0636",  # Daad
    "T": u"\u0637",  # Taa'
    "Z": u"\u0638",  # Zaa' (DHaa')
    "E": u"\u0639",  # cayn
    "g": u"\u063A",  # ghayn
    "_": u"\u0640",  # taTwiil
    "f": u"\u0641",  # faa'
    "q": u"\u0642",  # qaaf
    "k": u"\u0643",  # kaaf
    "l": u"\u0644",  # laam
    "m": u"\u0645",  # miim
    "n": u"\u0646",  # nuun
    "h": u"\u0647",  # haa'
    "w": u"\u0648",  # waaw
    "Y": u"\u0649",  # 'alif maqSuura
    "y": u"\u064A",  # yaa'
    "F": u"\u064B",  # fatHatayn
    "N": u"\u064C",  # Dammatayn
    "K": u"\u064D",  # kasratayn
    "a": u"\u064E",  # fatHa
    "u": u"\u064F",  # Damma
    "i": u"\u0650",  # kasra
    "~": u"\u0651",  # shaddah
    "o": u"\u0652",  # sukuun
    "`": u"\u0670",  # dagger 'alif
    "{": u"\u0671",  # waSla
}

# Reverse table (Arabic Unicode -> Buckwalter). It is derived from buck2uni
# rather than written out by hand so the two directions cannot drift apart.
uni2buck = {value: key for (key, value) in buck2uni.items()}

# Lam-alef ligatures are single presentation-form code points that stand for
# two letters, so each maps to a two-symbol Buckwalter string. Both the
# isolated and the final form of every ligature are covered; they have no
# bw2ar counterpart because buck2uni only emits the base letters.
uni2buck.update({
    u"\ufefb": "lA",  # lam + bare 'alif, isolated form
    u"\ufefc": "lA",  # lam + bare 'alif, final form
    u"\ufef7": "l>",  # lam + hamza-on-'alif, isolated form
    u"\ufef8": "l>",  # lam + hamza-on-'alif, final form
    u"\ufef5": "l|",  # lam + madda, isolated form
    u"\ufef6": "l|",  # lam + madda, final form
    u"\ufef9": "l<",  # lam + hamza-under-'alif, isolated form
    u"\ufefa": "l<",  # lam + hamza-under-'alif, final form
})
def clean_text(text):
    """Clean Arabic text of unwanted characters before language-model building.

    U+FEFF (ZERO WIDTH NO-BREAK SPACE) is removed first; the result is then
    passed, in order, through ``utils.remove_non_arabic``,
    ``utils.strip_tashkeel`` and ``utils.strip_tatweel``. Those helpers live
    in the project's ``utils`` module and are not visible from this file.

    Returns the value produced by the last helper.
    """
    # For more info on U+FEFF, check
    # http://www.fileformat.info/info/unicode/char/feff/index.htm
    without_bom = re.sub(u"[\ufeff]", "", text, flags=re.UNICODE)
    after_non_arabic = utils.remove_non_arabic(without_bom)
    after_tashkeel = utils.strip_tashkeel(after_non_arabic)
    return utils.strip_tatweel(after_tashkeel)
def transliterate_word(input_word, direction='bw2ar'):
    """Convert a single word between Buckwalter and Arabic script.

    Args:
        input_word: The word to convert.
        direction: ``'bw2ar'`` (Buckwalter -> Arabic, the default) or
            ``'ar2bw'`` (Arabic -> Buckwalter).

    Returns:
        The converted word. Characters that have no entry in the selected
        table (spaces, digits, punctuation, ...) are copied through unchanged.

    Raises:
        SystemExit: with status 1, after writing a message to stderr, when
            ``direction`` is neither ``'bw2ar'`` nor ``'ar2bw'``.
    """
    # Pick the lookup table once, before touching the input, so an invalid
    # direction is rejected even when the word is empty.
    if direction == 'bw2ar':
        table = buck2uni
    elif direction == 'ar2bw':
        table = uni2buck
    else:
        sys.stderr.write('Error: invalid direction!\n')
        sys.exit(1)  # non-zero status: this is an error exit
    return ''.join(table.get(char, char) for char in input_word)
def transliterate_text(input_text, direction='bw2ar'):
    """Convert a whole text between Buckwalter and Arabic script.

    The text is split on single space characters, every piece is converted
    with ``transliterate_word`` using the same ``direction``, and the pieces
    are re-joined with single spaces, so the spacing of the input is kept.
    """
    converted_words = (
        transliterate_word(word, direction) for word in input_text.split(' ')
    )
    return ' '.join(converted_words)
def main(argv=None):
    """Command-line entry point: transliterate stdin line by line to stdout.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            direction, ``bw2ar`` or ``ar2bw``.

    Each input line is passed through ``clean_text`` unless the direction is
    ``bw2ar``, then converted with ``transliterate_text``. Lines whose
    converted form is empty after stripping whitespace are not written.

    Raises:
        SystemExit: with status 1, after printing a usage message to stderr,
            when no direction argument is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # The program name is argv[0]; argv[1] does not exist in this branch.
        prog = argv[0] if argv else 'bw2ar.py'
        sys.stderr.write(
            'Usage: INPUT TEXT | python {} DIRECTION(bw2ar|ar2bw)\n'.format(prog)
        )
        sys.exit(1)

    direction = str(argv[1])
    for line in sys.stdin:
        if direction != 'bw2ar':
            line = clean_text(line)
        output_text = transliterate_text(line, direction=direction).strip()
        if output_text != '':
            sys.stdout.write('{}\n'.format(output_text))


if __name__ == '__main__':
    main()