"""
Functions to convert Arabic words/text into Buckwalter encoding and vice versa.
"""
|
|
|
import sys |
|
import re |
|
import utils |
|
|
|
# Buckwalter transliteration symbol -> Arabic Unicode character.
# Entries are listed in code-point order; the trailing comment on each line is
# the (abbreviated) Unicode character name of the value.
buck2uni = {
    # Hamza and hamza carriers
    "'": u"\u0621",  # HAMZA
    "|": u"\u0622",  # ALEF WITH MADDA ABOVE
    ">": u"\u0623",  # ALEF WITH HAMZA ABOVE
    "&": u"\u0624",  # WAW WITH HAMZA ABOVE
    "<": u"\u0625",  # ALEF WITH HAMZA BELOW
    "}": u"\u0626",  # YEH WITH HAMZA ABOVE
    # Letters
    "A": u"\u0627",  # ALEF
    "b": u"\u0628",  # BEH
    "p": u"\u0629",  # TEH MARBUTA
    "t": u"\u062A",  # TEH
    "v": u"\u062B",  # THEH
    "j": u"\u062C",  # JEEM
    "H": u"\u062D",  # HAH
    "x": u"\u062E",  # KHAH
    "d": u"\u062F",  # DAL
    "*": u"\u0630",  # THAL
    "r": u"\u0631",  # REH
    "z": u"\u0632",  # ZAIN
    "s": u"\u0633",  # SEEN
    "$": u"\u0634",  # SHEEN
    "S": u"\u0635",  # SAD
    "D": u"\u0636",  # DAD
    "T": u"\u0637",  # TAH
    "Z": u"\u0638",  # ZAH
    "E": u"\u0639",  # AIN
    "g": u"\u063A",  # GHAIN
    "_": u"\u0640",  # TATWEEL (elongation stroke, not a letter)
    "f": u"\u0641",  # FEH
    "q": u"\u0642",  # QAF
    "k": u"\u0643",  # KAF
    "l": u"\u0644",  # LAM
    "m": u"\u0645",  # MEEM
    "n": u"\u0646",  # NOON
    "h": u"\u0647",  # HEH
    "w": u"\u0648",  # WAW
    "Y": u"\u0649",  # ALEF MAKSURA
    "y": u"\u064A",  # YEH
    # Diacritics
    "F": u"\u064B",  # FATHATAN
    "N": u"\u064C",  # DAMMATAN
    "K": u"\u064D",  # KASRATAN
    "a": u"\u064E",  # FATHA
    "u": u"\u064F",  # DAMMA
    "i": u"\u0650",  # KASRA
    "~": u"\u0651",  # SHADDA
    "o": u"\u0652",  # SUKUN
    # Extended characters
    "`": u"\u0670",  # SUPERSCRIPT ALEF
    "{": u"\u0671",  # ALEF WASLA
}
|
|
|
|
|
|
|
# Arabic Unicode character -> Buckwalter symbol(s): the inverse of buck2uni.
uni2buck = {arabic: latin for latin, arabic in buck2uni.items()}

# Lam-alef ligatures are single code points in Unicode but have no single
# Buckwalter symbol, so each one expands to the two-symbol sequence of its
# component letters.
# NOTE(review): only the isolated presentation forms are mapped here; the
# corresponding final forms (U+FEFC, U+FEF8, U+FEF6, U+FEFA) are not.
uni2buck.update({
    u"\ufefb": "lA",  # LIGATURE LAM WITH ALEF (isolated form)
    u"\ufef7": "l>",  # LIGATURE LAM WITH ALEF WITH HAMZA ABOVE (isolated form)
    u"\ufef5": "l|",  # LIGATURE LAM WITH ALEF WITH MADDA ABOVE (isolated form)
    u"\ufef9": "l<",  # LIGATURE LAM WITH ALEF WITH HAMZA BELOW (isolated form)
})
|
|
|
|
|
def clean_text(text):
    """Prepare Arabic text for Arabic-to-Buckwalter transliteration.

    Removes every U+FEFF (byte-order mark / zero-width no-break space)
    character, then passes the text through ``utils.remove_non_arabic``,
    ``utils.strip_tashkeel`` and ``utils.strip_tatweel``, in that order.

    NOTE(review): the three ``utils`` helpers are defined outside this file.
    Going by their names they presumably drop non-Arabic characters,
    diacritics (tashkeel) and the tatweel elongation character respectively;
    confirm against ``utils``.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text, as returned by ``utils.strip_tatweel``.
    """
    bom_free = re.sub(u"[\ufeff]", "", text, flags=re.UNICODE)
    arabic_only = utils.remove_non_arabic(bom_free)
    without_tashkeel = utils.strip_tashkeel(arabic_only)
    return utils.strip_tatweel(without_tashkeel)
|
|
|
|
|
def transliterate_word(input_word, direction='bw2ar'):
    """Transliterate a single word, character by character.

    Each character is looked up in the table selected by ``direction``;
    characters that have no entry in that table are copied to the output
    unchanged.

    Args:
        input_word: The word to convert.
        direction: ``'bw2ar'`` (Buckwalter to Arabic, via ``buck2uni``) or
            ``'ar2bw'`` (Arabic to Buckwalter, via ``uni2buck``).

    Returns:
        The transliterated word.

    Raises:
        SystemExit: If ``direction`` is neither ``'bw2ar'`` nor ``'ar2bw'``.
            An error message is written to stderr and the exit status is 1.
    """
    # Validate the direction once, before looking at any character, so that an
    # invalid direction is reported even when the word is empty.
    if direction == 'bw2ar':
        table = buck2uni
    elif direction == 'ar2bw':
        table = uni2buck
    else:
        sys.stderr.write('Error: invalid direction!')
        # Non-zero status: a bare sys.exit() would report success to the shell.
        sys.exit(1)

    return ''.join(table.get(char, char) for char in input_word)
|
|
|
|
|
|
|
def transliterate_text(input_text, direction='bw2ar'):
    """Transliterate a text word by word.

    The text is split on single space characters only, each piece is passed
    to ``transliterate_word``, and the results are re-joined with single
    spaces, so the number and position of spaces is preserved. Other
    whitespace (tabs, newlines) stays inside the pieces and is handled like
    any other character by ``transliterate_word``.

    Args:
        input_text: The text to convert.
        direction: ``'bw2ar'`` or ``'ar2bw'``; forwarded unchanged to
            ``transliterate_word``.

    Returns:
        The transliterated text.
    """
    return ' '.join(
        transliterate_word(word, direction) for word in input_text.split(' ')
    )
|
|
|
|
|
if __name__ == '__main__':
    # Command-line filter: reads text from stdin line by line and writes the
    # transliteration of every non-blank result to stdout.
    valid_directions = ('bw2ar', 'ar2bw')

    # Reject a missing or unknown direction before consuming any input.
    # The script name is sys.argv[0]; sys.argv[1] does not exist when the
    # argument is missing.
    if len(sys.argv) < 2 or sys.argv[1] not in valid_directions:
        sys.stderr.write(
            'Usage: INPUT TEXT | python {} DIRECTION(bw2ar|ar2bw)\n'.format(
                sys.argv[0]))
        sys.exit(1)

    direction = sys.argv[1]
    for line in sys.stdin:
        # Only Arabic input is cleaned; Buckwalter input is used as-is.
        if direction != 'bw2ar':
            line = clean_text(line)
        output_text = transliterate_text(line, direction=direction).strip()
        if output_text:
            sys.stdout.write('{}\n'.format(output_text))
|
|
|
|
|
|
|
|