File size: 4,473 Bytes
bcc0c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Functions to convert Arabic words/text into Buckwalter encoding and vice versa.
"""

import sys
import re
import utils

# Buckwalter (ASCII) symbol -> Arabic Unicode character.
# Every entry is a single-character key mapped to a single code point.
buck2uni = {
    # --- hamza forms and 'alif ---
    "'": u"\u0621",  # hamza-on-the-line
    "|": u"\u0622",  # madda
    ">": u"\u0623",  # hamza-on-'alif
    "&": u"\u0624",  # hamza-on-waaw
    "<": u"\u0625",  # hamza-under-'alif
    "}": u"\u0626",  # hamza-on-yaa'
    "A": u"\u0627",  # bare 'alif

    # --- letters baa' .. ghayn ---
    "b": u"\u0628",  # baa'
    "p": u"\u0629",  # taa' marbuuTa
    "t": u"\u062A",  # taa'
    "v": u"\u062B",  # thaa'
    "j": u"\u062C",  # jiim
    "H": u"\u062D",  # Haa'
    "x": u"\u062E",  # khaa'
    "d": u"\u062F",  # daal
    "*": u"\u0630",  # dhaal
    "r": u"\u0631",  # raa'
    "z": u"\u0632",  # zaay
    "s": u"\u0633",  # siin
    "$": u"\u0634",  # shiin
    "S": u"\u0635",  # Saad
    "D": u"\u0636",  # Daad
    "T": u"\u0637",  # Taa'
    "Z": u"\u0638",  # Zaa' (DHaa')
    "E": u"\u0639",  # cayn
    "g": u"\u063A",  # ghayn

    # --- taTwiil, then letters faa' .. yaa' ---
    "_": u"\u0640",  # taTwiil
    "f": u"\u0641",  # faa'
    "q": u"\u0642",  # qaaf
    "k": u"\u0643",  # kaaf
    "l": u"\u0644",  # laam
    "m": u"\u0645",  # miim
    "n": u"\u0646",  # nuun
    "h": u"\u0647",  # haa'
    "w": u"\u0648",  # waaw
    "Y": u"\u0649",  # 'alif maqSuura
    "y": u"\u064A",  # yaa'

    # --- vowel marks and other signs ---
    "F": u"\u064B",  # fatHatayn
    "N": u"\u064C",  # Dammatayn
    "K": u"\u064D",  # kasratayn
    "a": u"\u064E",  # fatHa
    "u": u"\u064F",  # Damma
    "i": u"\u0650",  # kasra
    "~": u"\u0651",  # shaddah
    "o": u"\u0652",  # sukuun
    "`": u"\u0670",  # dagger 'alif
    "{": u"\u0671",  # waSla
}

# Reverse table (Unicode -> Buckwalter): buck2uni with every key/value
# pair swapped, so each Arabic character maps back to its ASCII symbol.
uni2buck = {arabic_char: bw_char for bw_char, arabic_char in buck2uni.items()}

# Extra single code points that expand to a two-symbol Buckwalter sequence
# (these look like the laam + 'alif ligature forms -- TODO confirm).
uni2buck.update({
    u"\ufefb": "lA",
    u"\ufef7": "l>",
    u"\ufef5": "l|",
    u"\ufef9": "l<",
})

def clean_text(text):
    """Clean Arabic text of characters that may cause problems when
    building the language model.

    First removes every U+FEFF (ZERO WIDTH NO-BREAK SPACE) character; see
    http://www.fileformat.info/info/unicode/char/feff/index.htm for details.
    The result is then passed, in this order, through
    ``utils.remove_non_arabic``, ``utils.strip_tashkeel`` and
    ``utils.strip_tatweel``.  Those helpers are defined in the project's
    ``utils`` module; judging by their names they drop non-Arabic
    characters, diacritics and tatweel respectively (confirm against utils).

    Returns the cleaned text.
    """
    without_bom = re.sub(u"[\ufeff]", "", text, flags=re.UNICODE)
    arabic_only = utils.remove_non_arabic(without_bom)
    undiacritized = utils.strip_tashkeel(arabic_only)
    return utils.strip_tatweel(undiacritized)

def transliterate_word(input_word, direction='bw2ar'):
    """Convert a single word into Buckwalter encoding or back to Arabic.

    Args:
        input_word: the word to convert.
        direction: 'bw2ar' to look each character up in ``buck2uni``
            (Buckwalter -> Arabic), or 'ar2bw' to look it up in
            ``uni2buck`` (Arabic -> Buckwalter).

    Returns:
        The converted word.  A character with no entry in the selected
        table (e.g. a space) is copied through without any conversion.

    Raises:
        SystemExit: with status 1, after writing an error to stderr, when
            ``direction`` is neither 'bw2ar' nor 'ar2bw'.
    """
    # Pick the lookup table once, before touching the input, so that an
    # invalid direction is reported even when input_word is empty.
    if direction == 'bw2ar':
        table = buck2uni
    elif direction == 'ar2bw':
        table = uni2buck
    else:
        sys.stderr.write('Error: invalid direction!\n')
        sys.exit(1)  # non-zero status: this is a failure, not a clean exit

    return ''.join(table.get(char, char) for char in input_word)


def transliterate_text(input_text, direction='bw2ar'):
    """Convert a whole text into Buckwalter encoding or back to Arabic.

    The text is split on single space characters, each piece is converted
    with ``transliterate_word`` using the given ``direction``, and the
    converted pieces are joined back together with single spaces, so the
    number and position of the spaces is preserved.
    """
    converted = [
        transliterate_word(token, direction)
        for token in input_text.split(' ')
    ]
    return ' '.join(converted)


if __name__ == '__main__':
    # Command-line filter: reads lines from stdin, transliterates each one in
    # the direction given as the first argument, and writes every non-blank
    # result to stdout.
    if len(sys.argv) < 2 or sys.argv[1] not in ('bw2ar', 'ar2bw'):
        # sys.argv[0] is the script name; sys.argv[1] may not exist here.
        sys.stderr.write('Usage: INPUT TEXT | python {} DIRECTION(bw2ar|ar2bw)\n'.format(sys.argv[0]))
        sys.exit(1)

    direction = sys.argv[1]
    for line in sys.stdin:
        # Only Arabic input is cleaned; Buckwalter input is converted as-is.
        if direction != 'bw2ar':
            line = clean_text(line)
        output_text = transliterate_text(line, direction=direction).strip()
        if output_text:
            sys.stdout.write('{}\n'.format(output_text))