Spaces:

MohamedRashad
/

arabic-auto-tashkeel

Running

File size: 4,473 Bytes

bcc0c7f

"""
Functions to convert Arabic words/text into Buckwalter encoding and vice versa.
"""

import sys
import re
import utils

# Buckwalter symbol -> Arabic Unicode character.
# One ASCII symbol per Arabic code point; the trailing comment on each entry
# names the letter or diacritic.
buck2uni = {
    # hamza forms and 'alif
    "'": u"\u0621",  # hamza-on-the-line
    "|": u"\u0622",  # madda
    ">": u"\u0623",  # hamza-on-'alif
    "&": u"\u0624",  # hamza-on-waaw
    "<": u"\u0625",  # hamza-under-'alif
    "}": u"\u0626",  # hamza-on-yaa'
    "A": u"\u0627",  # bare 'alif
    # consonants
    "b": u"\u0628",  # baa'
    "p": u"\u0629",  # taa' marbuuTa
    "t": u"\u062A",  # taa'
    "v": u"\u062B",  # thaa'
    "j": u"\u062C",  # jiim
    "H": u"\u062D",  # Haa'
    "x": u"\u062E",  # khaa'
    "d": u"\u062F",  # daal
    "*": u"\u0630",  # dhaal
    "r": u"\u0631",  # raa'
    "z": u"\u0632",  # zaay
    "s": u"\u0633",  # siin
    "$": u"\u0634",  # shiin
    "S": u"\u0635",  # Saad
    "D": u"\u0636",  # Daad
    "T": u"\u0637",  # Taa'
    "Z": u"\u0638",  # Zaa' (DHaa')
    "E": u"\u0639",  # cayn
    "g": u"\u063A",  # ghayn
    "_": u"\u0640",  # taTwiil
    "f": u"\u0641",  # faa'
    "q": u"\u0642",  # qaaf
    "k": u"\u0643",  # kaaf
    "l": u"\u0644",  # laam
    "m": u"\u0645",  # miim
    "n": u"\u0646",  # nuun
    "h": u"\u0647",  # haa'
    "w": u"\u0648",  # waaw
    "Y": u"\u0649",  # 'alif maqSuura
    "y": u"\u064A",  # yaa'
    # diacritics
    "F": u"\u064B",  # fatHatayn
    "N": u"\u064C",  # Dammatayn
    "K": u"\u064D",  # kasratayn
    "a": u"\u064E",  # fatHa
    "u": u"\u064F",  # Damma
    "i": u"\u0650",  # kasra
    "~": u"\u0651",  # shaddah
    "o": u"\u0652",  # sukuun
    "`": u"\u0670",  # dagger 'alif
    "{": u"\u0671",  # waSla
}

# Arabic Unicode character -> Buckwalter symbol: the inverse of buck2uni.
# Built with a comprehension so no loop temporaries are left behind as
# module-level names.
uni2buck = {arabic: symbol for symbol, arabic in buck2uni.items()}

# Lam-alef ligatures from Arabic Presentation Forms-B. Each ligature is a
# single code point standing for two letters, so it maps to a two-symbol
# Buckwalter string. Both the isolated and the final presentation form of
# every ligature are covered; they transliterate identically.
uni2buck.update({
    u"\ufefb": "lA",  # laam + bare 'alif, isolated form
    u"\ufefc": "lA",  # laam + bare 'alif, final form
    u"\ufef7": "l>",  # laam + hamza-on-'alif, isolated form
    u"\ufef8": "l>",  # laam + hamza-on-'alif, final form
    u"\ufef5": "l|",  # laam + madda, isolated form
    u"\ufef6": "l|",  # laam + madda, final form
    u"\ufef9": "l<",  # laam + hamza-under-'alif, isolated form
    u"\ufefa": "l<",  # laam + hamza-under-'alif, final form
})

# Strip characters from Arabic text that may cause problems when building the
# language model.
def clean_text(text):
    """Return ``text`` with the BOM removed and the ``utils`` filters applied.

    Steps, in order:
      1. delete every U+FEFF (ZERO WIDTH NO-BREAK SPACE / byte-order mark),
         see http://www.fileformat.info/info/unicode/char/feff/index.htm
      2. ``utils.remove_non_arabic``
      3. ``utils.strip_tashkeel``
      4. ``utils.strip_tatweel``

    The three ``utils`` helpers are defined outside this file; going by their
    names they drop non-Arabic characters, diacritics and tatweel
    respectively -- confirm against ``utils`` before relying on details.
    """
    without_bom = re.sub(u"[\ufeff]", "", text, flags=re.UNICODE)
    arabic_only = utils.remove_non_arabic(without_bom)
    return utils.strip_tatweel(utils.strip_tashkeel(arabic_only))

# convert a single word into buckwalter and vice versa
def transliterate_word(input_word, direction='bw2ar'):
    """Transliterate one word between Buckwalter and Arabic script.

    Each character is looked up in the table selected by ``direction``.
    Characters with no entry in that table (spaces, digits, punctuation,
    newlines, ...) are copied to the output unchanged.

    Args:
        input_word: the string to convert.
        direction: ``'bw2ar'`` (Buckwalter -> Arabic, the default) or
            ``'ar2bw'`` (Arabic -> Buckwalter).

    Returns:
        The transliterated string. In the ``'ar2bw'`` direction it can be
        longer than the input, because some entries of ``uni2buck`` map one
        character to two symbols.

    Raises:
        SystemExit: with exit status 1, after writing an error message to
            stderr, when ``direction`` is not one of the two accepted values.
    """
    # Pick the table once, before touching the input, so that an invalid
    # direction is reported even for an empty word and the check is not
    # repeated for every character.
    if direction == 'bw2ar':
        table = buck2uni
    elif direction == 'ar2bw':
        table = uni2buck
    else:
        sys.stderr.write('Error: invalid direction!\n')
        # Non-zero status: this is an error exit, not a normal termination.
        sys.exit(1)

    return ''.join(table.get(char, char) for char in input_word)


# convert a text into buckwalter and vice versa
def transliterate_text(input_text, direction='bw2ar'):
    """Transliterate a whole text, word by word.

    The text is split on single space characters, each piece is passed to
    ``transliterate_word`` with the given ``direction``, and the results are
    put back together with single spaces. Runs of spaces therefore survive
    unchanged, as each one delimits an empty piece.
    """
    pieces = input_text.split(' ')
    converted = (transliterate_word(piece, direction) for piece in pieces)
    return ' '.join(converted)


def main():
    """Command-line entry point: transliterate stdin line by line.

    Usage: ``INPUT TEXT | python <script> DIRECTION`` where DIRECTION is
    ``bw2ar`` or ``ar2bw``.

    For ``ar2bw`` each line is first passed through ``clean_text``; for
    ``bw2ar`` it is transliterated as-is. Lines whose result is empty after
    stripping whitespace are not written.

    Raises:
        SystemExit: with exit status 1, after printing the usage message to
            stderr, when the direction argument is missing or invalid.
    """
    # sys.argv[0] is the script name; sys.argv[1] does not exist when the
    # argument is missing, so it must not be used to build the usage text.
    if len(sys.argv) < 2 or sys.argv[1] not in ('bw2ar', 'ar2bw'):
        sys.stderr.write('Usage: INPUT TEXT | python {} DIRECTION(bw2ar|ar2bw)\n'.format(sys.argv[0]))
        sys.exit(1)

    direction = sys.argv[1]
    for line in sys.stdin:
        if direction == 'ar2bw':
            line = clean_text(line)
        output_text = transliterate_text(line, direction=direction).strip()
        if output_text:
            sys.stdout.write('{}\n'.format(output_text))


if __name__ == '__main__':
    main()