|
""" |
|
Note: this module is used by bw2ar.py.
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" Constants for arabic """ |
|
import re |
|
# Single-character constants from the Unicode Arabic block.  The ``u``
# prefix is kept so the escapes also resolve on Python 2.

# Punctuation.
COMMA = u'\u060C'
SEMICOLON = u'\u061B'
QUESTION = u'\u061F'

# Letters, in code-point order (U+0621 - U+064A).  TATWEEL (U+0640) is the
# elongation character that sits between the two letter ranges.
HAMZA = u'\u0621'
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA = u'\u0626'
ALEF = u'\u0627'
BEH = u'\u0628'
TEH_MARBUTA = u'\u0629'
TEH = u'\u062a'
THEH = u'\u062b'
JEEM = u'\u062c'
HAH = u'\u062d'
KHAH = u'\u062e'
DAL = u'\u062f'
THAL = u'\u0630'
REH = u'\u0631'
ZAIN = u'\u0632'
SEEN = u'\u0633'
SHEEN = u'\u0634'
SAD = u'\u0635'
DAD = u'\u0636'
TAH = u'\u0637'
ZAH = u'\u0638'
AIN = u'\u0639'
GHAIN = u'\u063a'
TATWEEL = u'\u0640'
FEH = u'\u0641'
QAF = u'\u0642'
KAF = u'\u0643'
LAM = u'\u0644'
MEEM = u'\u0645'
NOON = u'\u0646'
HEH = u'\u0647'
WAW = u'\u0648'
ALEF_MAKSURA = u'\u0649'
YEH = u'\u064a'

# Combining madda / hamza marks.
MADDA_ABOVE = u'\u0653'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'

# Arabic-Indic digits (U+0660 - U+0669).
ZERO = u'\u0660'
ONE = u'\u0661'
TWO = u'\u0662'
THREE = u'\u0663'
FOUR = u'\u0664'
FIVE = u'\u0665'
SIX = u'\u0666'
SEVEN = u'\u0667'
EIGHT = u'\u0668'
NINE = u'\u0669'

# Numeric punctuation and symbols.
PERCENT = u'\u066a'
DECIMAL = u'\u066b'
THOUSANDS = u'\u066c'
STAR = u'\u066d'

# Miscellaneous.
MINI_ALEF = u'\u0670'
ALEF_WASLA = u'\u0671'
FULL_STOP = u'\u06d4'
BYTE_ORDER_MARK = u'\ufeff'
|
|
|
|
|
# Diacritics (U+064B - U+0652): the three tanween marks, the three short
# vowels, then shadda and sukun.
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'
|
|
|
|
|
# Lam-alef ligatures as single presentation-form code points.
LAM_ALEF = u'\ufefb'
LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
LAM_ALEF_HAMZA_BELOW = u'\ufef9'
LAM_ALEF_MADDA_ABOVE = u'\ufef5'

# The same sequences spelled as two code points: LAM followed by the
# matching alef variant.
SIMPLE_LAM_ALEF = u'\u0644\u0627'
SIMPLE_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
SIMPLE_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
SIMPLE_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'
|
|
|
|
|
# Pre-compiled single-character classes built from the module's constants.

# Tanween, short vowels, sukun and shadda.
HARAKAT_PAT = re.compile(u"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN,
                                          FATHA, DAMMA, KASRA, SUKUN,
                                          SHADDA]) + u"]")

# Hamza seated on waw or yeh.
HAMZAT_PAT = re.compile(u"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")

# Alef variants (madda, hamza above/below) plus the combining hamza marks.
ALEFAT_PAT = re.compile(u"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE,
                                         ALEF_HAMZA_BELOW, HAMZA_ABOVE,
                                         HAMZA_BELOW]) + u"]")

# Presentation-form lam-alef ligatures.
LAMALEFAT_PAT = re.compile(u"[" + u"".join([LAM_ALEF,
                                            LAM_ALEF_HAMZA_ABOVE,
                                            LAM_ALEF_HAMZA_BELOW,
                                            LAM_ALEF_MADDA_ABOVE]) + u"]")
|
|
|
def strip_tashkeel(text):
    """Return *text* with diacritics and alef wasla removed.

    Every character matched by ``HARAKAT_PAT`` (tanween, fatha, damma,
    kasra, sukun, shadda) is deleted, then every ``ALEF_WASLA`` (U+0671)
    is deleted as well.
    """
    # FATHA (U+064E) is already part of HARAKAT_PAT, so no separate pass
    # is needed for it.
    text = HARAKAT_PAT.sub(u'', text)
    # ALEF_WASLA is dropped outright; it is not normalised to a plain ALEF.
    return text.replace(ALEF_WASLA, u'')
|
|
|
def strip_tatweel(text):
    """Return *text* with every TATWEEL (U+0640) character removed."""
    # A plain replace is enough for one literal character; no regex needed.
    return text.replace(TATWEEL, u'')
|
|
|
|
|
# Matches any character that is neither a space nor in the letter ranges
# U+0621 - U+063A / U+0641 - U+064A.
_NON_ARABIC_PAT = re.compile(u"[^\u0621-\u063A\u0641-\u064A ]", re.UNICODE)


def remove_non_arabic(text):
    """Return only the Arabic letters of *text*, separated by single spaces.

    Diacritics and tatweel are deleted first so that they do not split a
    word.  Every remaining character outside U+0621 - U+063A and
    U+0641 - U+064A (other than a space) is then replaced by a space, and
    runs of whitespace are collapsed; the result has no leading or trailing
    space.
    """
    text = strip_tatweel(strip_tashkeel(text))
    spaced = _NON_ARABIC_PAT.sub(u" ", text)
    return u' '.join(spaced.split())
|
|
|
|