Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| from functools import partial | |
| from joeynmt.prediction import predict | |
| from joeynmt.helpers import ( | |
| check_version, | |
| load_checkpoint, | |
| load_config, | |
| parse_train_args, | |
| resolve_ckpt_path, | |
| ) | |
| from joeynmt.model import build_model | |
| from joeynmt.tokenizers import build_tokenizer | |
| from joeynmt.vocabulary import build_vocab | |
| from joeynmt.datasets import build_dataset | |
| import gradio as gr | |
# Example input: "سلاو لە ناو گلی کرد"

# Configuration file and checkpoint served by this demo.
cfg_file = 'config.yaml'
ckpt = './models/Sorani-Arabic/best.ckpt'

cfg = load_config(Path(cfg_file))

# Parse and validate the "training" section in prediction mode; the sixth
# returned value is not needed here.
(
    model_dir,
    load_model,
    device,
    n_gpu,
    num_workers,
    _,
    fp16,
) = parse_train_args(cfg["training"], mode="prediction")

test_cfg = cfg["testing"]
src_cfg = cfg["data"]["src"]
trg_cfg = cfg["data"]["trg"]

# An explicitly given checkpoint takes precedence over the configured one.
if ckpt is not None:
    load_model = Path(ckpt)
ckpt = resolve_ckpt_path(load_model, model_dir)

# Vocabularies first: the model is built from them.
src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

# Restore the trained weights from disk.
model_checkpoint = load_checkpoint(ckpt, device=device)
model.load_state_dict(model_checkpoint["model_state"])

# Only move the model when a CUDA device was selected.
if device.type == "cuda":
    model.to(device)

tokenizer = build_tokenizer(cfg["data"])
# Per-language encoders: source sentences are mapped to ids (EOS appended,
# no BOS); no encoder is registered for the target language.
sequence_encoder = {}
sequence_encoder[src_cfg["lang"]] = partial(
    src_vocab.sentences_to_ids, bos=False, eos=True
)
sequence_encoder[trg_cfg["lang"]] = None

# One sentence per batch.
test_cfg["batch_size"] = 1  # CAUTION: this will raise an error if n_gpus > 1
test_cfg["batch_type"] = "sentence"

# Stream dataset without a backing file; sentences are supplied at request
# time through test_data.set_item(...).
test_data = build_dataset(
    dataset_type="stream",
    path=None,
    src_lang=src_cfg["lang"],
    trg_lang=trg_cfg["lang"],
    split="test",
    tokenizer=tokenizer,
    sequence_encoder=sequence_encoder,
)
def _translate_data(test_data, cfg=test_cfg):
    """Run the model over ``test_data`` and return its first hypothesis.

    The model, device and runtime settings (``n_gpu``, ``num_workers``,
    ``fp16``) are read from module scope; ``cfg`` defaults to the
    module-level ``test_cfg``.
    """
    predict_kwargs = dict(
        model=model,
        data=test_data,
        compute_loss=False,
        device=device,
        n_gpu=n_gpu,
        normalization="none",
        num_workers=num_workers,
        cfg=cfg,
        fp16=fp16,
    )
    # predict() hands back six values; only the third one is used here.
    _, _, hyps, _tokens, _scores, _ = predict(**predict_kwargs)
    return hyps[0]
def normalize(text, language_script):
    """Normalize ``text`` with the loaded model and return the result.

    Args:
        text: Noisy input text entered in the UI.
        language_script: Label chosen in the language dropdown.
            NOTE(review): this value is currently unused; every request is
            served by the single module-level model, regardless of the
            selection.

    Returns:
        The first hypothesis produced by ``_translate_data``, or an empty
        string when ``text`` is empty or contains only whitespace.
    """
    # Nothing to normalize: answer directly instead of pushing a blank
    # sentence through the dataset and the model.
    if not text or not text.strip():
        return ""

    # NOTE(review): test_data is shared module-level state and is never
    # reset here. Confirm that set_item() replaces the pending sentence
    # rather than appending to it; otherwise the first hypothesis may belong
    # to an earlier request.
    test_data.set_item(text)
    return _translate_data(test_data)
# Heading shown at the top of the demo page.
title = "Script Normalization for Unconventional Writing"

# HTML blurb rendered under the title: three sample sentences followed by a
# short introduction to the task, the paper and the GitHub project.
description = """
<ul>
<li>"<em>mar7aba!</em>"</li>
<li>"<em>هاو ئار یوو؟</em>"</li>
<li>"<em>Μπιάνβενου α σετ ντεμό!</em>"</li>
</ul>
<p>What all these sentences are in common? Being greeted in Arabic with "<em>mar7aba</em>" written in the Latin script, then asked how you are ("<em>هاو ئار یوو؟</em>") in English using the Perso-Arabic script of Kurdish and then, welcomed to this demo in French ("<em>Μπιάνβενου α σετ ντεμό!</em>") written in Greek script. All these sentences are written in an <strong>unconventional</strong> script.</p>
<p>Although you may find these sentences risible, unconventional writing is a common practice among millions of speakers in bilingual communities. In our paper entitled "<a href="https://sinaahmadi.github.io/docs/articles/ahmadi2023acl.pdf" target="_blank"><strong>Script Normalization for Unconventional Writing of Under-Resourced Languages in Bilingual Communities</strong></a>", we shed light on this problem and propose an approach to normalize noisy text written in unconventional writing.</p>
<p>This demo deploys a few models that are trained for <strong>the normalization of unconventional writing</strong>. Please note that this tool is not a spell-checker and cannot correct errors beyond character normalization.</p>
For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a>
"""
# Dropdown label -> language/script pair identifier.
# NOTE(review): the values look like model directory names (compare the
# checkpoint path './models/Sorani-Arabic/best.ckpt'), but nothing in this
# file reads them; only the keys are used, as dropdown choices.
languages_scripts = {
    "Azeri Turkish in Persian": "AzeriTurkish-Persian",
    "Central Kurdish in Arabic": "Sorani-Arabic",
    "Central Kurdish in Persian": "Sorani-Persian",
    "Gilaki in Persian": "Gilaki-Persian",
    "Gorani in Arabic": "Gorani-Arabic",
    "Gorani in Central Kurdish": "Gorani-Sorani",
    "Gorani in Persian": "Gorani-Persian",
    "Kashmiri in Urdu": "Kashmiri-Urdu",
    "Mazandarani in Persian": "Mazandarani-Persian",
    "Northern Kurdish in Arabic": "Kurmanji-Arabic",
    "Northern Kurdish in Persian": "Kurmanji-Persian",
    "Sindhi in Urdu": "Sindhi-Urdu",
}
# Sample rows offered in the UI: [noisy text, language dropdown label].
examples = [
    [
        "ياخوا تةمةن دريژبيت بوئةم ميللةتة",
        "Central Kurdish in Arabic",
    ],
    [
        "سلاو برا جونی؟",
        "Central Kurdish in Arabic",
    ],
]
# Build the web UI: a text box plus a language dropdown feed normalize(),
# whose return value is shown in a single output text box.
# The top-level gr.Textbox component is used for both input and output so it
# matches the top-level gr.Dropdown already used here, instead of the legacy
# gr.inputs / gr.outputs namespaces.
demo = gr.Interface(
    fn=normalize,
    title=title,
    description=description,
    inputs=[
        gr.Textbox(lines=4, label="Noisy Text"),
        gr.Dropdown(
            label="Language in unconventional script",
            # Iterating the dict yields its keys; show them alphabetically.
            choices=sorted(languages_scripts),
        ),
    ],
    outputs=gr.Textbox(label="Normalized Text"),
    examples=examples,
)

# Start serving the demo.
demo.launch()