diff --git "a/data/Afro-Asiatic.json" "b/data/Afro-Asiatic.json" --- "a/data/Afro-Asiatic.json" +++ "b/data/Afro-Asiatic.json" @@ -2,6171 +2,14680 @@ "name": "Afro-Asiatic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Berber", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Guanche", "iso_1_code": null, "iso_3_code": "gnc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "11", + "scripts": [], + "own_tokenizer": false }, { "name": "Eastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Awjila-Sokna", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Awjilah", "iso_1_code": null, "iso_3_code": "auj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "14", + "scripts": [], + "own_tokenizer": false }, { "name": "Sawknah", "iso_1_code": null, "iso_3_code": "swn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "15", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "13", + "scripts": [], + "own_tokenizer": false }, { "name": "Siwa", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Siwi", "iso_1_code": null, "iso_3_code": "siz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "17", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "16", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "12", + "scripts": [], + "own_tokenizer": false }, { "name": "Northern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Chenoua", "iso_1_code": null, "iso_3_code": "cnu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "19", + "scripts": [], + "own_tokenizer": false }, { "name": "Atlas", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Judeo-Berber", "iso_1_code": null, "iso_3_code": "jbe", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "21", + "scripts": [], + "own_tokenizer": false }, { "name": "Tachelhit", "iso_1_code": null, "iso_3_code": "shi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "22", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Tamazight, Central Atlas", "iso_1_code": null, "iso_3_code": "tzm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "23", + "scripts": [ + "Tfng" + ], + "own_tokenizer": false }, { "name": "Tamazight, Standard Moroccan", "iso_1_code": null, "iso_3_code": "zgh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "24", + "scripts": [ + "Tfng" + ], + "own_tokenizer": false } - ] + ], + "node_i": "20", + "scripts": [], + "own_tokenizer": false }, { "name": "Kabyle", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kabyle", "iso_1_code": null, "iso_3_code": "kab", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "26", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "25", + "scripts": [], + "own_tokenizer": false }, { "name": "Zenati", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "East", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ghadam\u00e8s", "iso_1_code": null, "iso_3_code": "gha", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "29", + "scripts": [], + "own_tokenizer": false }, { "name": "Nafusi", "iso_1_code": null, "iso_3_code": "jbn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "30", + "scripts": [], + "own_tokenizer": false }, { "name": "Sened", "iso_1_code": null, "iso_3_code": "sds", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "31", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "28", + "scripts": [], + "own_tokenizer": false }, { "name": "Ghomara", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ghomara", "iso_1_code": null, "iso_3_code": "gho", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "33", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "32", + "scripts": [], + "own_tokenizer": false }, { "name": "Mzab-Wargla", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Taznatit", "iso_1_code": null, "iso_3_code": "grr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "35", + "scripts": [], + "own_tokenizer": false }, { "name": "Tumzabt", "iso_1_code": null, "iso_3_code": "mzb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "36", + "scripts": [], + "own_tokenizer": false }, { "name": "Tagargrent", "iso_1_code": null, "iso_3_code": "oua", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "37", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamazight, Temacine", "iso_1_code": null, "iso_3_code": "tjo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "38", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "34", + "scripts": [], + "own_tokenizer": false }, { "name": "Riff", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tarifit", "iso_1_code": null, "iso_3_code": "rif", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "40", + "scripts": [], + "own_tokenizer": false }, { "name": "Senhaja Berber", "iso_1_code": null, "iso_3_code": "sjs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "41", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "39", + "scripts": [], + "own_tokenizer": false }, { "name": "Shawiya", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tachawit", "iso_1_code": null, "iso_3_code": "shy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "43", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "42", + "scripts": [], + "own_tokenizer": false }, { "name": "Tidikelt", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tamazight, Tidikelt", "iso_1_code": null, "iso_3_code": "tia", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "45", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "44", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "27", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "18", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamasheq", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Northern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tamahaq, Tahaggart", "iso_1_code": null, "iso_3_code": "thv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "48", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "47", + "scripts": [], + "own_tokenizer": false }, { "name": "Southern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tamasheq", "iso_1_code": null, "iso_3_code": "taq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "50", + "scripts": [ + "Latn", + "Tfng" + ], + "own_tokenizer": false }, { "name": "Tamajeq, Tayart", "iso_1_code": null, "iso_3_code": "thz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "51", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamajaq, Tawallammat", "iso_1_code": null, "iso_3_code": "ttq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "52", + "scripts": [ + "Latn", + "Tfng" + ], + "own_tokenizer": false } - ] + ], + "node_i": "49", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "46", + "scripts": [], + "own_tokenizer": false }, { "name": "Zenaga", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tetserret", "iso_1_code": null, "iso_3_code": "tez", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "54", + "scripts": [], + "own_tokenizer": false }, { "name": "Zenaga", "iso_1_code": null, "iso_3_code": "zen", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "55", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "53", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "10", + "scripts": [], + "own_tokenizer": false }, { "name": "Chadic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Biu-Mandara", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "A", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "A.1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Eastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Boga", "iso_1_code": null, "iso_3_code": "bvw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "61", + "scripts": [], + "own_tokenizer": false }, { "name": "Ga\u2019anda", "iso_1_code": null, "iso_3_code": "gqa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "62", + "scripts": [], + "own_tokenizer": false }, { "name": "Hwana", "iso_1_code": null, "iso_3_code": "hwo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "63", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "60", + "scripts": [], + "own_tokenizer": false }, { "name": "Western", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Jara", "iso_1_code": null, "iso_3_code": "jaf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "65", + "scripts": [], + "own_tokenizer": false }, { "name": "Tera", "iso_1_code": null, "iso_3_code": "ttr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "66", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "64", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "59", + "scripts": [], + "own_tokenizer": false }, { "name": "A.2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Nggwahyi", "iso_1_code": null, "iso_3_code": "ngx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "68", + "scripts": [], + "own_tokenizer": false }, { "name": "1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bura-Pabir", "iso_1_code": null, "iso_3_code": "bwr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "70", + "scripts": [], + "own_tokenizer": false }, { "name": "Kibaku", "iso_1_code": null, "iso_3_code": "ckl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "71", + "scripts": [], + "own_tokenizer": false }, { "name": "Kofa", "iso_1_code": null, "iso_3_code": "kso", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "72", + "scripts": [], + "own_tokenizer": false }, { "name": "Putai", "iso_1_code": null, "iso_3_code": "mfl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "73", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "69", + "scripts": [], + "own_tokenizer": false }, { "name": "2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Nya Huba", "iso_1_code": null, "iso_3_code": "hbb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "75", + "scripts": [], + "own_tokenizer": false }, { "name": "Marghi South", "iso_1_code": null, "iso_3_code": "mfm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "76", + "scripts": [], + "own_tokenizer": false }, { "name": "Marghi Central", "iso_1_code": null, "iso_3_code": "mrt", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "77", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "74", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "67", + "scripts": [], + "own_tokenizer": false }, { "name": "A.3", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bana", "iso_1_code": null, "iso_3_code": "bcw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "79", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Kirya-Konzel", "iso_1_code": null, "iso_3_code": "fkk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "80", + "scripts": [], + "own_tokenizer": false }, { "name": "Kamwe", "iso_1_code": null, "iso_3_code": "hig", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "81", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Hya", "iso_1_code": null, "iso_3_code": "hya", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "82", + "scripts": [], + "own_tokenizer": false }, { "name": "Psikye", "iso_1_code": null, "iso_3_code": "kvj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "83", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "78", + "scripts": [], + "own_tokenizer": false }, { "name": "A.4", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Lamang", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Lamang", "iso_1_code": null, "iso_3_code": "hia", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "86", + "scripts": [], + "own_tokenizer": false }, { "name": "Vemgo-Mabas", "iso_1_code": null, "iso_3_code": "vem", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "87", + "scripts": [], + "own_tokenizer": false }, { "name": "Hdi", "iso_1_code": null, "iso_3_code": "xed", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "88", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "85", + "scripts": [], + "own_tokenizer": false }, { "name": "Mandara Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Glavda", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Cineni", "iso_1_code": null, "iso_3_code": "cie", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "91", + "scripts": [], + "own_tokenizer": false }, { "name": "Dghwede", "iso_1_code": null, "iso_3_code": "dgh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "92", + "scripts": [], + "own_tokenizer": false }, { "name": "Guduf-Gava", "iso_1_code": null, "iso_3_code": "gdf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "93", + "scripts": [], + "own_tokenizer": false }, { "name": "Glavda", "iso_1_code": null, "iso_3_code": "glw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "94", + "scripts": [], + "own_tokenizer": false }, { "name": "Gvoko", "iso_1_code": null, "iso_3_code": "ngs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "95", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "90", + "scripts": [], + "own_tokenizer": false }, { "name": "Mandara", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Wandala", "iso_1_code": null, "iso_3_code": "mfi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "97", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "96", + "scripts": [], + "own_tokenizer": false }, { "name": "Podoko", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Parkwa", "iso_1_code": null, "iso_3_code": "pbi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "99", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "98", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "89", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "84", + "scripts": [], + "own_tokenizer": false }, { "name": "A.5", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Baldemu", "iso_1_code": null, "iso_3_code": "bdn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "101", + "scripts": [], + "own_tokenizer": false }, { "name": "Cuvok", "iso_1_code": null, "iso_3_code": "cuv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "102", + "scripts": [], + "own_tokenizer": false }, { "name": "Dugwor", "iso_1_code": null, "iso_3_code": "dme", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "103", + "scripts": [], + "own_tokenizer": false }, { "name": "Giziga, North", "iso_1_code": null, "iso_3_code": "gis", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "104", + "scripts": [], + "own_tokenizer": false }, { "name": "Giziga", "iso_1_code": null, "iso_3_code": "giz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "105", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Zulgo-Gemzek", "iso_1_code": null, "iso_3_code": "gnd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "106", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Mafa", "iso_1_code": null, "iso_3_code": "maf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "107", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Merey", "iso_1_code": null, "iso_3_code": "meq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "108", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Matal", "iso_1_code": null, "iso_3_code": "mfh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "109", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Mefele", "iso_1_code": null, "iso_3_code": "mfj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "110", + "scripts": [], + "own_tokenizer": false }, { "name": "Mofu, North", "iso_1_code": null, "iso_3_code": "mfk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "111", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Mofu-Gudur", "iso_1_code": null, "iso_3_code": "mif", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "112", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Vame", "iso_1_code": null, "iso_3_code": "mlr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "113", + "scripts": [], + "own_tokenizer": false }, { "name": "Moloko", "iso_1_code": null, "iso_3_code": "mlw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "114", + "scripts": [], + "own_tokenizer": false }, { "name": "Mbuko", "iso_1_code": null, "iso_3_code": "mqb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "115", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Muyang", "iso_1_code": null, "iso_3_code": "muy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "116", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Mada", "iso_1_code": null, "iso_3_code": "mxu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "117", + "scripts": [], + "own_tokenizer": false }, { "name": "Wuzlam", "iso_1_code": null, "iso_3_code": "udl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "118", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "100", + "scripts": [], + "own_tokenizer": false }, { "name": "A.6", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Sukur", "iso_1_code": null, "iso_3_code": "syk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "120", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "119", + "scripts": [], + "own_tokenizer": false }, { "name": "A.7", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Buwal", "iso_1_code": null, "iso_3_code": "bhs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "122", + "scripts": [], + "own_tokenizer": false }, { "name": "Daba", "iso_1_code": null, "iso_3_code": "dbq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "123", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Mazagway-Hidi", "iso_1_code": null, "iso_3_code": "dkx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "124", + "scripts": [], + "own_tokenizer": false }, { "name": "Gavar", "iso_1_code": null, "iso_3_code": "gou", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "125", + "scripts": [], + "own_tokenizer": false }, { "name": "Mina", "iso_1_code": null, "iso_3_code": "hna", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "126", + "scripts": [], + "own_tokenizer": false }, { "name": "Mbudum", "iso_1_code": null, "iso_3_code": "xmd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "127", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "121", + "scripts": [], + "own_tokenizer": false }, { "name": "A.8", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bacama", "iso_1_code": null, "iso_3_code": "bcy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "129", + "scripts": [], + "own_tokenizer": false }, { "name": "Bata", "iso_1_code": null, "iso_3_code": "bta", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "130", + "scripts": [], + "own_tokenizer": false }, { "name": "Fali Muchella", "iso_1_code": null, "iso_3_code": "fli", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "131", + "scripts": [], + "own_tokenizer": false }, { "name": "Gude", "iso_1_code": null, "iso_3_code": "gde", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "132", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Gudu", "iso_1_code": null, "iso_3_code": "gdu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "133", + "scripts": [], + "own_tokenizer": false }, { "name": "Holma", "iso_1_code": null, "iso_3_code": "hod", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "134", + "scripts": [], + "own_tokenizer": false }, { "name": "Jimjimen", "iso_1_code": null, "iso_3_code": "jim", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "135", + "scripts": [], + "own_tokenizer": false }, { "name": "Ngwaba", "iso_1_code": null, "iso_3_code": "ngw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "136", + "scripts": [], + "own_tokenizer": false }, { "name": "Nzanyi", "iso_1_code": null, "iso_3_code": "nja", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "137", + "scripts": [], + "own_tokenizer": false }, { "name": "Sharwa", "iso_1_code": null, "iso_3_code": "swq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "138", + "scripts": [], + "own_tokenizer": false }, { "name": "Tsuvan", "iso_1_code": null, "iso_3_code": "tsh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "139", + "scripts": [], + "own_tokenizer": false }, { "name": "Zizilivakan", "iso_1_code": null, "iso_3_code": "ziz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "140", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "128", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "58", + "scripts": [], + "own_tokenizer": false }, { "name": "B", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "B.1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Auyokawa", "iso_1_code": null, "iso_3_code": "auo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "143", + "scripts": [], + "own_tokenizer": false }, { "name": "Jilbe", "iso_1_code": null, "iso_3_code": "jie", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "144", + "scripts": [], + "own_tokenizer": false }, { "name": "Buduma", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Buduma", "iso_1_code": null, "iso_3_code": "bdm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "146", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "145", + "scripts": [], + "own_tokenizer": false }, { "name": "Jina", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Jina", "iso_1_code": null, "iso_3_code": "jia", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "148", + "scripts": [], + "own_tokenizer": false }, { "name": "Majera", "iso_1_code": null, "iso_3_code": "xmj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "149", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "147", + "scripts": [], + "own_tokenizer": false }, { "name": "Kotoko Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "North", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Afade", "iso_1_code": null, "iso_3_code": "aal", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "152", + "scripts": [], + "own_tokenizer": false }, { "name": "Mpade", "iso_1_code": null, "iso_3_code": "mpi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "153", + "scripts": [], + "own_tokenizer": false }, { "name": "Maslam", "iso_1_code": null, "iso_3_code": "msv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "154", + "scripts": [], + "own_tokenizer": false }, { "name": "Malgbe", "iso_1_code": null, "iso_3_code": "mxf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "155", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "151", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Lagwan", "iso_1_code": null, "iso_3_code": "kot", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "157", + "scripts": [], + "own_tokenizer": false }, { "name": "Mser", "iso_1_code": null, "iso_3_code": "kqx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "158", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "156", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "150", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "142", + "scripts": [], + "own_tokenizer": false }, { "name": "B.2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Muskum", "iso_1_code": null, "iso_3_code": "mje", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "160", + "scripts": [], + "own_tokenizer": false }, { "name": "Mbara", "iso_1_code": null, "iso_3_code": "mpk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "161", + "scripts": [], + "own_tokenizer": false }, { "name": "Musgu", "iso_1_code": null, "iso_3_code": "mug", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "162", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "159", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "141", + "scripts": [], + "own_tokenizer": false }, { "name": "C", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gidar", "iso_1_code": null, "iso_3_code": "gid", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "164", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "163", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "57", + "scripts": [], + "own_tokenizer": false }, { "name": "East", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "A", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "A.1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Buso", "iso_1_code": null, "iso_3_code": "bso", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "168", + "scripts": [], + "own_tokenizer": false }, { "name": "1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Mulgi", "iso_1_code": null, "iso_3_code": "mvh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "170", + "scripts": [], + "own_tokenizer": false }, { "name": "Ndam", "iso_1_code": null, "iso_3_code": "ndm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "171", + "scripts": [], + "own_tokenizer": false }, { "name": "Soumraye", "iso_1_code": null, "iso_3_code": "sor", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "172", + "scripts": [], + "own_tokenizer": false }, { "name": "Tumak", "iso_1_code": null, "iso_3_code": "tmc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "173", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "169", + "scripts": [], + "own_tokenizer": false }, { "name": "2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Boor", "iso_1_code": null, "iso_3_code": "bvf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "175", + "scripts": [], + "own_tokenizer": false }, { "name": "Gadang", "iso_1_code": null, "iso_3_code": "gdk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "176", + "scripts": [], + "own_tokenizer": false }, { "name": "Miltu", "iso_1_code": null, "iso_3_code": "mlj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "177", + "scripts": [], + "own_tokenizer": false }, { "name": "Sarua", "iso_1_code": null, "iso_3_code": "swy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "178", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "174", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "167", + "scripts": [], + "own_tokenizer": false }, { "name": "A.2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kimr\u00e9", "iso_1_code": null, "iso_3_code": "kqp", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "181", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Lele", "iso_1_code": null, "iso_3_code": "lln", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "182", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Nancere", "iso_1_code": null, "iso_3_code": "nnc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "183", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "180", + "scripts": [], + "own_tokenizer": false }, { "name": "2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gabri", "iso_1_code": null, "iso_3_code": "gab", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "185", + "scripts": [], + "own_tokenizer": false }, { "name": "Kabalai", "iso_1_code": null, "iso_3_code": "kvf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "186", + "scripts": [], + "own_tokenizer": false }, { "name": "Tobanga", "iso_1_code": null, "iso_3_code": "tng", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "187", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "184", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "179", + "scripts": [], + "own_tokenizer": false }, { "name": "A.3", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kera", "iso_1_code": null, "iso_3_code": "ker", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "189", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Kwang", "iso_1_code": null, "iso_3_code": "kvi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "190", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "188", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "166", + "scripts": [], + "own_tokenizer": false }, { "name": "B", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "B.1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bidiyo", "iso_1_code": null, "iso_3_code": "bid", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "194", + "scripts": [], + "own_tokenizer": false }, { "name": "Dangal\u00e9at", "iso_1_code": null, "iso_3_code": "daa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "195", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Jonkor Bourmataguil", "iso_1_code": null, "iso_3_code": "jeu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "196", + "scripts": [], + "own_tokenizer": false }, { "name": "Mawa", "iso_1_code": null, "iso_3_code": "mcw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "197", + "scripts": [], + "own_tokenizer": false }, { "name": "Migaama", "iso_1_code": null, "iso_3_code": "mmy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "198", + "scripts": [], + "own_tokenizer": false }, { "name": "Mogum", "iso_1_code": null, "iso_3_code": "mou", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "199", + "scripts": [], + "own_tokenizer": false }, { "name": "Mabire", "iso_1_code": null, "iso_3_code": "muj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "200", + "scripts": [], + "own_tokenizer": false }, { "name": "Ubi", "iso_1_code": null, "iso_3_code": "ubi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "201", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "193", + "scripts": [], + "own_tokenizer": false }, { "name": "2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Birgit", "iso_1_code": null, "iso_3_code": "btf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "203", + "scripts": [], + "own_tokenizer": false }, { "name": "Kajakse", "iso_1_code": null, "iso_3_code": "ckq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "204", + "scripts": [], + "own_tokenizer": false }, { "name": "Masmaje", "iso_1_code": null, "iso_3_code": "mes", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "205", + "scripts": [], + "own_tokenizer": false }, { "name": "Mubi", "iso_1_code": null, "iso_3_code": "mub", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "206", + "scripts": [], + "own_tokenizer": false }, { "name": "Toram", "iso_1_code": null, "iso_3_code": "trj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "207", + "scripts": [], + "own_tokenizer": false }, { "name": "Zerenkel", "iso_1_code": null, "iso_3_code": "zrn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "208", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "202", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "192", + "scripts": [], + "own_tokenizer": false }, { "name": "B.2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Mukulu", "iso_1_code": null, "iso_3_code": "moz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "210", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "209", + "scripts": [], + "own_tokenizer": false }, { "name": "B.3", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Barein", "iso_1_code": null, "iso_3_code": "bva", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "212", + "scripts": [], + "own_tokenizer": false }, { "name": "Saba", "iso_1_code": null, "iso_3_code": "saa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "213", + "scripts": [], + "own_tokenizer": false }, { "name": "Sokoro", "iso_1_code": null, "iso_3_code": "sok", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "214", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamki", "iso_1_code": null, "iso_3_code": "tax", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "215", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "211", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "191", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "165", + "scripts": [], + "own_tokenizer": false }, { "name": "Masa", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Herd\u00e9", "iso_1_code": null, "iso_3_code": "hed", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "217", + "scripts": [], + "own_tokenizer": false }, { "name": "P\u00e9v\u00e9", "iso_1_code": null, "iso_3_code": "lme", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "218", + "scripts": [], + "own_tokenizer": false }, { "name": "Masana", "iso_1_code": null, "iso_3_code": "mcn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "219", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Marba", "iso_1_code": null, "iso_3_code": "mpg", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "220", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Musey", "iso_1_code": null, "iso_3_code": "mse", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "221", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Ngete", "iso_1_code": null, "iso_3_code": "nnn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "222", + "scripts": [], + "own_tokenizer": false }, { "name": "Mesme", "iso_1_code": null, "iso_3_code": "zim", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "223", + "scripts": [], + "own_tokenizer": false }, { "name": "Zumaya", "iso_1_code": null, "iso_3_code": "zuy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "224", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "216", + "scripts": [], + "own_tokenizer": false }, { "name": "West", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "A", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "A.1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gwandara", "iso_1_code": null, "iso_3_code": "gwn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "228", + "scripts": [], + "own_tokenizer": false }, { "name": "Hausa", "iso_1_code": "ha", "iso_3_code": "hau", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "229", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "227", + "scripts": [], + "own_tokenizer": false }, { "name": "A.2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bole", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bole Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bole", "iso_1_code": null, "iso_3_code": "bol", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "233", + "scripts": [], + "own_tokenizer": false }, { "name": "Bure", "iso_1_code": null, "iso_3_code": "bvh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "234", + "scripts": [], + "own_tokenizer": false }, { "name": "Beele", "iso_1_code": null, "iso_3_code": "bxq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "235", + "scripts": [], + "own_tokenizer": false }, { "name": "Deno", "iso_1_code": null, "iso_3_code": "dbb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "236", + "scripts": [], + "own_tokenizer": false }, { "name": "Daza", "iso_1_code": null, "iso_3_code": "dzd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "237", + "scripts": [], + "own_tokenizer": false }, { "name": "Geruma", "iso_1_code": null, "iso_3_code": "gea", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "238", + "scripts": [], + "own_tokenizer": false }, { "name": "Gera", "iso_1_code": null, "iso_3_code": "gew", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "239", + "scripts": [], + "own_tokenizer": false }, { "name": "Galambi", "iso_1_code": null, "iso_3_code": "glo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "240", + "scripts": [], + "own_tokenizer": false }, { "name": "Giiwo", "iso_1_code": null, "iso_3_code": "kks", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "241", + "scripts": [], + "own_tokenizer": false }, { "name": "Kubi", "iso_1_code": null, "iso_3_code": "kof", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "242", + "scripts": [], + "own_tokenizer": false }, { "name": "Kholok", "iso_1_code": null, "iso_3_code": "ktc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "243", + "scripts": [], + "own_tokenizer": false }, { "name": "Maaka", "iso_1_code": null, "iso_3_code": "mew", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "244", + "scripts": [], + "own_tokenizer": false }, { "name": "Ngamo", "iso_1_code": null, "iso_3_code": "nbh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "245", + "scripts": [], + "own_tokenizer": false }, { "name": "Nyam", "iso_1_code": null, "iso_3_code": "nmi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "246", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "232", + "scripts": [], + "own_tokenizer": false }, { "name": "Karekare", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Karekare", "iso_1_code": null, "iso_3_code": "kai", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "248", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "247", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "231", + "scripts": [], + "own_tokenizer": false }, { "name": "Tangale", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dera", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dera", "iso_1_code": null, "iso_3_code": "kna", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "251", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "250", + "scripts": [], + "own_tokenizer": false }, { "name": "Tangale Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kutto", "iso_1_code": null, "iso_3_code": "kpa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "253", + "scripts": [], + "own_tokenizer": false }, { "name": "Kwaami", "iso_1_code": null, "iso_3_code": "ksq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "254", + "scripts": [], + "own_tokenizer": false }, { "name": "Kushi", "iso_1_code": null, "iso_3_code": "kuh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "255", + "scripts": [], + "own_tokenizer": false }, { "name": "Pero", "iso_1_code": null, "iso_3_code": "pip", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "256", + "scripts": [], + "own_tokenizer": false }, { "name": "Piya-Kwonci", "iso_1_code": null, "iso_3_code": "piy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "257", + "scripts": [], + "own_tokenizer": false }, { "name": "Tangale", "iso_1_code": null, "iso_3_code": "tan", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "258", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "252", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "249", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "230", + "scripts": [], + "own_tokenizer": false }, { "name": "A.3", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Angas Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Jakattoe", "iso_1_code": null, "iso_3_code": "jrt", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "261", + "scripts": [], + "own_tokenizer": false }, { "name": "1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ngas", "iso_1_code": null, "iso_3_code": "anc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "263", + "scripts": [], + "own_tokenizer": false }, { "name": "Cakfem-Mushere", "iso_1_code": null, "iso_3_code": "cky", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "264", + "scripts": [], + "own_tokenizer": false }, { "name": "Belning", "iso_1_code": null, "iso_3_code": "glb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "265", + "scripts": [], + "own_tokenizer": false }, { "name": "Kofyar", "iso_1_code": null, "iso_3_code": "kwl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "266", + "scripts": [], + "own_tokenizer": false }, { "name": "Miship", "iso_1_code": null, "iso_3_code": "mjs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "267", + "scripts": [], + "own_tokenizer": false }, { "name": "Nteng", "iso_1_code": null, "iso_3_code": "nqt", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "268", + "scripts": [], + "own_tokenizer": false }, { "name": "Mwaghavul", "iso_1_code": null, "iso_3_code": "sur", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "269", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "262", + "scripts": [], + "own_tokenizer": false }, { "name": "2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Goemai", "iso_1_code": null, "iso_3_code": "ank", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "271", + "scripts": [], + "own_tokenizer": false }, { "name": "Koenoem", "iso_1_code": null, "iso_3_code": "kcs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "272", + "scripts": [], + "own_tokenizer": false }, { "name": "Tehl", "iso_1_code": null, "iso_3_code": "mtl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "273", + "scripts": [], + "own_tokenizer": false }, { "name": "Piapung", "iso_1_code": null, "iso_3_code": "pcw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "274", + "scripts": [], + "own_tokenizer": false }, { "name": "Tal", "iso_1_code": null, "iso_3_code": "tal", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "275", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "270", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "260", + "scripts": [], + "own_tokenizer": false }, { "name": "Yiwom", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ywom", "iso_1_code": null, "iso_3_code": "gek", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "277", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "276", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "259", + "scripts": [], + "own_tokenizer": false }, { "name": "A.4", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Fyer", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Fyer", "iso_1_code": null, "iso_3_code": "fie", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "280", + "scripts": [], + "own_tokenizer": false }, { "name": "Rom", "iso_1_code": null, "iso_3_code": "tdk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "281", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "279", + "scripts": [], + "own_tokenizer": false }, { "name": "Ron Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ron", "iso_1_code": null, "iso_3_code": "cla", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "283", + "scripts": [], + "own_tokenizer": false }, { "name": "Duhwa", "iso_1_code": null, "iso_3_code": "kbz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "284", + "scripts": [], + "own_tokenizer": false }, { "name": "Kulere", "iso_1_code": null, "iso_3_code": "kul", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "285", + "scripts": [], + "own_tokenizer": false }, { "name": "Mindat", "iso_1_code": null, "iso_3_code": "mmf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "286", + "scripts": [], + "own_tokenizer": false }, { "name": "Sya", "iso_1_code": null, "iso_3_code": "scw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "287", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "282", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "278", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "226", + "scripts": [], + "own_tokenizer": false }, { "name": "B", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "B.1", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Teshenawa", "iso_1_code": null, "iso_3_code": "twc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "290", + "scripts": [], + "own_tokenizer": false }, { "name": "Bade Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bade", "iso_1_code": null, "iso_3_code": "bde", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "292", + "scripts": [], + "own_tokenizer": false }, { "name": "Ngizim", "iso_1_code": null, "iso_3_code": "ngi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "293", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "291", + "scripts": [], + "own_tokenizer": false }, { "name": "Duwai", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Duwai", "iso_1_code": null, "iso_3_code": "dbp", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "295", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "294", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "289", + "scripts": [], + "own_tokenizer": false }, { "name": "B.2", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ajawa", "iso_1_code": null, "iso_3_code": "ajw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "297", + "scripts": [], + "own_tokenizer": false }, { "name": "Burku", "iso_1_code": null, "iso_3_code": "bbt", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "298", + "scripts": [], + "own_tokenizer": false }, { "name": "Dirya", "iso_1_code": null, "iso_3_code": "dwa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "299", + "scripts": [], + "own_tokenizer": false }, { "name": "Zibinju", "iso_1_code": null, "iso_3_code": "jmb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "300", + "scripts": [], + "own_tokenizer": false }, { "name": "Kariya", "iso_1_code": null, "iso_3_code": "kil", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "301", + "scripts": [], + "own_tokenizer": false }, { "name": "Vune mi", "iso_1_code": null, "iso_3_code": "mkf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "302", + "scripts": [], + "own_tokenizer": false }, { "name": "Pa\u2019anci", "iso_1_code": null, "iso_3_code": "pqa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "303", + "scripts": [], + "own_tokenizer": false }, { "name": "Siri", "iso_1_code": null, "iso_3_code": "sir", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "304", + "scripts": [], + "own_tokenizer": false }, { "name": "Choogen", "iso_1_code": null, "iso_3_code": "tgd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "305", + "scripts": [], + "own_tokenizer": false }, { "name": "Warji", "iso_1_code": null, "iso_3_code": "wji", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "306", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "296", + "scripts": [], + "own_tokenizer": false }, { "name": "B.3", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dass", "iso_1_code": null, "iso_3_code": "dot", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "308", + "scripts": [], + "own_tokenizer": false }, { "name": "Boghom", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Boghom", "iso_1_code": null, "iso_3_code": "bux", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "310", + "scripts": [], + "own_tokenizer": false }, { "name": "Kir-Balar", "iso_1_code": null, "iso_3_code": "kkr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "311", + "scripts": [], + "own_tokenizer": false }, { "name": "Mansi", "iso_1_code": null, "iso_3_code": "zns", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "312", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "309", + "scripts": [], + "own_tokenizer": false }, { "name": "Eastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Jimi", "iso_1_code": null, "iso_3_code": "jmi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "314", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "313", + "scripts": [], + "own_tokenizer": false }, { "name": "Guruntum", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Guruntum-Mbaaru", "iso_1_code": null, "iso_3_code": "grd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "316", + "scripts": [], + "own_tokenizer": false }, { "name": "Juu", "iso_1_code": null, "iso_3_code": "juu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "317", + "scripts": [], + "own_tokenizer": false }, { "name": "Tala", "iso_1_code": null, "iso_3_code": "tak", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "318", + "scripts": [], + "own_tokenizer": false }, { "name": "Zamwal", "iso_1_code": null, "iso_3_code": "zah", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "319", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "315", + "scripts": [], + "own_tokenizer": false }, { "name": "Zaar Proper", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Cha\u2019ari", "iso_1_code": null, "iso_3_code": "cxh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "321", + "scripts": [], + "own_tokenizer": false }, { "name": "Dokshi", "iso_1_code": null, "iso_3_code": "dsk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "322", + "scripts": [], + "own_tokenizer": false }, { "name": "Dyarim", "iso_1_code": null, "iso_3_code": "dyr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "323", + "scripts": [], + "own_tokenizer": false }, { "name": "Gyaazi", "iso_1_code": null, "iso_3_code": "gyz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "324", + "scripts": [], + "own_tokenizer": false }, { "name": "Luri", "iso_1_code": null, "iso_3_code": "ldd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "325", + "scripts": [], + "own_tokenizer": false }, { "name": "Dir-Nyamzak-Mbarimi", "iso_1_code": null, "iso_3_code": "nzr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "326", + "scripts": [], + "own_tokenizer": false }, { "name": "Pesse", "iso_1_code": null, "iso_3_code": "pze", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "327", + "scripts": [], + "own_tokenizer": false }, { "name": "Saya", "iso_1_code": null, "iso_3_code": "say", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "328", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Tulai", "iso_1_code": null, "iso_3_code": "tvi", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "329", + "scripts": [], + "own_tokenizer": false }, { "name": "Buli", "iso_1_code": null, "iso_3_code": "uly", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "330", + "scripts": [], + "own_tokenizer": false }, { "name": "Zari", "iso_1_code": null, "iso_3_code": "zaz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "331", + "scripts": [], + "own_tokenizer": false }, { "name": "Bu", "iso_1_code": null, "iso_3_code": "zbu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "332", + "scripts": [], + "own_tokenizer": false }, { "name": "Zeem", "iso_1_code": null, "iso_3_code": "zem", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "333", + "scripts": [], + "own_tokenizer": false }, { "name": "Zul", "iso_1_code": null, "iso_3_code": "zlu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "334", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "320", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "307", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "288", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "225", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "56", + "scripts": [], + "own_tokenizer": false }, { "name": "Cushitic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Eastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Xamtanga", "iso_1_code": null, "iso_3_code": "xan", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "338", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "337", + "scripts": [], + "own_tokenizer": false }, { "name": "Northern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bilen", "iso_1_code": null, "iso_3_code": "byn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "340", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "339", + "scripts": [], + "own_tokenizer": false }, { "name": "Southern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Awngi", "iso_1_code": null, "iso_3_code": "awn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "342", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "341", + "scripts": [], + "own_tokenizer": false }, { "name": "Western", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Qimant", "iso_1_code": null, "iso_3_code": "ahg", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "344", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "343", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "336", + "scripts": [], + "own_tokenizer": false }, { "name": "East", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Boon", "iso_1_code": null, "iso_3_code": "bnl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "346", + "scripts": [], + "own_tokenizer": false }, { "name": "Dullay", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ale", "iso_1_code": null, "iso_3_code": "gwd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "348", + "scripts": [], + "own_tokenizer": false }, { "name": "Tsamai", "iso_1_code": null, "iso_3_code": "tsb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "349", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "347", + "scripts": [], + "own_tokenizer": false }, { "name": "Highland", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Alaba-K\u2019abeena", "iso_1_code": null, "iso_3_code": "alw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "351", + "scripts": [], + "own_tokenizer": false }, { "name": "Burji", "iso_1_code": null, "iso_3_code": "bji", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "352", + "scripts": [], + "own_tokenizer": false }, { "name": "Gedeo", "iso_1_code": null, "iso_3_code": "drs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "353", + "scripts": [], + "own_tokenizer": false }, { "name": "Hadiyya", "iso_1_code": null, "iso_3_code": "hdy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "354", + "scripts": [], + "own_tokenizer": false }, { "name": "Kambaata", "iso_1_code": null, "iso_3_code": "ktb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "355", + "scripts": [ + "Ethi" + ], + "own_tokenizer": false }, { "name": "Libido", "iso_1_code": null, "iso_3_code": "liq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "356", + "scripts": [], + "own_tokenizer": false }, { "name": "Sidamo", "iso_1_code": null, "iso_3_code": "sid", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "357", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "350", + "scripts": [], + "own_tokenizer": false }, { "name": "Konso-Gidole", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Mositacha", "iso_1_code": null, "iso_3_code": "dox", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "359", + "scripts": [], + "own_tokenizer": false }, { "name": "Dirasha", "iso_1_code": null, "iso_3_code": "gdl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "360", + "scripts": [], + "own_tokenizer": false }, { "name": "Konso", "iso_1_code": null, "iso_3_code": "kxc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "361", + "scripts": [ + "Ethi" + ], + "own_tokenizer": false } - ] + ], + "node_i": "358", + "scripts": [], + "own_tokenizer": false }, { "name": "Oromo", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Oromo, Borana-Arsi-Guji", "iso_1_code": "om", "iso_3_code": "gax", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "363", + "scripts": [], + "own_tokenizer": false }, { "name": "Oromo, West Central", "iso_1_code": "om", "iso_3_code": "gaz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "364", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Oromo, Eastern", "iso_1_code": "om", "iso_3_code": "hae", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "365", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Orma", "iso_1_code": "om", "iso_3_code": "orc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "366", + "scripts": [], + "own_tokenizer": false }, { "name": "Waata", "iso_1_code": null, "iso_3_code": "ssn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "367", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "362", + "scripts": [], + "own_tokenizer": false }, { "name": "Rendille-Boni", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Aweer", "iso_1_code": null, "iso_3_code": "bob", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "369", + "scripts": [], + "own_tokenizer": false }, { "name": "Rendille", "iso_1_code": null, "iso_3_code": "rel", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "370", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "368", + "scripts": [], + "own_tokenizer": false }, { "name": "Saho-Afar", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Afar", "iso_1_code": "aa", "iso_3_code": "aar", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "372", + "scripts": [], + "own_tokenizer": false }, { "name": "Saho", "iso_1_code": null, "iso_3_code": "ssy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "373", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "371", + "scripts": [], + "own_tokenizer": false }, { "name": "Somali", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dabarre", "iso_1_code": null, "iso_3_code": "dbr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "375", + "scripts": [], + "own_tokenizer": false }, { "name": "Garre", "iso_1_code": null, "iso_3_code": "gex", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "376", + "scripts": [], + "own_tokenizer": false }, { "name": "Girirra", "iso_1_code": null, "iso_3_code": "gii", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "377", + "scripts": [], + "own_tokenizer": false }, { "name": "Jiiddu", "iso_1_code": null, "iso_3_code": "jii", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "378", + "scripts": [], + "own_tokenizer": false }, { "name": "Somali", "iso_1_code": "so", "iso_3_code": "som", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "379", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Tunni", "iso_1_code": null, "iso_3_code": "tqq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "380", + "scripts": [], + "own_tokenizer": false }, { "name": "Maay", "iso_1_code": null, "iso_3_code": "ymm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "381", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "374", + "scripts": [], + "own_tokenizer": false }, { "name": "Western Omo-Tana", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Arbore", "iso_1_code": null, "iso_3_code": "arv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "383", + "scripts": [], + "own_tokenizer": false }, { "name": "Baiso", "iso_1_code": null, "iso_3_code": "bsw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "384", + "scripts": [], + "own_tokenizer": false }, { "name": "Daasanach", "iso_1_code": null, "iso_3_code": "dsh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "385", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "El Molo", "iso_1_code": null, "iso_3_code": "elo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "386", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "382", + "scripts": [], + "own_tokenizer": false }, { "name": "Yaaku", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Yaaku", "iso_1_code": null, "iso_3_code": "muu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "388", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "387", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "345", + "scripts": [], + "own_tokenizer": false }, { "name": "North", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bedawiyet", "iso_1_code": null, "iso_3_code": "bej", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "390", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "389", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Aas\u00e1x", "iso_1_code": null, "iso_3_code": "aas", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "392", + "scripts": [], + "own_tokenizer": false }, { "name": "Burunge", "iso_1_code": null, "iso_3_code": "bds", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "393", + "scripts": [], + "own_tokenizer": false }, { "name": "Dahalo", "iso_1_code": null, "iso_3_code": "dal", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "394", + "scripts": [], + "own_tokenizer": false }, { "name": "Gorowa", "iso_1_code": null, "iso_3_code": "gow", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "395", + "scripts": [], + "own_tokenizer": false }, { "name": "Iraqw", "iso_1_code": null, "iso_3_code": "irk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "396", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Alagwa", "iso_1_code": null, "iso_3_code": "wbj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "397", + "scripts": [], + "own_tokenizer": false }, { "name": "Kw\u2019adza", "iso_1_code": null, "iso_3_code": "wka", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "398", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "391", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "335", + "scripts": [], + "own_tokenizer": false }, { "name": "Egyptian", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "coptic", - "tokenizer": "StanzaTokenizer(\"cop\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Coptic", "iso_1_code": null, "iso_3_code": "cop", - "tokenizer": { - "name": "coptic", - "tokenizer": "StanzaTokenizer(\"cop\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "400", + "scripts": [ + "Copt" + ], + "own_tokenizer": true } - ] + ], + "node_i": "399", + "scripts": [], + "own_tokenizer": false }, { "name": "Omotic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "North", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dizoid", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dizin", "iso_1_code": null, "iso_3_code": "mdx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "404", + "scripts": [], + "own_tokenizer": false }, { "name": "Nayi", "iso_1_code": null, "iso_3_code": "noz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "405", + "scripts": [], + "own_tokenizer": false }, { "name": "Sheko", "iso_1_code": null, "iso_3_code": "she", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "406", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "403", + "scripts": [], + "own_tokenizer": false }, { "name": "Gonga-Gimojan", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gimojan", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Janjero", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Yemsa", "iso_1_code": null, "iso_3_code": "jnj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "410", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "409", + "scripts": [], + "own_tokenizer": false }, { "name": "Ometo-Gimira", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Chara", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Tsaara", "iso_1_code": null, "iso_3_code": "cra", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "413", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "412", + "scripts": [], + "own_tokenizer": false }, { "name": "Gimira", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bench", "iso_1_code": null, "iso_3_code": "bcq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "415", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "414", + "scripts": [], + "own_tokenizer": false }, { "name": "Ometo", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Male", "iso_1_code": null, "iso_3_code": "mdy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "417", + "scripts": [ + "Ethi" + ], + "own_tokenizer": false }, { "name": "Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Dorze", "iso_1_code": null, "iso_3_code": "doz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "419", + "scripts": [], + "own_tokenizer": false }, { "name": "Dawro", "iso_1_code": null, "iso_3_code": "dwr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "420", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Gamo", "iso_1_code": null, "iso_3_code": "gmv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "421", + "scripts": [ + "Latn", + "Ethi" + ], + "own_tokenizer": false }, { "name": "Gofa", "iso_1_code": null, "iso_3_code": "gof", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "422", + "scripts": [ + "Latn", + "Ethi" + ], + "own_tokenizer": false }, { "name": "Melo", "iso_1_code": null, "iso_3_code": "mfx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "423", + "scripts": [], + "own_tokenizer": false }, { "name": "Oyda", "iso_1_code": null, "iso_3_code": "oyd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "424", + "scripts": [], + "own_tokenizer": false }, { "name": "Wolaytta", "iso_1_code": null, "iso_3_code": "wal", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "425", + "scripts": [ + "Latn", + "Ethi" + ], + "own_tokenizer": false } - ] + ], + "node_i": "418", + "scripts": [], + "own_tokenizer": false }, { "name": "East", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kachama-Ganjule", "iso_1_code": null, "iso_3_code": "kcx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "427", + "scripts": [], + "own_tokenizer": false }, { "name": "Koorete", "iso_1_code": null, "iso_3_code": "kqy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "428", + "scripts": [ + "Ethi" + ], + "own_tokenizer": false }, { "name": "Zayse", "iso_1_code": null, "iso_3_code": "zay", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "429", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "426", + "scripts": [], + "own_tokenizer": false }, { "name": "West", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Basketo", "iso_1_code": null, "iso_3_code": "bst", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "431", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "430", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "416", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "411", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "408", + "scripts": [], + "own_tokenizer": false }, { "name": "Gonga", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Anfillo", "iso_1_code": null, "iso_3_code": "myo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "434", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "433", + "scripts": [], + "own_tokenizer": false }, { "name": "North", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Borna", "iso_1_code": null, "iso_3_code": "bwo", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "436", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "435", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kafa", "iso_1_code": null, "iso_3_code": "kbr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "438", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Shekkacho", "iso_1_code": null, "iso_3_code": "moy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "439", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "437", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "432", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "407", + "scripts": [], + "own_tokenizer": false }, { "name": "Mao", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ganza", "iso_1_code": null, "iso_3_code": "gza", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "441", + "scripts": [], + "own_tokenizer": false }, { "name": "Hozo", "iso_1_code": null, "iso_3_code": "hoz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "442", + "scripts": [], + "own_tokenizer": false }, { "name": "Mawes Aasse", "iso_1_code": null, "iso_3_code": "myf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "443", + "scripts": [], + "own_tokenizer": false }, { "name": "Seze", "iso_1_code": null, "iso_3_code": "sze", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "444", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "440", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "402", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Aari", "iso_1_code": null, "iso_3_code": "aiw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "446", + "scripts": [], + "own_tokenizer": false }, { "name": "Hamer-Banna", "iso_1_code": null, "iso_3_code": "amf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "447", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Dime", "iso_1_code": null, "iso_3_code": "dim", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "448", + "scripts": [], + "own_tokenizer": false }, { "name": "Gayil", "iso_1_code": null, "iso_3_code": "gyl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "449", + "scripts": [], + "own_tokenizer": false }, { "name": "Karo", "iso_1_code": null, "iso_3_code": "kxh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "450", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "445", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "401", + "scripts": [], + "own_tokenizer": false }, { "name": "Semitic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Faifi", "iso_1_code": null, "iso_3_code": "fif", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "453", + "scripts": [], + "own_tokenizer": false }, { "name": "R\u0101zi\u1e25\u012b", "iso_1_code": null, "iso_3_code": "rzh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "454", + "scripts": [], + "own_tokenizer": false }, { "name": "Aramaic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Eastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Syriac", "iso_1_code": null, "iso_3_code": "syc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "457", + "scripts": [ + "Syrc" + ], + "own_tokenizer": false }, { "name": "Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Northeastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Assyrian Neo-Aramaic", "iso_1_code": null, "iso_3_code": "aii", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "460", + "scripts": [ + "Syrc" + ], + "own_tokenizer": false }, { "name": "Bohtan Neo-Aramaic", "iso_1_code": null, "iso_3_code": "bhn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "461", + "scripts": [], + "own_tokenizer": false }, { "name": "Barzani-Sandu Jewish Neo-Aramaic", "iso_1_code": null, "iso_3_code": "bjf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "462", + "scripts": [], + "own_tokenizer": false }, { "name": "Chaldean Neo-Aramaic", "iso_1_code": null, "iso_3_code": "cld", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "463", + "scripts": [], + "own_tokenizer": false }, { "name": "H\u00e9rtevin", "iso_1_code": null, "iso_3_code": "hrt", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "464", + "scripts": [], + "own_tokenizer": false }, { "name": "Koy Sanjaq Surat", "iso_1_code": null, "iso_3_code": "kqd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "465", + "scripts": [], + "own_tokenizer": false }, { "name": "Senaya", "iso_1_code": null, "iso_3_code": "syn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "466", + "scripts": [], + "own_tokenizer": false }, { "name": "Jewish Babylonian Aramaic", "iso_1_code": null, "iso_3_code": "tmr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "467", + "scripts": [], + "own_tokenizer": false }, { "name": "Trans-Zab", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Inter-Zab Jewish Neo-Aramaic", "iso_1_code": null, "iso_3_code": "aij", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "469", + "scripts": [], + "own_tokenizer": false }, { "name": "Hulaul\u00e1", "iso_1_code": null, "iso_3_code": "huy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "470", + "scripts": [], + "own_tokenizer": false }, { "name": "Lishana Deni", "iso_1_code": null, "iso_3_code": "lsd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "471", + "scripts": [], + "own_tokenizer": false }, { "name": "Lish\u00e1n Noshan", "iso_1_code": null, "iso_3_code": "trg", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "472", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "468", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "459", + "scripts": [], + "own_tokenizer": false }, { "name": "Northwestern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Mlahs\u00f6", "iso_1_code": null, "iso_3_code": "lhs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "474", + "scripts": [], + "own_tokenizer": false }, { "name": "Turoyo", "iso_1_code": null, "iso_3_code": "tru", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "475", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "473", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "458", + "scripts": [], + "own_tokenizer": false }, { "name": "Mandaic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Neo-Mandaic", "iso_1_code": null, "iso_3_code": "mid", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "477", + "scripts": [], + "own_tokenizer": false }, { "name": "Mandaic, Classical", "iso_1_code": null, "iso_3_code": "myz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "478", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "476", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "456", + "scripts": [], + "own_tokenizer": false }, { "name": "Western", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Western Neo-Aramaic", "iso_1_code": null, "iso_3_code": "amw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "480", + "scripts": [], + "own_tokenizer": false }, { "name": "Samaritan Aramaic", "iso_1_code": null, "iso_3_code": "sam", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "481", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "479", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "455", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Arabic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Arabic, Algerian Saharan", "iso_1_code": "ar", "iso_3_code": "aao", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "484", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Tajiki", "iso_1_code": "ar", "iso_3_code": "abh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "485", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Baharna", "iso_1_code": "ar", "iso_3_code": "abv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "486", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Mesopotamian", "iso_1_code": "ar", "iso_3_code": "acm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "487", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Ta\u2019izzi-Adeni", "iso_1_code": "ar", "iso_3_code": "acq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "488", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Hijazi", "iso_1_code": "ar", "iso_3_code": "acw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "489", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Omani", "iso_1_code": "ar", "iso_3_code": "acx", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "490", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Cypriot", "iso_1_code": "ar", "iso_3_code": "acy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "491", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Dhofari", "iso_1_code": "ar", "iso_3_code": "adf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "492", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Tunisian", "iso_1_code": "ar", "iso_3_code": "aeb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "493", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Sa\u2019idi", "iso_1_code": "ar", "iso_3_code": "aec", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "494", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Gulf", "iso_1_code": "ar", "iso_3_code": "afb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "495", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Judeo-Moroccan", "iso_1_code": null, "iso_3_code": "aju", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "496", + "scripts": [], + "own_tokenizer": false }, { "name": "Arabic, Levantine", "iso_1_code": "ar", "iso_3_code": "apc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "497", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Sudanese", "iso_1_code": "ar", "iso_3_code": "apd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "498", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Standard", "iso_1_code": "ar", "iso_3_code": "arb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "499", + "scripts": [ + "Arab", + "Latn" + ], + "own_tokenizer": true }, { "name": "Arabic, Algerian", "iso_1_code": "ar", "iso_3_code": "arq", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "500", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Najdi", "iso_1_code": "ar", "iso_3_code": "ars", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "501", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Moroccan", "iso_1_code": "ar", "iso_3_code": "ary", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "502", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Egyptian", "iso_1_code": "ar", "iso_3_code": "arz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "503", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Uzbeki", "iso_1_code": "ar", "iso_3_code": "auz", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "504", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Eastern Egyptian Bedawi", "iso_1_code": "ar", "iso_3_code": "avl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "505", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Hadrami", "iso_1_code": "ar", "iso_3_code": "ayh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "506", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Libyan", "iso_1_code": "ar", "iso_3_code": "ayl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "507", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Sanaani", "iso_1_code": "ar", "iso_3_code": "ayn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "508", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, North Mesopotamian", "iso_1_code": "ar", "iso_3_code": "ayp", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "509", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Judeo-Yemeni", "iso_1_code": null, "iso_3_code": "jye", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "510", + "scripts": [], + "own_tokenizer": false }, { "name": "Hassaniyya", "iso_1_code": null, "iso_3_code": "mey", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "511", + "scripts": [], + "own_tokenizer": false }, { "name": "Maltese", "iso_1_code": "mt", "iso_3_code": "mlt", - "tokenizer": { - "name": "maltese", - "tokenizer": "StanzaTokenizer(\"mt\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "512", + "scripts": [ + "Latn" + ], + "own_tokenizer": true }, { "name": "Arabic, Chadian", "iso_1_code": "ar", "iso_3_code": "shu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "513", + "scripts": [ + "Arab" + ], + "own_tokenizer": true }, { "name": "Arabic, Shihhi", "iso_1_code": "ar", "iso_3_code": "ssh", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "514", + "scripts": [], + "own_tokenizer": true }, { "name": "Arabic, Judeo-Iraqi", "iso_1_code": null, "iso_3_code": "yhd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "515", + "scripts": [], + "own_tokenizer": false }, { "name": "Arabic, Judeo-Tripolitanian", "iso_1_code": null, "iso_3_code": "yud", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "516", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "483", + "scripts": [], + "own_tokenizer": false }, { "name": "Canaanite", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "hebrew", - "tokenizer": "SpaCyTokenizer(\"he\")" + "tokenizers": { + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Hebrew, Ancient", "iso_1_code": null, "iso_3_code": "hbo", - "tokenizer": { - "name": "ancient_hebrew", - "tokenizer": "StanzaTokenizer(\"hbo\")" + "tokenizers": { + "Hebr": { + "full_object": "StanzaTokenizer(\"hbo\")", + "original_lang_name": "ancient_hebrew", + "original_lang_code": "hbo", + "scripts": [ + "Hebr" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "518", + "scripts": [ + "Hebr" + ], + "own_tokenizer": true }, { "name": "Hebrew", "iso_1_code": "he", "iso_3_code": "heb", - "tokenizer": { - "name": "hebrew", - "tokenizer": "SpaCyTokenizer(\"he\")" + "tokenizers": { + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "519", + "scripts": [ + "Hebr" + ], + "own_tokenizer": true }, { "name": "Samaritan Hebrew", "iso_1_code": null, "iso_3_code": "smp", - "tokenizer": { - "name": "hebrew", - "tokenizer": "SpaCyTokenizer(\"he\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "520", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "517", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "482", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "452", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Ethiopian", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "North", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Dahalik", "iso_1_code": null, "iso_3_code": "dlk", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "524", + "scripts": [], + "own_tokenizer": false }, { "name": "Geez", "iso_1_code": null, "iso_3_code": "gez", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "525", + "scripts": [], + "own_tokenizer": false }, { "name": "Tigr\u00e9", "iso_1_code": null, "iso_3_code": "tig", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "526", + "scripts": [ + "Ethi" + ], + "own_tokenizer": false }, { "name": "Tigrigna", "iso_1_code": "ti", "iso_3_code": "tir", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "527", + "scripts": [ + "Ethi" + ], + "own_tokenizer": true } - ] + ], + "node_i": "523", + "scripts": [], + "own_tokenizer": false }, { "name": "South", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Outer", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "n-Group", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gafat", "iso_1_code": null, "iso_3_code": "gft", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "531", + "scripts": [], + "own_tokenizer": false }, { "name": "Kistane", "iso_1_code": null, "iso_3_code": "gru", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "532", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "530", + "scripts": [], + "own_tokenizer": false }, { "name": "tt-Group", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Inor", "iso_1_code": null, "iso_3_code": "ior", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "534", + "scripts": [], + "own_tokenizer": false }, { "name": "Mesqan", "iso_1_code": null, "iso_3_code": "mvz", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "535", + "scripts": [], + "own_tokenizer": false }, { "name": "Mesmes", "iso_1_code": null, "iso_3_code": "mys", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "536", + "scripts": [], + "own_tokenizer": false }, { "name": "Sebat Bet Gurage", "iso_1_code": null, "iso_3_code": "sgw", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "537", + "scripts": [ + "Ethi" + ], + "own_tokenizer": false } - ] + ], + "node_i": "533", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "529", + "scripts": [], + "own_tokenizer": false }, { "name": "Transversal", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Amharic-Argobba", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Argobba", "iso_1_code": null, "iso_3_code": "agj", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "540", + "scripts": [], + "own_tokenizer": false }, { "name": "Amharic", "iso_1_code": "am", "iso_3_code": "amh", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "541", + "scripts": [ + "Ethi" + ], + "own_tokenizer": true } - ] + ], + "node_i": "539", + "scripts": [], + "own_tokenizer": false }, { "name": "Harari-East Gurage", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"am\")", + "original_lang_name": "amharic", + "original_lang_code": "amh", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Harari", "iso_1_code": null, "iso_3_code": "har", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "543", + "scripts": [], + "own_tokenizer": false }, { "name": "Silt\u2019e", "iso_1_code": null, "iso_3_code": "stv", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "544", + "scripts": [], + "own_tokenizer": false }, { "name": "Wolane", "iso_1_code": null, "iso_3_code": "wle", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "545", + "scripts": [], + "own_tokenizer": false }, { "name": "Zay", "iso_1_code": null, "iso_3_code": "zwa", - "tokenizer": { - "name": "amharic", - "tokenizer": "SpaCyTokenizer(\"am\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "546", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "542", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "538", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "528", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "522", + "scripts": [], + "own_tokenizer": false }, { "name": "South Arabian", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" + "tokenizers": { + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bathari", "iso_1_code": null, "iso_3_code": "bhm", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "548", + "scripts": [], + "own_tokenizer": false }, { "name": "Mehri", "iso_1_code": null, "iso_3_code": "gdq", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "549", + "scripts": [], + "own_tokenizer": false }, { "name": "Hoby\u00f3t", "iso_1_code": null, "iso_3_code": "hoh", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "550", + "scripts": [], + "own_tokenizer": false }, { "name": "Harsusi", "iso_1_code": null, "iso_3_code": "hss", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "551", + "scripts": [], + "own_tokenizer": false }, { "name": "Shehri", "iso_1_code": null, "iso_3_code": "shv", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "552", + "scripts": [], + "own_tokenizer": false }, { "name": "Soqotri", "iso_1_code": null, "iso_3_code": "sqt", - "tokenizer": { - "name": "tigrinya", - "tokenizer": "SpaCyTokenizer(\"ti\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "553", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "547", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "521", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "451", + "scripts": [], + "own_tokenizer": false }, { "name": "Unclassified", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Copt": { + "full_object": "StanzaTokenizer(\"cop\")", + "original_lang_name": "coptic", + "original_lang_code": "cop", + "scripts": [ + "Copt" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "StanzaTokenizer(\"mt\")", + "original_lang_name": "maltese", + "original_lang_code": "mlt", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Hebr": { + "full_object": "SpaCyTokenizer(\"he\")", + "original_lang_name": "hebrew", + "original_lang_code": "heb", + "scripts": [ + "Hebr" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Ethi": { + "full_object": "SpaCyTokenizer(\"ti\")", + "original_lang_name": "tigrinya", + "original_lang_code": "tir", + "scripts": [ + "Ethi" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Ongota", "iso_1_code": null, "iso_3_code": "bxe", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "555", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "554", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "9", + "scripts": [], + "own_tokenizer": false } \ No newline at end of file