diff --git "a/data/Creole.json" "b/data/Creole.json" --- "a/data/Creole.json" +++ "b/data/Creole.json" @@ -2,1346 +2,2822 @@ "name": "Creole", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "bottom", "children": [ { "name": "Afrikaans based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Flaaitaal", "iso_1_code": null, "iso_3_code": "fly", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3483", + "scripts": [], + "own_tokenizer": false }, { "name": "Oorlams", "iso_1_code": null, "iso_3_code": "oor", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3484", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3482", + "scripts": [], + "own_tokenizer": false }, { "name": "Arabic based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "bottom", "children": [ { "name": "Nubi", "iso_1_code": null, "iso_3_code": "kcn", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3486", + "scripts": [], + "own_tokenizer": false }, { "name": "Arabic, Juba", "iso_1_code": "ar", "iso_3_code": "pga", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ar\")", + "original_lang_name": "arabic", + "original_lang_code": "ara", + "scripts": [ + "Arab" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "3487", + "scripts": [], + "own_tokenizer": true } - ] + ], + "node_i": "3485", + "scripts": [], + "own_tokenizer": false }, { "name": "Assamese based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Nagamese", "iso_1_code": null, "iso_3_code": "nag", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3489", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3488", + "scripts": [], + "own_tokenizer": false }, { "name": "Dutch based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Berbice Dutch Creole", "iso_1_code": null, "iso_3_code": "brc", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3491", + "scripts": [], + "own_tokenizer": false }, { "name": "Negerhollands", "iso_1_code": null, "iso_3_code": "dcr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3492", + "scripts": [], + "own_tokenizer": false }, { "name": "Javindo", "iso_1_code": null, "iso_3_code": "jvd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3493", + "scripts": [], + "own_tokenizer": false }, { "name": "Petjo", "iso_1_code": null, "iso_3_code": "pey", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3494", + "scripts": [], + "own_tokenizer": false }, { "name": "Skepi Dutch Creole", "iso_1_code": null, "iso_3_code": "skw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3495", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3490", + "scripts": [], + "own_tokenizer": false }, { "name": "English based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "bottom", "children": [ { "name": "Saramaccan", "iso_1_code": null, "iso_3_code": "srm", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3497", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Atlantic", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "bottom", "children": [ { "name": "Eastern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Turks and Caicos English Creole", "iso_1_code": null, "iso_3_code": "tch", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3500", + "scripts": [], + "own_tokenizer": false }, { "name": "Northern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Afro-Seminole Creole", "iso_1_code": null, "iso_3_code": "afs", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3502", + "scripts": [], + "own_tokenizer": false }, { "name": "Bahamas English Creole", "iso_1_code": null, "iso_3_code": "bah", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3503", + "scripts": [], + "own_tokenizer": false }, { "name": "Sea Island English Creole", "iso_1_code": null, "iso_3_code": "gul", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3504", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3501", + "scripts": [], + "own_tokenizer": false }, { "name": "Southern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Leeward Caribbean English Creole", "iso_1_code": null, "iso_3_code": "aig", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3506", + "scripts": [], + "own_tokenizer": false }, { "name": "Bajan", "iso_1_code": null, "iso_3_code": "bjs", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3507", + "scripts": [], + "own_tokenizer": false }, { "name": "Grenadian English Creole", "iso_1_code": null, "iso_3_code": "gcl", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3508", + "scripts": [], + "own_tokenizer": false }, { "name": "Guyanese English Creole", "iso_1_code": null, "iso_3_code": "gyn", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3509", + "scripts": [], + "own_tokenizer": false }, { "name": "Vincentian English Creole", "iso_1_code": null, "iso_3_code": "svc", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3510", + "scripts": [], + "own_tokenizer": false }, { "name": "Tobagonian English Creole", "iso_1_code": null, "iso_3_code": "tgh", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3511", + "scripts": [], + "own_tokenizer": false }, { "name": "Trinidadian English Creole", "iso_1_code": null, "iso_3_code": "trf", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3512", + "scripts": [], + "own_tokenizer": false }, { "name": "Virgin Islands English Creole", "iso_1_code": null, "iso_3_code": "vic", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3513", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3505", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3499", + "scripts": [], + "own_tokenizer": false }, { "name": "Krio", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "bottom", "children": [ { "name": "Equatorial Guinean Pidgin", "iso_1_code": null, "iso_3_code": "fpe", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3515", + "scripts": [], + "own_tokenizer": false }, { "name": "Ghanaian Pidgin English", "iso_1_code": null, "iso_3_code": "gpe", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3516", + "scripts": [], + "own_tokenizer": false }, { "name": "Krio", "iso_1_code": null, "iso_3_code": "kri", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3517", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Pidgin, Nigerian", "iso_1_code": null, "iso_3_code": "pcm", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "3518", + "scripts": [ + "Latn" + ], + "own_tokenizer": true }, { "name": "Pidgin, Cameroon", "iso_1_code": null, "iso_3_code": "wes", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3519", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3514", + "scripts": [], + "own_tokenizer": false }, { "name": "Suriname", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Sranan Tongo", "iso_1_code": null, "iso_3_code": "srn", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3521", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Ndyuka", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Aukan", "iso_1_code": null, "iso_3_code": "djk", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3523", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Kwinti", "iso_1_code": null, "iso_3_code": "kww", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3524", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3522", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3520", + "scripts": [], + "own_tokenizer": false }, { "name": "Western", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Belize English Creole", "iso_1_code": null, "iso_3_code": "bzj", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3526", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Nicaragua English Creole", "iso_1_code": null, "iso_3_code": "bzk", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3527", + "scripts": [], + "own_tokenizer": false }, { "name": "Islander English Creole", "iso_1_code": null, "iso_3_code": "icr", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3528", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Jamaican English Creole", "iso_1_code": null, "iso_3_code": "jam", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3529", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3525", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3498", + "scripts": [], + "own_tokenizer": false }, { "name": "Pacific", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + }, + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Bislama", "iso_1_code": "bi", "iso_3_code": "bis", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3531", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Hawaii Pidgin", "iso_1_code": null, "iso_3_code": "hwc", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3532", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Ngatik Men\u2019s Creole", "iso_1_code": null, "iso_3_code": "ngm", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3533", + "scripts": [], + "own_tokenizer": false }, { "name": "Pitcairn-Norfolk", "iso_1_code": null, "iso_3_code": "pih", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3534", + "scripts": [], + "own_tokenizer": false }, { "name": "Pijin", "iso_1_code": null, "iso_3_code": "pis", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3535", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Kriol", "iso_1_code": null, "iso_3_code": "rop", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3536", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Torres Strait Creole", "iso_1_code": null, "iso_3_code": "tcs", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3537", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Tok Pisin", "iso_1_code": null, "iso_3_code": "tpi", - "tokenizer": { - "name": "nigerian_pidgin", - "tokenizer": "StanzaTokenizer(\"pcm\")" + "tokenizers": { + "Latn": { + "full_object": "StanzaTokenizer(\"pcm\")", + "original_lang_name": "nigerian_pidgin", + "original_lang_code": "pcm", + "scripts": [ + "Latn" + ], + "class_name": "StanzaTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3538", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3530", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3496", + "scripts": [], + "own_tokenizer": false }, { "name": "French based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Lesser Antillean French Creole", "iso_1_code": null, "iso_3_code": "acf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3540", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Tayo", "iso_1_code": null, "iso_3_code": "cks", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3541", + "scripts": [], + "own_tokenizer": false }, { "name": "Seychelles French Creole", "iso_1_code": null, "iso_3_code": "crs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3542", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Guadeloupean French Creole", "iso_1_code": null, "iso_3_code": "gcf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3543", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Guianese French Creole", "iso_1_code": null, "iso_3_code": "gcr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3544", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Haitian Creole", "iso_1_code": "ht", "iso_3_code": "hat", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3545", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Karipuna French Creole", "iso_1_code": null, "iso_3_code": "kmv", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3546", + "scripts": [], + "own_tokenizer": false }, { "name": "Louisiana Creole", "iso_1_code": null, "iso_3_code": "lou", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3547", + "scripts": [], + "own_tokenizer": false }, { "name": "Morisyen", "iso_1_code": null, "iso_3_code": "mfe", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3548", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "R\u00e9union French Creole", "iso_1_code": null, "iso_3_code": "rcf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3549", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "San Miguel French Creole", "iso_1_code": null, "iso_3_code": "scf", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3550", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3539", + "scripts": [], + "own_tokenizer": false }, { "name": "German based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Unserdeutsch", "iso_1_code": null, "iso_3_code": "uln", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3552", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3551", + "scripts": [], + "own_tokenizer": false }, { "name": "Hindi based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Andaman Hindi Creole", "iso_1_code": null, "iso_3_code": "hca", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3554", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3553", + "scripts": [], + "own_tokenizer": false }, { "name": "Iberian based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Papiamentu", "iso_1_code": null, "iso_3_code": "pap", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3556", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3555", + "scripts": [], + "own_tokenizer": false }, { "name": "Japanese-based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Yilan Creole", "iso_1_code": null, "iso_3_code": "ycr", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3558", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3557", + "scripts": [], + "own_tokenizer": false }, { "name": "Kongo based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Kituba", "iso_1_code": null, "iso_3_code": "ktu", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3560", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Kituba", "iso_1_code": null, "iso_3_code": "mkw", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3561", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3559", + "scripts": [], + "own_tokenizer": false }, { "name": "Malay based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "bottom", "children": [ { "name": "Malay, Ambonese", "iso_1_code": null, "iso_3_code": "abs", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3563", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Betawi", "iso_1_code": null, "iso_3_code": "bew", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3564", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Malay, Banda", "iso_1_code": null, "iso_3_code": "bpq", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3565", + "scripts": [], + "own_tokenizer": false }, { "name": "Malaccan Malay Creole", "iso_1_code": null, "iso_3_code": "ccm", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3566", + "scripts": [], + "own_tokenizer": false }, { "name": "Malay, Cocos Islands", "iso_1_code": "ms", "iso_3_code": "coa", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "3567", + "scripts": [], + "own_tokenizer": true }, { "name": "Malay, Larantuka", "iso_1_code": null, "iso_3_code": "lrt", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3568", + "scripts": [], + "own_tokenizer": false }, { "name": "Malay, North Moluccan", "iso_1_code": "ms", "iso_3_code": "max", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "3569", + "scripts": [ + "Latn" + ], + "own_tokenizer": true }, { "name": "Malay, Baba", "iso_1_code": null, "iso_3_code": "mbf", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3570", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Malay, Balinese", "iso_1_code": null, "iso_3_code": "mhp", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3571", + "scripts": [], + "own_tokenizer": false }, { "name": "Malay, Kupang", "iso_1_code": null, "iso_3_code": "mkn", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3572", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Indonesian, Peranakan", "iso_1_code": null, "iso_3_code": "pea", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3573", + "scripts": [], + "own_tokenizer": false }, { "name": "Malay, Papuan", "iso_1_code": null, "iso_3_code": "pmy", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3574", + "scripts": [], + "own_tokenizer": false }, { "name": "Sri Lankan Malay Creole", "iso_1_code": null, "iso_3_code": "sci", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3575", + "scripts": [], + "own_tokenizer": false }, { "name": "Malay, Manado", "iso_1_code": "ms", "iso_3_code": "xmm", - "tokenizer": { - "name": "malay", - "tokenizer": "SpaCyTokenizer(\"ms\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "macrolanguage", - "children": [] + "children": [], + "node_i": "3576", + "scripts": [ + "Latn" + ], + "own_tokenizer": true } - ] + ], + "node_i": "3562", + "scripts": [], + "own_tokenizer": false }, { "name": "Ngbandi based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Sango", "iso_1_code": "sg", "iso_3_code": "sag", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3578", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Sango, Riverain", "iso_1_code": null, "iso_3_code": "snj", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3579", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3577", + "scripts": [], + "own_tokenizer": false }, { "name": "Portuguese based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Angolar", "iso_1_code": null, "iso_3_code": "aoa", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3581", + "scripts": [], + "own_tokenizer": false }, { "name": "Cafundo Creole", "iso_1_code": null, "iso_3_code": "ccd", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3582", + "scripts": [], + "own_tokenizer": false }, { "name": "S\u00e3otomense", "iso_1_code": null, "iso_3_code": "cri", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3583", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Fa d\u2019Ambu", "iso_1_code": null, "iso_3_code": "fab", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3584", + "scripts": [], + "own_tokenizer": false }, { "name": "Indo-Portuguese", "iso_1_code": null, "iso_3_code": "idb", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3585", + "scripts": [], + "own_tokenizer": false }, { "name": "Kabuverdianu", "iso_1_code": null, "iso_3_code": "kea", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3586", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Malaccan Portuguese Creole", "iso_1_code": null, "iso_3_code": "mcm", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3587", + "scripts": [], + "own_tokenizer": false }, { "name": "Macanese", "iso_1_code": null, "iso_3_code": "mzs", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3588", + "scripts": [], + "own_tokenizer": false }, { "name": "Guinea-Bissau Creole", "iso_1_code": null, "iso_3_code": "pov", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3589", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Principense", "iso_1_code": null, "iso_3_code": "pre", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3590", + "scripts": [], + "own_tokenizer": false }, { "name": "Ternate\u00f1o", "iso_1_code": null, "iso_3_code": "tmg", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3591", + "scripts": [], + "own_tokenizer": false }, { "name": "Pidgin, Timor", "iso_1_code": null, "iso_3_code": "tvy", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3592", + "scripts": [], + "own_tokenizer": false }, { "name": "Korlai Portuguese Creole", "iso_1_code": null, "iso_3_code": "vkp", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3593", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3580", + "scripts": [], + "own_tokenizer": false }, { "name": "Spanish based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Chavacano", "iso_1_code": null, "iso_3_code": "cbk", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3595", + "scripts": [ + "Latn" + ], + "own_tokenizer": false }, { "name": "Palenquero", "iso_1_code": null, "iso_3_code": "pln", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3596", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3594", + "scripts": [], + "own_tokenizer": false }, { "name": "Swahili based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Cutchi-Swahili", "iso_1_code": null, "iso_3_code": "ccl", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3598", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3597", + "scripts": [], + "own_tokenizer": false }, { "name": "Tetun based", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Arab": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + }, + "Thai": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", "children": [ { "name": "Tetun Dili", "iso_1_code": null, "iso_3_code": "tdt", - "tokenizer": { - "name": "arabic", - "tokenizer": "SpaCyTokenizer(\"ar\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ms\")", + "original_lang_name": "malay", + "original_lang_code": "msa", + "scripts": [ + "Arab", + "Thai", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": true + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3600", + "scripts": [ + "Latn" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3599", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3481", + "scripts": [], + "own_tokenizer": false } \ No newline at end of file