diff --git "a/data/Dravidian.json" "b/data/Dravidian.json" --- "a/data/Dravidian.json" +++ "b/data/Dravidian.json" @@ -2,1270 +2,2877 @@ "name": "Dravidian", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kolami-Naiki", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kolami, Northwestern", "iso_1_code": null, "iso_3_code": "kfb", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3604", + "scripts": [], + "own_tokenizer": false }, { "name": "Kolami, Southeastern", "iso_1_code": null, "iso_3_code": "nit", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3605", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3603", + "scripts": [], + "own_tokenizer": false }, { "name": "Parji-Gadaba", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gadaba, Mudhili", "iso_1_code": null, "iso_3_code": "gau", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3607", + "scripts": [], + "own_tokenizer": false }, { "name": "Gadaba, Pottangi Ollar", "iso_1_code": null, "iso_3_code": "gdb", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3608", + "scripts": [], + "own_tokenizer": false }, { "name": "Duruwa", "iso_1_code": null, "iso_3_code": "pci", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3609", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3606", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3602", + "scripts": [], + "own_tokenizer": false }, { "name": "Northern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Brahui", "iso_1_code": null, "iso_3_code": "brh", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3611", + "scripts": [ + "Arab" + ], + "own_tokenizer": false }, { "name": "Kumarbhag Paharia", "iso_1_code": null, "iso_3_code": "kmj", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3612", + "scripts": [], + "own_tokenizer": false }, { "name": "Kurux", "iso_1_code": null, "iso_3_code": "kru", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3613", + "scripts": [ + "Deva" + ], + "own_tokenizer": false }, { "name": "Sauria Paharia", "iso_1_code": null, "iso_3_code": "mjt", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3614", + "scripts": [], + "own_tokenizer": false }, { "name": "Kisan", "iso_1_code": null, "iso_3_code": "xis", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3615", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3610", + "scripts": [], + "own_tokenizer": false }, { "name": "South-Central", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Gondi-Kui", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Gondi", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Maria, Dandami", "iso_1_code": null, "iso_3_code": "daq", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3619", + "scripts": [], + "own_tokenizer": false }, { "name": "Muria, Eastern", "iso_1_code": null, "iso_3_code": "emu", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3620", + "scripts": [], + "own_tokenizer": false }, { "name": "Gondi, Aheri", "iso_1_code": null, "iso_3_code": "esg", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3621", + "scripts": [], + "own_tokenizer": false }, { "name": "Muria, Far Western", "iso_1_code": null, "iso_3_code": "fmu", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3622", + "scripts": [ + "Deva" + ], + "own_tokenizer": false }, { "name": "Gondi, Northern", "iso_1_code": null, "iso_3_code": "gno", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3623", + "scripts": [], + "own_tokenizer": false }, { "name": "Khirwar", "iso_1_code": null, "iso_3_code": "kwx", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3624", + "scripts": [], + "own_tokenizer": false }, { "name": "Maria", "iso_1_code": null, "iso_3_code": "mrr", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3625", + "scripts": [], + "own_tokenizer": false }, { "name": "Muria, Western", "iso_1_code": null, "iso_3_code": "mut", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3626", + "scripts": [], + "own_tokenizer": false }, { "name": "Nagarchal", "iso_1_code": null, "iso_3_code": "nbg", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3627", + "scripts": [], + "own_tokenizer": false }, { "name": "Pardhan", "iso_1_code": null, "iso_3_code": "pch", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3628", + "scripts": [], + "own_tokenizer": false }, { "name": "Gondi, Adilabad", "iso_1_code": null, "iso_3_code": "wsg", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3629", + "scripts": [ + "Telu" + ], + "own_tokenizer": false } - ] + ], + "node_i": "3618", + "scripts": [], + "own_tokenizer": false }, { "name": "Konda-Kui", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Konda", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Konda-Dora", "iso_1_code": null, "iso_3_code": "kfc", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3632", + "scripts": [], + "own_tokenizer": false }, { "name": "Mukha-Dora", "iso_1_code": null, "iso_3_code": "mmk", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3633", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3631", + "scripts": [], + "own_tokenizer": false }, { "name": "Manda-Kui", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kui-Kuvi", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kui, Dawik", "iso_1_code": null, "iso_3_code": "dwk", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3636", + "scripts": [], + "own_tokenizer": false }, { "name": "Koya", "iso_1_code": null, "iso_3_code": "kff", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3637", + "scripts": [ + "Telu" + ], + "own_tokenizer": false }, { "name": "Kuvi", "iso_1_code": null, "iso_3_code": "kxv", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3638", + "scripts": [], + "own_tokenizer": false }, { "name": "Kui", "iso_1_code": null, "iso_3_code": "uki", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3639", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3635", + "scripts": [], + "own_tokenizer": false }, { "name": "Manda-Pengo", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Manda", "iso_1_code": null, "iso_3_code": "mha", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3641", + "scripts": [], + "own_tokenizer": false }, { "name": "Pengo", "iso_1_code": null, "iso_3_code": "peg", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3642", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3640", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3634", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3630", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3617", + "scripts": [], + "own_tokenizer": false }, { "name": "Telugu", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Chenchu", "iso_1_code": null, "iso_3_code": "cde", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3644", + "scripts": [], + "own_tokenizer": false }, { "name": "Manna-Dora", "iso_1_code": null, "iso_3_code": "mju", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3645", + "scripts": [], + "own_tokenizer": false }, { "name": "Telugu", "iso_1_code": "te", "iso_3_code": "tel", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "3646", + "scripts": [ + "Telu", + "Latn" + ], + "own_tokenizer": true }, { "name": "Waddar", "iso_1_code": null, "iso_3_code": "wbq", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3647", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3643", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3616", + "scripts": [], + "own_tokenizer": false }, { "name": "Southern", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Kurichiya", "iso_1_code": null, "iso_3_code": "kfh", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3649", + "scripts": [], + "own_tokenizer": false }, { "name": "Kurumba, Attapady", "iso_1_code": null, "iso_3_code": "pkr", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3650", + "scripts": [], + "own_tokenizer": false }, { "name": "Pathiya", "iso_1_code": null, "iso_3_code": "pty", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3651", + "scripts": [], + "own_tokenizer": false }, { "name": "Muduga", "iso_1_code": null, "iso_3_code": "udg", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3652", + "scripts": [], + "own_tokenizer": false }, { "name": "Kumbaran", "iso_1_code": null, "iso_3_code": "wkb", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3653", + "scripts": [], + "own_tokenizer": false }, { "name": "Kalanadi", "iso_1_code": null, "iso_3_code": "wkl", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3654", + "scripts": [], + "own_tokenizer": false }, { "name": "Kunduvadi", "iso_1_code": null, "iso_3_code": "wku", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3655", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamil-Kannada", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Kannada", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Badaga", "iso_1_code": null, "iso_3_code": "bfq", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3658", + "scripts": [], + "own_tokenizer": false }, { "name": "Holiya", "iso_1_code": null, "iso_3_code": "hoy", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3659", + "scripts": [], + "own_tokenizer": false }, { "name": "Kannada", "iso_1_code": "kn", "iso_3_code": "kan", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "3660", + "scripts": [ + "Latn", + "Knda" + ], + "own_tokenizer": true }, { "name": "Urali", "iso_1_code": null, "iso_3_code": "url", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3661", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3657", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamil-Kodagu", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Kodagu", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kodava", "iso_1_code": null, "iso_3_code": "kfa", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3664", + "scripts": [], + "own_tokenizer": false }, { "name": "Kurumba, Kannada", "iso_1_code": null, "iso_3_code": "kfi", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3665", + "scripts": [], + "own_tokenizer": false }, { "name": "Kurumba, Mullu", "iso_1_code": null, "iso_3_code": "kpb", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3666", + "scripts": [], + "own_tokenizer": false }, { "name": "Kurumba, Alu", "iso_1_code": null, "iso_3_code": "xua", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3667", + "scripts": [], + "own_tokenizer": false }, { "name": "Kurumba, Jennu", "iso_1_code": null, "iso_3_code": "xuj", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3668", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3663", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamil-Malayalam", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Mannan", "iso_1_code": null, "iso_3_code": "mjv", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3670", + "scripts": [], + "own_tokenizer": false }, { "name": "Malayalam", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Aranadan", "iso_1_code": null, "iso_3_code": "aaf", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3672", + "scripts": [], + "own_tokenizer": false }, { "name": "Kadar", "iso_1_code": null, "iso_3_code": "kej", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3673", + "scripts": [], + "own_tokenizer": false }, { "name": "Malayalam", "iso_1_code": "ml", "iso_3_code": "mal", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "3674", + "scripts": [ + "Latn", + "Mlym" + ], + "own_tokenizer": true }, { "name": "Malapandaram", "iso_1_code": null, "iso_3_code": "mjp", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3675", + "scripts": [], + "own_tokenizer": false }, { "name": "Malaryan", "iso_1_code": null, "iso_3_code": "mjq", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3676", + "scripts": [], + "own_tokenizer": false }, { "name": "Malavedan", "iso_1_code": null, "iso_3_code": "mjr", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3677", + "scripts": [], + "own_tokenizer": false }, { "name": "Paliyan", "iso_1_code": null, "iso_3_code": "pcf", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3678", + "scripts": [], + "own_tokenizer": false }, { "name": "Paniya", "iso_1_code": null, "iso_3_code": "pcg", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3679", + "scripts": [], + "own_tokenizer": false }, { "name": "Ravula", "iso_1_code": null, "iso_3_code": "yea", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3680", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3671", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamil", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" + "tokenizers": { + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "bottom", "children": [ { "name": "Eravallan", "iso_1_code": null, "iso_3_code": "era", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3682", + "scripts": [], + "own_tokenizer": false }, { "name": "Irula", "iso_1_code": null, "iso_3_code": "iru", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3683", + "scripts": [], + "own_tokenizer": false }, { "name": "Kaikadi", "iso_1_code": null, "iso_3_code": "kep", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3684", + "scripts": [], + "own_tokenizer": false }, { "name": "Kanikkaran", "iso_1_code": null, "iso_3_code": "kev", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3685", + "scripts": [], + "own_tokenizer": false }, { "name": "Muthuvan", "iso_1_code": null, "iso_3_code": "muv", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3686", + "scripts": [], + "own_tokenizer": false }, { "name": "Sholaga", "iso_1_code": null, "iso_3_code": "sle", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3687", + "scripts": [], + "own_tokenizer": false }, { "name": "Tamil", "iso_1_code": "ta", "iso_3_code": "tam", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" + "tokenizers": { + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "own", - "children": [] + "children": [], + "node_i": "3688", + "scripts": [ + "Taml", + "Latn" + ], + "own_tokenizer": true }, { "name": "Kurumba, Betta", "iso_1_code": null, "iso_3_code": "xub", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3689", + "scripts": [], + "own_tokenizer": false }, { "name": "Yerukula", "iso_1_code": null, "iso_3_code": "yeu", - "tokenizer": { - "name": "tamil", - "tokenizer": "SpaCyTokenizer(\"ta\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3690", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3681", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3669", + "scripts": [], + "own_tokenizer": false }, { "name": "Toda-Kota", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Kota", "iso_1_code": null, "iso_3_code": "kfe", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3692", + "scripts": [], + "own_tokenizer": false }, { "name": "Toda", "iso_1_code": null, "iso_3_code": "tcx", - "tokenizer": { - "name": "malayalam", - "tokenizer": "SpaCyTokenizer(\"ml\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3693", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3691", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3662", + "scripts": [], + "own_tokenizer": false }, { "name": "Unclassified", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Chetti, Wayanad", "iso_1_code": null, "iso_3_code": "ctt", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3695", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3694", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3656", + "scripts": [], + "own_tokenizer": false }, { "name": "Tulu", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Bellari", "iso_1_code": null, "iso_3_code": "brw", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3697", + "scripts": [], + "own_tokenizer": false }, { "name": "Kudiya", "iso_1_code": null, "iso_3_code": "kfg", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3698", + "scripts": [], + "own_tokenizer": false }, { "name": "Tulu", "iso_1_code": null, "iso_3_code": "tcy", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", - "children": [] + "children": [], + "node_i": "3699", + "scripts": [ + "Knda" + ], + "own_tokenizer": false }, { "name": "Koraga", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Koraga, Korra", "iso_1_code": null, "iso_3_code": "kfd", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3701", + "scripts": [], + "own_tokenizer": false }, { "name": "Koraga, Mudu", "iso_1_code": null, "iso_3_code": "vmd", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3702", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3700", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3696", + "scripts": [], + "own_tokenizer": false }, { "name": "Unclassified", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Mala Malasar", "iso_1_code": null, "iso_3_code": "ima", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3704", + "scripts": [], + "own_tokenizer": false }, { "name": "Thachanadan", "iso_1_code": null, "iso_3_code": "thn", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3705", + "scripts": [], + "own_tokenizer": false }, { "name": "Ullatan", "iso_1_code": null, "iso_3_code": "ull", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3706", + "scripts": [], + "own_tokenizer": false }, { "name": "Malasar", "iso_1_code": null, "iso_3_code": "ymr", - "tokenizer": { - "name": "kannada", - "tokenizer": "SpaCyTokenizer(\"kn\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3707", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3703", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3648", + "scripts": [], + "own_tokenizer": false }, { "name": "Unclassified", "iso_1_code": null, "iso_3_code": null, - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" + "tokenizers": { + "Latn": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Telu": { + "full_object": "SpaCyTokenizer(\"te\")", + "original_lang_name": "telugu", + "original_lang_code": "tel", + "scripts": [ + "Latn", + "Telu" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Knda": { + "full_object": "SpaCyTokenizer(\"kn\")", + "original_lang_name": "kannada", + "original_lang_code": "kan", + "scripts": [ + "Latn", + "Knda" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Mlym": { + "full_object": "SpaCyTokenizer(\"ml\")", + "original_lang_name": "malayalam", + "original_lang_code": "mal", + "scripts": [ + "Latn", + "Mlym" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + }, + "Taml": { + "full_object": "SpaCyTokenizer(\"ta\")", + "original_lang_name": "tamil", + "original_lang_code": "tam", + "scripts": [ + "Taml", + "Latn" + ], + "class_name": "SpaCyTokenizer", + "macrolanguage": false + } }, - "source": "down", "children": [ { "name": "Allar", "iso_1_code": null, "iso_3_code": "all", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3709", + "scripts": [], + "own_tokenizer": false }, { "name": "Bharia", "iso_1_code": null, "iso_3_code": "bha", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3710", + "scripts": [], + "own_tokenizer": false }, { "name": "Malankuravan", "iso_1_code": null, "iso_3_code": "mjo", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3711", + "scripts": [], + "own_tokenizer": false }, { "name": "Pattapu", "iso_1_code": null, "iso_3_code": "ptq", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3712", + "scripts": [], + "own_tokenizer": false }, { "name": "Vishavan", "iso_1_code": null, "iso_3_code": "vis", - "tokenizer": { - "name": "telugu", - "tokenizer": "SpaCyTokenizer(\"te\")" - }, - "source": "down", - "children": [] + "tokenizers": {}, + "children": [], + "node_i": "3713", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3708", + "scripts": [], + "own_tokenizer": false } - ] + ], + "node_i": "3601", + "scripts": [], + "own_tokenizer": false } \ No newline at end of file