{ "model_type": "vui", "library_name": "vui", "pipeline_tag": "text-to-speech", "license": "mit", "language": ["en"], "architectures": ["VuiForConditionalGeneration"], "model_files": { "base": "vui-100m-base.pt", "abraham": "vui-abraham-100m.pt", "cohost": "vui-cohost-100m.pt", "cohost_alt": "ckpts-vui-cohost-100m.pt", "tokenizer": "fluac-22hz-22khz.pt" }, "model_variants": { "vui-100m-base": { "description": "Base checkpoint trained on 40k hours of audio conversations", "file": "vui-100m-base.pt", "size_mb": 198 }, "vui-abraham-100m": { "description": "Single-speaker model with context awareness", "file": "vui-abraham-100m.pt", "size_mb": 198 }, "vui-cohost-100m": { "description": "Two-speaker model whose speakers can interact with each other", "file": "vui-cohost-100m.pt", "size_mb": 198 } }, "tokenizer_config": { "audio_tokenizer": "fluac", "sample_rate": "22khz", "file": "fluac-22hz-22khz.pt", "size_mb": 307 }, "training_data": { "hours": 40000, "type": "audio_conversations" }, "capabilities": [ "text-to-speech", "conversational-speech", "voice-cloning", "on-device-inference" ], "torch_dtype": "float32", "framework": "pytorch" }