Orpheus Onnx Model
Browse files — Build ONNX version of the Orpheus Text-To-Speech model
- .gitattributes +38 -35
- OrpheusOnnx.ipynb +0 -0
- README.md +99 -0
- genai_config.json +56 -0
- model.onnx +3 -0
- model.onnx.data +3 -0
- special_tokens_map.json +26 -0
- tokenizer.json +3 -0
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,38 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.onnx.data filter=lfs diff=lfs merge=lfs -text
|
38 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
OrpheusOnnx.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
README.md
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: Prince-1/orpheus_3b_0.1_ft_16bit
|
3 |
+
tags:
|
4 |
+
- bitsandbytes
|
5 |
+
- llama
|
6 |
+
- text-generation-inference
|
7 |
+
- transformers
|
8 |
+
- text-to-speech
|
9 |
+
- trl
|
10 |
+
- tts
|
11 |
+
- onnx
|
12 |
+
- onnxruntime-genai
|
13 |
+
license: apache-2.0
|
14 |
+
library_name: transformers
|
15 |
+
language:
|
16 |
+
- en
|
17 |
+
datasets:
|
18 |
+
- MrDragonFox/Elise
|
19 |
+
---
|
20 |
+
|
21 |
+
# Uploaded model
|
22 |
+
|
23 |
+
- **Converted by:** Prince-1
|
24 |
+
- **License:** apache-2.0
|
25 |
+
- **Original model:** Prince-1/orpheus_3b_0.1_ft_16bit
|
26 |
+
|
27 |
+
[<img src="https://raw.githubusercontent.com/microsoft/onnxruntime/main/docs/images/ONNX_Runtime_logo_dark.png" width="200"/>](https://github.com/microsoft/onnxruntime-genai)
|
28 |
+
|
29 |
+
|
30 |
+
Orpheus TTS is a state-of-the-art, Llama-based Speech-LLM designed for high-quality, empathetic text-to-speech generation. This model has been finetuned to deliver human-level speech synthesis, achieving exceptional clarity, expressiveness, and real-time streaming performance.
|
31 |
+
|
32 |
+
# Model Details
|
33 |
+
|
34 |
+
### Model Capabilities
|
35 |
+
|
36 |
+
- **Human-Like Speech**: Natural intonation, emotion, and rhythm that is superior to SOTA closed source models
|
37 |
+
- **Zero-Shot Voice Cloning**: Clone voices without prior fine-tuning
|
38 |
+
- **Guided Emotion and Intonation**: Control speech and emotion characteristics with simple tags
|
39 |
+
- **Low Latency**: ~200ms streaming latency for realtime applications, reducible to ~100ms with input streaming
|
40 |
+
|
41 |
+
# Prerequisites
|
42 |
+
|
43 |
+
Before starting the conversion process, ensure your system meets the following requirements:
|
44 |
+
|
45 |
+
- NVIDIA GPU with CUDA toolkit installed
|
46 |
+
|
47 |
+
- Minimum 16 GB RAM (recommended)
|
48 |
+
|
49 |
+
- Python with pip installed
|
50 |
+
|
51 |
+
### Model Sources
|
52 |
+
|
53 |
+
- **GitHub Repo:** [https://github.com/canopyai/Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS)
|
54 |
+
- **Blog Post:** [https://canopylabs.ai/model-releases](https://canopylabs.ai/model-releases)
|
55 |
+
- **Colab Inference Notebook:** [notebook link](https://colab.research.google.com/drive/1KhXT56UePPUHhqitJNUxq63k-pQomz3N?usp=sharing)
|
56 |
+
|
57 |
+
|
58 |
+
# Conversion Steps
|
59 |
+
|
60 |
+
## Clone the Repository
|
61 |
+
|
62 |
+
1. First, clone the official ONNX Runtime GenAI repository:
|
63 |
+
|
64 |
+
```bash
|
65 |
+
git clone https://github.com/microsoft/onnxruntime-genai
|
66 |
+
```
|
67 |
+
|
68 |
+
## Download Huggingface model
|
69 |
+
|
70 |
+
2. Download the Huggingface model using the following CLI command
|
71 |
+
|
72 |
+
``` bash
|
73 |
+
huggingface-cli download Prince-1/orpheus_3b_0.1_ft_16bit --local-dir main
|
74 |
+
```
|
75 |
+
|
76 |
+
## Run the Model Builder
|
77 |
+
|
78 |
+
3. Use the model builder script to convert the Orpheus 3B model to ONNX format:
|
79 |
+
|
80 |
+
```bash
|
81 |
+
# Set the path to the builder script
|
82 |
+
$script_path="onnxruntime-genai/src/python/py/models/builder.py"
|
83 |
+
# Run the conversion
|
84 |
+
python $script_path -m "Prince-1/orpheus_3b_0.1_ft_16bit" -i "main" -o "onnx" -p "fp16" -e cuda
|
85 |
+
```
|
86 |
+
|
87 |
+
The command parameters:
|
88 |
+
|
89 |
+
**-m**: The model name/path (HuggingFace model identifier)

**-i**: Input directory containing the downloaded model files
|
90 |
+
|
91 |
+
**-o**: Output directory for the ONNX model
|
92 |
+
|
93 |
+
**-p**: Precision setting (fp16 for half-precision floating point)
|
94 |
+
|
95 |
+
**-e**: Execution provider (cuda for NVIDIA GPU acceleration)
|
96 |
+
|
97 |
+
|
98 |
+
# Model Misuse
|
99 |
+
Do not use our models for impersonation without consent, misinformation or deception (including fake news or fraudulent calls), or any illegal or harmful activity. By using this model, you agree to follow all applicable laws and ethical guidelines. We disclaim responsibility for any use.
|
genai_config.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"bos_token_id": 128000,
|
4 |
+
"context_length": 131072,
|
5 |
+
"decoder": {
|
6 |
+
"session_options": {
|
7 |
+
"log_id": "onnxruntime-genai",
|
8 |
+
"provider_options": [
|
9 |
+
{
|
10 |
+
"cuda": {
|
11 |
+
"enable_cuda_graph": "0",
|
12 |
+
"enable_skip_layer_norm_strict_mode": "1"
|
13 |
+
}
|
14 |
+
}
|
15 |
+
]
|
16 |
+
},
|
17 |
+
"filename": "model.onnx",
|
18 |
+
"head_size": 128,
|
19 |
+
"hidden_size": 3072,
|
20 |
+
"inputs": {
|
21 |
+
"input_ids": "input_ids",
|
22 |
+
"attention_mask": "attention_mask",
|
23 |
+
"past_key_names": "past_key_values.%d.key",
|
24 |
+
"past_value_names": "past_key_values.%d.value"
|
25 |
+
},
|
26 |
+
"outputs": {
|
27 |
+
"logits": "logits",
|
28 |
+
"present_key_names": "present.%d.key",
|
29 |
+
"present_value_names": "present.%d.value"
|
30 |
+
},
|
31 |
+
"num_attention_heads": 24,
|
32 |
+
"num_hidden_layers": 28,
|
33 |
+
"num_key_value_heads": 8
|
34 |
+
},
|
35 |
+
"eos_token_id": 128009,
|
36 |
+
"pad_token_id": 128004,
|
37 |
+
"type": "llama",
|
38 |
+
"vocab_size": 156940
|
39 |
+
},
|
40 |
+
"search": {
|
41 |
+
"diversity_penalty": 0.0,
|
42 |
+
"do_sample": true,
|
43 |
+
"early_stopping": true,
|
44 |
+
"length_penalty": 1.0,
|
45 |
+
"max_length": 131072,
|
46 |
+
"min_length": 0,
|
47 |
+
"no_repeat_ngram_size": 0,
|
48 |
+
"num_beams": 1,
|
49 |
+
"num_return_sequences": 1,
|
50 |
+
"past_present_share_buffer": true,
|
51 |
+
"repetition_penalty": 1.0,
|
52 |
+
"temperature": 0.6,
|
53 |
+
"top_k": 1,
|
54 |
+
"top_p": 0.9
|
55 |
+
}
|
56 |
+
}
|
model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b7a513433ece58ff3c985de9270e1ebe957eef0f629dcdee1c64eae467cde5c
|
3 |
+
size 146414
|
model.onnx.data
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e8617ee0eb6f9085a25d9a47631fd3132456fe5849068ddb13e4a838ded16cbc
|
3 |
+
size 7599527936
|
special_tokens_map.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|audio|>"
|
4 |
+
],
|
5 |
+
"bos_token": {
|
6 |
+
"content": "<|begin_of_text|>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false
|
11 |
+
},
|
12 |
+
"eos_token": {
|
13 |
+
"content": "<|eot_id|>",
|
14 |
+
"lstrip": false,
|
15 |
+
"normalized": false,
|
16 |
+
"rstrip": false,
|
17 |
+
"single_word": false
|
18 |
+
},
|
19 |
+
"pad_token": {
|
20 |
+
"content": "<|finetune_right_pad_id|>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false
|
25 |
+
}
|
26 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fc3fecb199b4170636dbfab986d25f628157268d37b861f9cadaca60b1353bce
|
3 |
+
size 22849547
|
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|