prithivMLmods committed (verified) · Commit 3d2b2fc · Parent(s): 4e615c4

Update README.md

Files changed (1): README.md (+106, -0)

README.md CHANGED

@@ -25,6 +25,10 @@ tags:
 
 <audio controls src="https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/iTcZ1e2UYo_CkurPR_fsh.wav"></audio>
 
+[ paralinguistic emotions soft]
+
+<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/A8KfCQs7nwyk07kMM_r7P.wav"></audio>
+
 ## **Model Details**
 
 - **Base Model:** `canopylabs/orpheus-3b-0.1-ft`
@@ -63,6 +67,108 @@ notebook_login()
 
 ## **Usage**
 
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+from snac import SNAC
+
+def redistribute_codes(row):
+    """
+    Convert a sequence of token codes into an audio waveform using SNAC.
+    The code assumes each 7 tokens represent one group of instructions.
+    """
+    row_length = row.size(0)
+    new_length = (row_length // 7) * 7
+    trimmed_row = row[:new_length]
+    code_list = [t - 128266 for t in trimmed_row]
+
+    layer_1, layer_2, layer_3 = [], [], []
+
+    for i in range((len(code_list) + 1) // 7):
+        layer_1.append(code_list[7 * i][None])
+        layer_2.append(code_list[7 * i + 1][None] - 4096)
+        layer_3.append(code_list[7 * i + 2][None] - (2 * 4096))
+        layer_3.append(code_list[7 * i + 3][None] - (3 * 4096))
+        layer_2.append(code_list[7 * i + 4][None] - (4 * 4096))
+        layer_3.append(code_list[7 * i + 5][None] - (5 * 4096))
+        layer_3.append(code_list[7 * i + 6][None] - (6 * 4096))
+
+    with torch.no_grad():
+        codes = [
+            torch.concat(layer_1),
+            torch.concat(layer_2),
+            torch.concat(layer_3)
+        ]
+        for i in range(len(codes)):
+            codes[i][codes[i] < 0] = 0
+            codes[i] = codes[i][None]
+
+        audio_hat = snac_model.decode(codes)
+    return audio_hat.cpu()[0, 0]
+
+# Load the SNAC model for audio decoding
+snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
+
+# Load the single-speaker language model
+tokenizer = AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper')
+model = AutoModelForCausalLM.from_pretrained(
+    'prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16
+).cuda()
+
+def generate_audio(text, temperature, top_p, max_new_tokens):
+    """
+    Given input text, generate speech audio.
+    """
+    speaker = "Cooper"
+    prompt = f'<custom_token_3><|begin_of_text|>{speaker}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
+    input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
+
+    with torch.no_grad():
+        generated_ids = model.generate(
+            **input_ids,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_p=top_p,
+            repetition_penalty=1.1,
+            num_return_sequences=1,
+            eos_token_id=128258,
+        )
+
+    row = generated_ids[0, input_ids['input_ids'].shape[1]:]
+    y_tensor = redistribute_codes(row)
+    y_np = y_tensor.detach().cpu().numpy()
+    return (24000, y_np)
+
+# Gradio Interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Llama-3B-Mono-Cooper - Single Speaker Audio Generation")
+    gr.Markdown("Generate speech audio using the `prithivMLmods/Llama-3B-Mono-Cooper` model.")
+
+    with gr.Row():
+        text_input = gr.Textbox(lines=4, label="Input Text")
+
+    with gr.Row():
+        temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
+        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
+        tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")
+
+    output_audio = gr.Audio(type="numpy", label="Generated Audio")
+    generate_button = gr.Button("Generate Audio")
+
+    generate_button.click(
+        fn=generate_audio,
+        inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
+        outputs=output_audio
+    )
+
+if __name__ == "__main__":
+    demo.launch()
+```
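For a quick check outside the Gradio UI, here is a minimal sketch (not part of the commit above): it assumes the added snippet has already been run in the same session so that `generate_audio` is in scope, and that the optional `soundfile` package is installed.

```py
# Hypothetical headless usage of generate_audio() from the snippet above.
# Assumes that snippet has been executed (model, tokenizer, SNAC loaded)
# and that `soundfile` is installed for writing the WAV file.
import soundfile as sf

sample_rate, waveform = generate_audio(
    "Hello! This is a quick test of the Cooper voice.",
    temperature=0.9,
    top_p=0.8,
    max_new_tokens=1200,
)
sf.write("cooper_test.wav", waveform, sample_rate)  # 24 kHz mono output
```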
+
+[ or ]
+
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM