hungchiayu1 committed
Commit 838c300 · 1 Parent(s): 348f0d7

update to tangoflux

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +2 -3
  2. TangoFlux.py +57 -0
  3. app.py +9 -249
  4. audioldm/__init__.py +0 -8
  5. audioldm/__main__.py +0 -183
  6. audioldm/__pycache__/__init__.cpython-310.pyc +0 -0
  7. audioldm/__pycache__/__init__.cpython-39.pyc +0 -0
  8. audioldm/__pycache__/ldm.cpython-310.pyc +0 -0
  9. audioldm/__pycache__/ldm.cpython-39.pyc +0 -0
  10. audioldm/__pycache__/pipeline.cpython-310.pyc +0 -0
  11. audioldm/__pycache__/pipeline.cpython-39.pyc +0 -0
  12. audioldm/__pycache__/utils.cpython-310.pyc +0 -0
  13. audioldm/__pycache__/utils.cpython-39.pyc +0 -0
  14. audioldm/audio/__init__.py +0 -2
  15. audioldm/audio/__pycache__/__init__.cpython-310.pyc +0 -0
  16. audioldm/audio/__pycache__/__init__.cpython-39.pyc +0 -0
  17. audioldm/audio/__pycache__/audio_processing.cpython-310.pyc +0 -0
  18. audioldm/audio/__pycache__/audio_processing.cpython-39.pyc +0 -0
  19. audioldm/audio/__pycache__/mix.cpython-39.pyc +0 -0
  20. audioldm/audio/__pycache__/stft.cpython-310.pyc +0 -0
  21. audioldm/audio/__pycache__/stft.cpython-39.pyc +0 -0
  22. audioldm/audio/__pycache__/tools.cpython-310.pyc +0 -0
  23. audioldm/audio/__pycache__/tools.cpython-39.pyc +0 -0
  24. audioldm/audio/__pycache__/torch_tools.cpython-39.pyc +0 -0
  25. audioldm/audio/audio_processing.py +0 -100
  26. audioldm/audio/stft.py +0 -186
  27. audioldm/audio/tools.py +0 -85
  28. audioldm/hifigan/__init__.py +0 -7
  29. audioldm/hifigan/__pycache__/__init__.cpython-310.pyc +0 -0
  30. audioldm/hifigan/__pycache__/__init__.cpython-39.pyc +0 -0
  31. audioldm/hifigan/__pycache__/models.cpython-310.pyc +0 -0
  32. audioldm/hifigan/__pycache__/models.cpython-39.pyc +0 -0
  33. audioldm/hifigan/__pycache__/utilities.cpython-310.pyc +0 -0
  34. audioldm/hifigan/__pycache__/utilities.cpython-39.pyc +0 -0
  35. audioldm/hifigan/models.py +0 -174
  36. audioldm/hifigan/utilities.py +0 -86
  37. audioldm/latent_diffusion/__init__.py +0 -0
  38. audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
  39. audioldm/latent_diffusion/__pycache__/__init__.cpython-39.pyc +0 -0
  40. audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc +0 -0
  41. audioldm/latent_diffusion/__pycache__/attention.cpython-39.pyc +0 -0
  42. audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc +0 -0
  43. audioldm/latent_diffusion/__pycache__/ddim.cpython-39.pyc +0 -0
  44. audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
  45. audioldm/latent_diffusion/__pycache__/ddpm.cpython-39.pyc +0 -0
  46. audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc +0 -0
  47. audioldm/latent_diffusion/__pycache__/ema.cpython-39.pyc +0 -0
  48. audioldm/latent_diffusion/__pycache__/openaimodel.cpython-39.pyc +0 -0
  49. audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc +0 -0
  50. audioldm/latent_diffusion/__pycache__/util.cpython-39.pyc +0 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Tango2
+title: TangoFlux
 emoji: 🐠
 colorFrom: indigo
 colorTo: pink
@@ -7,7 +7,6 @@ sdk: gradio
 sdk_version: 4.26.0
 app_file: app.py
 pinned: false
-short_description: Fast Text to Audio Generator
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TangoFlux.py ADDED
@@ -0,0 +1,57 @@
+from diffusers import AutoencoderOobleck
+import torch
+from transformers import T5EncoderModel,T5TokenizerFast
+from diffusers import FluxTransformer2DModel
+from torch import nn
+from typing import List
+from diffusers import FlowMatchEulerDiscreteScheduler
+from diffusers.training_utils import compute_density_for_timestep_sampling
+import copy
+import torch.nn.functional as F
+import numpy as np
+from model import TangoFlux
+from huggingface_hub import snapshot_download
+from tqdm import tqdm
+from typing import Optional,Union,List
+from datasets import load_dataset, Audio
+from math import pi
+import json
+import inspect
+import yaml
+from safetensors.torch import load_file
+
+
+class TangoFluxInference:
+
+    def __init__(self,name='declare-lab/TangoFlux',device="cuda"):
+
+
+        self.vae = AutoencoderOobleck.from_pretrained("stabilityai/stable-audio-open-1.0",subfolder='vae')
+
+        paths = snapshot_download(repo_id=name)
+        weights = load_file("{}/tangoflux.safetensors".format(paths))
+
+        with open('{}/config.json'.format(paths),'r') as f:
+            config = json.load(f)
+        self.model = TangoFlux(config)
+        self.model.load_state_dict(weights,strict=False)
+        # _IncompatibleKeys(missing_keys=['text_encoder.encoder.embed_tokens.weight'], unexpected_keys=[]) this behaviour is expected
+        self.vae.to(device)
+        self.model.to(device)
+
+    def generate(self,prompt,steps=25,duration=10,guidance_scale=4.5):
+
+        with torch.no_grad():
+            latents = self.model.inference_flow(prompt,
+                                                duration=duration,
+                                                num_inference_steps=steps,
+                                                guidance_scale=guidance_scale)
+
+
+
+            wave = self.vae.decode(latents.transpose(2,1)).sample.cpu()[0]
+        return wave
+
+
+
+
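For orientation, a minimal, hypothetical usage sketch of the TangoFluxInference wrapper added above. It assumes the bundled `model` module (providing `TangoFlux`) is importable, a CUDA device is available, and that the decoded waveform is stereo at 44.1 kHz (the Stable Audio Open VAE's usual rate; not stated in this diff). The prompt text and output filename are placeholders.

# Hypothetical usage of the wrapper defined in TangoFlux.py above.
import torchaudio
from TangoFlux import TangoFluxInference

tangoflux = TangoFluxInference(name="declare-lab/TangoFlux", device="cuda")

# generate() returns a decoded waveform tensor of shape (channels, samples).
wave = tangoflux.generate("light rain with distant thunder",
                          steps=25, duration=10, guidance_scale=4.5)

# 44100 Hz is an assumption based on the Stable Audio Open VAE; the diff does not state the rate.
torchaudio.save("sample.wav", wave, sample_rate=44100)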
app.py CHANGED
@@ -13,245 +13,23 @@ from gradio import Markdown
 
 import torch
 #from diffusers.models.autoencoder_kl import AutoencoderKL
-from diffusers.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import DiffusionPipeline,AudioPipelineOutput
 from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
 from typing import Union
 from diffusers.utils.torch_utils import randn_tensor
 from tqdm import tqdm
+from TangoFlux import TangoFluxInference
 
 
 
+tangoflux = TangoFluxInference(path="declare-lab/TangoFlux")
 
 
-class Tango2Pipeline(DiffusionPipeline):
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: T5EncoderModel,
-        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
-        unet: UNet2DConditionModel,
-        scheduler: DDPMScheduler
-    ):
-
-        super().__init__()
-
-        self.register_modules(vae=vae,
-                              text_encoder=text_encoder,
-                              tokenizer=tokenizer,
-                              unet=unet,
-                              scheduler=scheduler
-                              )
-
-    def _encode_prompt(self, prompt):
-        device = self.text_encoder.device
-
-        batch = self.tokenizer(
-            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        )
-        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
-
-        encoder_hidden_states = self.text_encoder(
-            input_ids=input_ids, attention_mask=attention_mask
-        )[0]
-
-        boolean_encoder_mask = (attention_mask == 1).to(device)
-
-        return encoder_hidden_states, boolean_encoder_mask
-
-    def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
-        device = self.text_encoder.device
-        batch = self.tokenizer(
-            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        )
-        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
-
-        with torch.no_grad():
-            prompt_embeds = self.text_encoder(
-                input_ids=input_ids, attention_mask=attention_mask
-            )[0]
-
-        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        # get unconditional embeddings for classifier free guidance
-        uncond_tokens = [""] * len(prompt)
-
-        max_length = prompt_embeds.shape[1]
-        uncond_batch = self.tokenizer(
-            uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
-        )
-        uncond_input_ids = uncond_batch.input_ids.to(device)
-        uncond_attention_mask = uncond_batch.attention_mask.to(device)
-
-        with torch.no_grad():
-            negative_prompt_embeds = self.text_encoder(
-                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
-            )[0]
-
-        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-        uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        # For classifier free guidance, we need to do two forward passes.
-        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
-        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
-        boolean_prompt_mask = (prompt_mask == 1).to(device)
-
-        return prompt_embeds, boolean_prompt_mask
-
-    def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
-        shape = (batch_size, num_channels_latents, 256, 16)
-        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * inference_scheduler.init_noise_sigma
-        return latents
-
-    @torch.no_grad()
-    def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
-                  disable_progress=True):
-        device = self.text_encoder.device
-        classifier_free_guidance = guidance_scale > 1.0
-        batch_size = len(prompt) * num_samples_per_prompt
-
-        if classifier_free_guidance:
-            prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
-        else:
-            prompt_embeds, boolean_prompt_mask = self._encode_text(prompt)
-            prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-            boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        inference_scheduler.set_timesteps(num_steps, device=device)
-        timesteps = inference_scheduler.timesteps
-
-        num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
-
-        num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
-        progress_bar = tqdm(range(num_steps), disable=disable_progress)
-
-        for i, t in enumerate(timesteps):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
-            latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
-
-            noise_pred = self.unet(
-                latent_model_input, t, encoder_hidden_states=prompt_embeds,
-                encoder_attention_mask=boolean_prompt_mask
-            ).sample
-
-            # perform guidance
-            if classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
-
-            # call the callback, if provided
-            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
-                progress_bar.update(1)
-
-        return latents
-
-    @torch.no_grad()
-    def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
-        """ Genrate audio for a single prompt string. """
-        with torch.no_grad():
-            latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-            mel = self.vae.decode_first_stage(latents)
-            wave = self.vae.decode_to_waveform(mel)
-
-        return AudioPipelineOutput(audios=wave)
-
-
-# Automatic device detection
-if torch.cuda.is_available():
-    device_type = "cuda"
-    device_selection = "cuda:0"
-else:
-    device_type = "cpu"
-    device_selection = "cpu"
-
-class Tango:
-    def __init__(self, name="declare-lab/tango2", device=device_selection):
-
-        path = snapshot_download(repo_id=name)
-
-        vae_config = json.load(open("{}/vae_config.json".format(path)))
-        stft_config = json.load(open("{}/stft_config.json".format(path)))
-        main_config = json.load(open("{}/main_config.json".format(path)))
-
-        self.vae = AutoencoderKL(**vae_config).to(device)
-        self.stft = TacotronSTFT(**stft_config).to(device)
-        self.model = AudioDiffusion(**main_config).to(device)
-
-        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
-        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
-        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)
-
-        self.vae.load_state_dict(vae_weights)
-        self.stft.load_state_dict(stft_weights)
-        self.model.load_state_dict(main_weights)
-
-        print ("Successfully loaded checkpoint from:", name)
-
-        self.vae.eval()
-        self.stft.eval()
-        self.model.eval()
-
-        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")
-
-    def chunks(self, lst, n):
-        """ Yield successive n-sized chunks from a list. """
-        for i in range(0, len(lst), n):
-            yield lst[i:i + n]
-
-    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
-        """ Genrate audio for a single prompt string. """
-        with torch.no_grad():
-            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-            mel = self.vae.decode_first_stage(latents)
-            wave = self.vae.decode_to_waveform(mel)
-        return wave[0]
-
-    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
-        """ Genrate audio for a list of prompt strings. """
-        outputs = []
-        for k in tqdm(range(0, len(prompts), batch_size)):
-            batch = prompts[k: k+batch_size]
-            with torch.no_grad():
-                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-                mel = self.vae.decode_first_stage(latents)
-                wave = self.vae.decode_to_waveform(mel)
-                outputs += [item for item in wave]
-        if samples == 1:
-            return outputs
-        else:
-            return list(self.chunks(outputs, samples))
-
-# Initialize TANGO
-
-tango = Tango(device="cpu")
-tango.vae.to(device_type)
-tango.stft.to(device_type)
-tango.model.to(device_type)
-
-pipe = Tango2Pipeline(vae=tango.vae,
-                      text_encoder=tango.model.text_encoder,
-                      tokenizer=tango.model.tokenizer,
-                      unet=tango.model.unet,
-                      scheduler=tango.scheduler
-                      )
-
-
-@spaces.GPU(duration=60)
-def gradio_generate(prompt, output_format, steps, guidance):
+@spaces.GPU(duration=15)
+def gradio_generate(prompt, output_format, steps, guidance,duration=10):
+
+    output_wave = tangoflux.generate(prompt,steps=steps,guidance=guidance,duration=duration)
+
     output_wave = pipe(prompt,steps,guidance) ## Using pipeliine automatically uses flash attention for torch2.0 above
     #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
@@ -265,25 +43,6 @@ def gradio_generate(prompt, output_format, steps, guidance):
 
     return output_filename
 
-# description_text = """
-# <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
-# Generate audio using TANGO by providing a text prompt.
-# <br/><br/>Limitations: TANGO is trained on the small AudioCaps dataset so it may not generate good audio \
-# samples related to concepts that it has not seen in training (e.g. singing). For the same reason, TANGO \
-# is not always able to finely control its generations over textual control prompts. For example, \
-# the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes \
-# on a metal table are very similar. \
-# <br/><br/>We are currently training another version of TANGO on larger datasets to enhance its generalization, \
-# compositional and controllable generation ability.
-# <br/><br/>We recommend using a guidance scale of 3. The default number of steps is set to 100. More steps generally lead to better quality of generated audios but will take longer.
-# <br/><br/>
-# <h1> ChatGPT-enhanced audio generation</h1>
-# <br/>
-# As TANGO consists of an instruction-tuned LLM, it is able to process complex sound descriptions allowing us to provide more detailed instructions to improve the generation quality.
-# For example, ``A boat is moving on the sea'' vs ``The sound of the water lapping against the hull of the boat or splashing as you move through the waves''. The latter is obtained by prompting ChatGPT to explain the sound generated when a boat moves on the sea.
-# Using this ChatGPT-generated description of the sound, TANGO provides superior results.
-# <p/>
-# """
 description_text = """
 <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
@@ -294,15 +53,16 @@ Generate audio using Tango2 by providing a text prompt. Tango2 was built from Ta
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_format = gr.Radio(label = "Output format", info = "The file you can dowload", choices = ["mp3", "wav"], value = "wav")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
-denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
+denoising_steps = gr.Slider(minimum=10, maximum=100, value=25, step=1, label="Steps", interactive=True)
 guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
+duration_scale = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
 
 # Gradio interface
 gr_interface = gr.Interface(
     fn=gradio_generate,
-    inputs=[input_text, output_format, denoising_steps, guidance_scale],
+    inputs=[input_text, output_format, denoising_steps, guidance_scale,duration_scale],
     outputs=[output_audio],
-    title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
+    title="TangoFlux: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
     description=description_text,
     allow_flagging=False,
     examples=[
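The diff above wires the new sliders into the Gradio interface, but the truncated hunk does not show how `output_wave` becomes the filepath returned to `gr.Audio(type="filepath")`. Below is a minimal, hypothetical sketch of that glue. The function name, the use of soundfile and ffmpeg, the temp filenames, and the 44.1 kHz rate are all assumptions; only `output_wave`, `output_format`, and `output_filename` come from the diff.

# Hypothetical glue (not shown in this commit): turning the generated waveform into the
# filepath that gr.Audio(type="filepath") expects.
import os
import soundfile as sf

def waveform_to_file(output_wave, output_format="wav", sample_rate=44100):
    # The VAE returns (channels, samples); soundfile expects (samples, channels).
    output_filename = "temp.wav"
    sf.write(output_filename, output_wave.numpy().T, samplerate=sample_rate)
    if output_format == "mp3":
        # One common way to convert; the actual Space may do this differently.
        os.system("ffmpeg -y -i temp.wav temp.mp3")
        output_filename = "temp.mp3"
    return output_filename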
audioldm/__init__.py DELETED
@@ -1,8 +0,0 @@
-from .ldm import LatentDiffusion
-from .utils import seed_everything, save_wave, get_time, get_duration
-from .pipeline import *
-
-
-
-
-
audioldm/__main__.py DELETED
@@ -1,183 +0,0 @@
-#!/usr/bin/python3
-import os
-from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
-import argparse
-
-CACHE_DIR = os.getenv(
-    "AUDIOLDM_CACHE_DIR",
-    os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument(
-    "--mode",
-    type=str,
-    required=False,
-    default="generation",
-    help="generation: text-to-audio generation; transfer: style transfer",
-    choices=["generation", "transfer"]
-)
-
-parser.add_argument(
-    "-t",
-    "--text",
-    type=str,
-    required=False,
-    default="",
-    help="Text prompt to the model for audio generation",
-)
-
-parser.add_argument(
-    "-f",
-    "--file_path",
-    type=str,
-    required=False,
-    default=None,
-    help="(--mode transfer): Original audio file for style transfer; Or (--mode generation): the guidance audio file for generating simialr audio",
-)
-
-parser.add_argument(
-    "--transfer_strength",
-    type=float,
-    required=False,
-    default=0.5,
-    help="A value between 0 and 1. 0 means original audio without transfer, 1 means completely transfer to the audio indicated by text",
-)
-
-parser.add_argument(
-    "-s",
-    "--save_path",
-    type=str,
-    required=False,
-    help="The path to save model output",
-    default="./output",
-)
-
-parser.add_argument(
-    "--model_name",
-    type=str,
-    required=False,
-    help="The checkpoint you gonna use",
-    default="audioldm-s-full",
-    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
-)
-
-parser.add_argument(
-    "-ckpt",
-    "--ckpt_path",
-    type=str,
-    required=False,
-    help="The path to the pretrained .ckpt model",
-    default=None,
-)
-
-parser.add_argument(
-    "-b",
-    "--batchsize",
-    type=int,
-    required=False,
-    default=1,
-    help="Generate how many samples at the same time",
-)
-
-parser.add_argument(
-    "--ddim_steps",
-    type=int,
-    required=False,
-    default=200,
-    help="The sampling step for DDIM",
-)
-
-parser.add_argument(
-    "-gs",
-    "--guidance_scale",
-    type=float,
-    required=False,
-    default=2.5,
-    help="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)",
-)
-
-parser.add_argument(
-    "-dur",
-    "--duration",
-    type=float,
-    required=False,
-    default=10.0,
-    help="The duration of the samples",
-)
-
-parser.add_argument(
-    "-n",
-    "--n_candidate_gen_per_text",
-    type=int,
-    required=False,
-    default=3,
-    help="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
-)
-
-parser.add_argument(
-    "--seed",
-    type=int,
-    required=False,
-    default=42,
-    help="Change this value (any integer number) will lead to a different generation result.",
-)
-
-args = parser.parse_args()
-
-if(args.ckpt_path is not None):
-    print("Warning: ckpt_path has no effect after version 0.0.20.")
-
-assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"
-
-mode = args.mode
-if(mode == "generation" and args.file_path is not None):
-    mode = "generation_audio_to_audio"
-    if(len(args.text) > 0):
-        print("Warning: You have specified the --file_path. --text will be ignored")
-        args.text = ""
-
-save_path = os.path.join(args.save_path, mode)
-
-if(args.file_path is not None):
-    save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))
-
-text = args.text
-random_seed = args.seed
-duration = args.duration
-guidance_scale = args.guidance_scale
-n_candidate_gen_per_text = args.n_candidate_gen_per_text
-
-os.makedirs(save_path, exist_ok=True)
-audioldm = build_model(model_name=args.model_name)
-
-if(args.mode == "generation"):
-    waveform = text_to_audio(
-        audioldm,
-        text,
-        args.file_path,
-        random_seed,
-        duration=duration,
-        guidance_scale=guidance_scale,
-        ddim_steps=args.ddim_steps,
-        n_candidate_gen_per_text=n_candidate_gen_per_text,
-        batchsize=args.batchsize,
-    )
-
-elif(args.mode == "transfer"):
-    assert args.file_path is not None
-    assert os.path.exists(args.file_path), "The original audio file \'%s\' for style transfer does not exist." % args.file_path
-    waveform = style_transfer(
-        audioldm,
-        text,
-        args.file_path,
-        args.transfer_strength,
-        random_seed,
-        duration=duration,
-        guidance_scale=guidance_scale,
-        ddim_steps=args.ddim_steps,
-        batchsize=args.batchsize,
-    )
-    waveform = waveform[:,None,:]
-
-save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
audioldm/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (315 Bytes)
 
audioldm/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (322 Bytes)
 
audioldm/__pycache__/ldm.cpython-310.pyc DELETED
Binary file (16.1 kB)
 
audioldm/__pycache__/ldm.cpython-39.pyc DELETED
Binary file (16 kB)
 
audioldm/__pycache__/pipeline.cpython-310.pyc DELETED
Binary file (6.63 kB)
 
audioldm/__pycache__/pipeline.cpython-39.pyc DELETED
Binary file (6.54 kB)
 
audioldm/__pycache__/utils.cpython-310.pyc DELETED
Binary file (8.01 kB)
 
audioldm/__pycache__/utils.cpython-39.pyc DELETED
Binary file (7.35 kB)
 
audioldm/audio/__init__.py DELETED
@@ -1,2 +0,0 @@
-from .tools import wav_to_fbank, read_wav_file
-from .stft import TacotronSTFT
audioldm/audio/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (253 Bytes)
 
audioldm/audio/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (260 Bytes)
 
audioldm/audio/__pycache__/audio_processing.cpython-310.pyc DELETED
Binary file (2.78 kB)
 
audioldm/audio/__pycache__/audio_processing.cpython-39.pyc DELETED
Binary file (2.78 kB)
 
audioldm/audio/__pycache__/mix.cpython-39.pyc DELETED
Binary file (1.7 kB)
 
audioldm/audio/__pycache__/stft.cpython-310.pyc DELETED
Binary file (4.98 kB)
 
audioldm/audio/__pycache__/stft.cpython-39.pyc DELETED
Binary file (4.99 kB)
 
audioldm/audio/__pycache__/tools.cpython-310.pyc DELETED
Binary file (2.18 kB)
 
audioldm/audio/__pycache__/tools.cpython-39.pyc DELETED
Binary file (2.19 kB)
 
audioldm/audio/__pycache__/torch_tools.cpython-39.pyc DELETED
Binary file (3.79 kB)
 
audioldm/audio/audio_processing.py DELETED
@@ -1,100 +0,0 @@
-import torch
-import numpy as np
-import librosa.util as librosa_util
-from scipy.signal import get_window
-
-
-def window_sumsquare(
-    window,
-    n_frames,
-    hop_length,
-    win_length,
-    n_fft,
-    dtype=np.float32,
-    norm=None,
-):
-    """
-    # from librosa 0.6
-    Compute the sum-square envelope of a window function at a given hop length.
-
-    This is used to estimate modulation effects induced by windowing
-    observations in short-time fourier transforms.
-
-    Parameters
-    ----------
-    window : string, tuple, number, callable, or list-like
-        Window specification, as in `get_window`
-
-    n_frames : int > 0
-        The number of analysis frames
-
-    hop_length : int > 0
-        The number of samples to advance between frames
-
-    win_length : [optional]
-        The length of the window function. By default, this matches `n_fft`.
-
-    n_fft : int > 0
-        The length of each analysis frame.
-
-    dtype : np.dtype
-        The data type of the output
-
-    Returns
-    -------
-    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
-        The sum-squared envelope of the window function
-    """
-    if win_length is None:
-        win_length = n_fft
-
-    n = n_fft + hop_length * (n_frames - 1)
-    x = np.zeros(n, dtype=dtype)
-
-    # Compute the squared window at the desired length
-    win_sq = get_window(window, win_length, fftbins=True)
-    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
-
-    # Fill the envelope
-    for i in range(n_frames):
-        sample = i * hop_length
-        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
-    return x
-
-
-def griffin_lim(magnitudes, stft_fn, n_iters=30):
-    """
-    PARAMS
-    ------
-    magnitudes: spectrogram magnitudes
-    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
-    """
-
-    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
-    angles = angles.astype(np.float32)
-    angles = torch.autograd.Variable(torch.from_numpy(angles))
-    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-
-    for i in range(n_iters):
-        _, angles = stft_fn.transform(signal)
-        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-    return signal
-
-
-def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
-    """
-    PARAMS
-    ------
-    C: compression factor
-    """
-    return normalize_fun(torch.clamp(x, min=clip_val) * C)
-
-
-def dynamic_range_decompression(x, C=1):
-    """
-    PARAMS
-    ------
-    C: compression factor used to compress
-    """
-    return torch.exp(x) / C
audioldm/audio/stft.py DELETED
@@ -1,186 +0,0 @@
-import torch
-import torch.nn.functional as F
-import numpy as np
-from scipy.signal import get_window
-from librosa.util import pad_center, tiny
-from librosa.filters import mel as librosa_mel_fn
-
-from audioldm.audio.audio_processing import (
-    dynamic_range_compression,
-    dynamic_range_decompression,
-    window_sumsquare,
-)
-
-
-class STFT(torch.nn.Module):
-    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
-
-    def __init__(self, filter_length, hop_length, win_length, window="hann"):
-        super(STFT, self).__init__()
-        self.filter_length = filter_length
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.window = window
-        self.forward_transform = None
-        scale = self.filter_length / self.hop_length
-        fourier_basis = np.fft.fft(np.eye(self.filter_length))
-
-        cutoff = int((self.filter_length / 2 + 1))
-        fourier_basis = np.vstack(
-            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
-        )
-
-        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
-        inverse_basis = torch.FloatTensor(
-            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
-        )
-
-        if window is not None:
-            assert filter_length >= win_length
-            # get window and zero center pad it to filter_length
-            fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
-            fft_window = torch.from_numpy(fft_window).float()
-
-            # window the bases
-            forward_basis *= fft_window
-            inverse_basis *= fft_window
-
-        self.register_buffer("forward_basis", forward_basis.float())
-        self.register_buffer("inverse_basis", inverse_basis.float())
-
-    def transform(self, input_data):
-        device = self.forward_basis.device
-        input_data = input_data.to(device)
-
-        num_batches = input_data.size(0)
-        num_samples = input_data.size(1)
-
-        self.num_samples = num_samples
-
-        # similar to librosa, reflect-pad the input
-        input_data = input_data.view(num_batches, 1, num_samples)
-        input_data = F.pad(
-            input_data.unsqueeze(1),
-            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
-            mode="reflect",
-        )
-        input_data = input_data.squeeze(1)
-
-        forward_transform = F.conv1d(
-            input_data,
-            torch.autograd.Variable(self.forward_basis, requires_grad=False),
-            stride=self.hop_length,
-            padding=0,
-        )#.cpu()
-
-        cutoff = int((self.filter_length / 2) + 1)
-        real_part = forward_transform[:, :cutoff, :]
-        imag_part = forward_transform[:, cutoff:, :]
-
-        magnitude = torch.sqrt(real_part**2 + imag_part**2)
-        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
-
-        return magnitude, phase
-
-    def inverse(self, magnitude, phase):
-        device = self.forward_basis.device
-        magnitude, phase = magnitude.to(device), phase.to(device)
-
-        recombine_magnitude_phase = torch.cat(
-            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
-        )
-
-        inverse_transform = F.conv_transpose1d(
-            recombine_magnitude_phase,
-            torch.autograd.Variable(self.inverse_basis, requires_grad=False),
-            stride=self.hop_length,
-            padding=0,
-        )
-
-        if self.window is not None:
-            window_sum = window_sumsquare(
-                self.window,
-                magnitude.size(-1),
-                hop_length=self.hop_length,
-                win_length=self.win_length,
-                n_fft=self.filter_length,
-                dtype=np.float32,
-            )
-            # remove modulation effects
-            approx_nonzero_indices = torch.from_numpy(
-                np.where(window_sum > tiny(window_sum))[0]
-            )
-            window_sum = torch.autograd.Variable(
-                torch.from_numpy(window_sum), requires_grad=False
-            )
-            window_sum = window_sum
-            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
-                approx_nonzero_indices
-            ]
-
-            # scale by hop ratio
-            inverse_transform *= float(self.filter_length) / self.hop_length
-
-        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
-        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
-
-        return inverse_transform
-
-    def forward(self, input_data):
-        self.magnitude, self.phase = self.transform(input_data)
-        reconstruction = self.inverse(self.magnitude, self.phase)
-        return reconstruction
-
-
-class TacotronSTFT(torch.nn.Module):
-    def __init__(
-        self,
-        filter_length,
-        hop_length,
-        win_length,
-        n_mel_channels,
-        sampling_rate,
-        mel_fmin,
-        mel_fmax,
-    ):
-        super(TacotronSTFT, self).__init__()
-        self.n_mel_channels = n_mel_channels
-        self.sampling_rate = sampling_rate
-        self.stft_fn = STFT(filter_length, hop_length, win_length)
-        mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
-        )
-        mel_basis = torch.from_numpy(mel_basis).float()
-        self.register_buffer("mel_basis", mel_basis)
-
-    def spectral_normalize(self, magnitudes, normalize_fun):
-        output = dynamic_range_compression(magnitudes, normalize_fun)
-        return output
-
-    def spectral_de_normalize(self, magnitudes):
-        output = dynamic_range_decompression(magnitudes)
-        return output
-
-    def mel_spectrogram(self, y, normalize_fun=torch.log):
-        """Computes mel-spectrograms from a batch of waves
-        PARAMS
-        ------
-        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
-
-        RETURNS
-        -------
-        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
-        """
-        assert torch.min(y.data) >= -1, torch.min(y.data)
-        assert torch.max(y.data) <= 1, torch.max(y.data)
-
-        magnitudes, phases = self.stft_fn.transform(y)
-        magnitudes = magnitudes.data
-        mel_output = torch.matmul(self.mel_basis, magnitudes)
-        mel_output = self.spectral_normalize(mel_output, normalize_fun)
-        energy = torch.norm(magnitudes, dim=1)
-
-        log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)
-
-        return mel_output, log_magnitudes, energy
audioldm/audio/tools.py DELETED
@@ -1,85 +0,0 @@
-import torch
-import numpy as np
-import torchaudio
-
-
-def get_mel_from_wav(audio, _stft):
-    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
-    audio = torch.autograd.Variable(audio, requires_grad=False)
-    melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
-    melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
-    log_magnitudes_stft = (
-        torch.squeeze(log_magnitudes_stft, 0).numpy().astype(np.float32)
-    )
-    energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
-    return melspec, log_magnitudes_stft, energy
-
-
-def _pad_spec(fbank, target_length=1024):
-    n_frames = fbank.shape[0]
-    p = target_length - n_frames
-    # cut and pad
-    if p > 0:
-        m = torch.nn.ZeroPad2d((0, 0, 0, p))
-        fbank = m(fbank)
-    elif p < 0:
-        fbank = fbank[0:target_length, :]
-
-    if fbank.size(-1) % 2 != 0:
-        fbank = fbank[..., :-1]
-
-    return fbank
-
-
-def pad_wav(waveform, segment_length):
-    waveform_length = waveform.shape[-1]
-    assert waveform_length > 100, "Waveform is too short, %s" % waveform_length
-    if segment_length is None or waveform_length == segment_length:
-        return waveform
-    elif waveform_length > segment_length:
-        return waveform[:segment_length]
-    elif waveform_length < segment_length:
-        temp_wav = np.zeros((1, segment_length))
-        temp_wav[:, :waveform_length] = waveform
-    return temp_wav
-
-def normalize_wav(waveform):
-    waveform = waveform - np.mean(waveform)
-    waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
-    return waveform * 0.5
-
-
-def read_wav_file(filename, segment_length):
-    # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower
-    waveform, sr = torchaudio.load(filename)  # Faster!!!
-    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
-    waveform = waveform.numpy()[0, ...]
-    waveform = normalize_wav(waveform)
-    waveform = waveform[None, ...]
-    waveform = pad_wav(waveform, segment_length)
-
-    waveform = waveform / np.max(np.abs(waveform))
-    waveform = 0.5 * waveform
-
-    return waveform
-
-
-def wav_to_fbank(filename, target_length=1024, fn_STFT=None):
-    assert fn_STFT is not None
-
-    # mixup
-    waveform = read_wav_file(filename, target_length * 160)  # hop size is 160
-
-    waveform = waveform[0, ...]
-    waveform = torch.FloatTensor(waveform)
-
-    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
-
-    fbank = torch.FloatTensor(fbank.T)
-    log_magnitudes_stft = torch.FloatTensor(log_magnitudes_stft.T)
-
-    fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
-        log_magnitudes_stft, target_length
-    )
-
-    return fbank, log_magnitudes_stft, waveform
audioldm/hifigan/__init__.py DELETED
@@ -1,7 +0,0 @@
-from .models import Generator
-
-
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
audioldm/hifigan/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (569 Bytes)
 
audioldm/hifigan/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (574 Bytes)
 
audioldm/hifigan/__pycache__/models.cpython-310.pyc DELETED
Binary file (3.73 kB)
 
audioldm/hifigan/__pycache__/models.cpython-39.pyc DELETED
Binary file (3.73 kB)
 
audioldm/hifigan/__pycache__/utilities.cpython-310.pyc DELETED
Binary file (2.48 kB)
 
audioldm/hifigan/__pycache__/utilities.cpython-39.pyc DELETED
Binary file (2.37 kB)
 
audioldm/hifigan/models.py DELETED
@@ -1,174 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn import Conv1d, ConvTranspose1d
-from torch.nn.utils import weight_norm, remove_weight_norm
-
-LRELU_SLOPE = 0.1
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-class ResBlock(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock, self).__init__()
-        self.h = h
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
-
-
-class Generator(torch.nn.Module):
-    def __init__(self, h):
-        super(Generator, self).__init__()
-        self.h = h
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-        self.conv_pre = weight_norm(
-            Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
-        )
-        resblock = ResBlock
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        h.upsample_initial_channel // (2**i),
-                        h.upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(h, ch, k, d))
-
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-        self.ups.apply(init_weights)
-        self.conv_post.apply(init_weights)
-
-    def forward(self, x):
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            x = self.ups[i](x)
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        # print("Removing weight norm...")
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
audioldm/hifigan/utilities.py DELETED
@@ -1,86 +0,0 @@
-import os
-import json
-
-import torch
-import numpy as np
-
-import audioldm.hifigan as hifigan
-
-HIFIGAN_16K_64 = {
-    "resblock": "1",
-    "num_gpus": 6,
-    "batch_size": 16,
-    "learning_rate": 0.0002,
-    "adam_b1": 0.8,
-    "adam_b2": 0.99,
-    "lr_decay": 0.999,
-    "seed": 1234,
-    "upsample_rates": [5, 4, 2, 2, 2],
-    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
-    "upsample_initial_channel": 1024,
-    "resblock_kernel_sizes": [3, 7, 11],
-    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-    "segment_size": 8192,
-    "num_mels": 64,
-    "num_freq": 1025,
-    "n_fft": 1024,
-    "hop_size": 160,
-    "win_size": 1024,
-    "sampling_rate": 16000,
-    "fmin": 0,
-    "fmax": 8000,
-    "fmax_for_loss": None,
-    "num_workers": 4,
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321",
-        "world_size": 1,
-    },
-}
-
-
-def get_available_checkpoint_keys(model, ckpt):
-    print("==> Attemp to reload from %s" % ckpt)
-    state_dict = torch.load(ckpt)["state_dict"]
-    current_state_dict = model.state_dict()
-    new_state_dict = {}
-    for k in state_dict.keys():
-        if (
-            k in current_state_dict.keys()
-            and current_state_dict[k].size() == state_dict[k].size()
-        ):
-            new_state_dict[k] = state_dict[k]
-        else:
-            print("==> WARNING: Skipping %s" % k)
-    print(
-        "%s out of %s keys are matched"
-        % (len(new_state_dict.keys()), len(state_dict.keys()))
-    )
-    return new_state_dict
-
-
-def get_param_num(model):
-    num_param = sum(param.numel() for param in model.parameters())
-    return num_param
-
-
-def get_vocoder(config, device):
-    config = hifigan.AttrDict(HIFIGAN_16K_64)
-    vocoder = hifigan.Generator(config)
-    vocoder.eval()
-    vocoder.remove_weight_norm()
-    vocoder.to(device)
-    return vocoder
-
-
-def vocoder_infer(mels, vocoder, lengths=None):
-    vocoder.eval()
-    with torch.no_grad():
-        wavs = vocoder(mels).squeeze(1)
-
-    wavs = (wavs.cpu().numpy() * 32768).astype("int16")
-
-    if lengths is not None:
-        wavs = wavs[:, :lengths]
-
-    return wavs
audioldm/latent_diffusion/__init__.py DELETED
File without changes
audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (157 Bytes)
 
audioldm/latent_diffusion/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (164 Bytes)
 
audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc DELETED
Binary file (11.4 kB)
 
audioldm/latent_diffusion/__pycache__/attention.cpython-39.pyc DELETED
Binary file (11.4 kB)
 
audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc DELETED
Binary file (7.2 kB)
 
audioldm/latent_diffusion/__pycache__/ddim.cpython-39.pyc DELETED
Binary file (7.11 kB)
 
audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc DELETED
Binary file (11.1 kB)
 
audioldm/latent_diffusion/__pycache__/ddpm.cpython-39.pyc DELETED
Binary file (11 kB)
 
audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc DELETED
Binary file (3.01 kB)
 
audioldm/latent_diffusion/__pycache__/ema.cpython-39.pyc DELETED
Binary file (3 kB)
 
audioldm/latent_diffusion/__pycache__/openaimodel.cpython-39.pyc DELETED
Binary file (23.7 kB)
 
audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc DELETED
Binary file (9.53 kB)
 
audioldm/latent_diffusion/__pycache__/util.cpython-39.pyc DELETED
Binary file (9.6 kB)