Spaces: Running on Zero

hungchiayu1 committed · Commit 838c300 · Parent(s): 348f0d7

update to tangoflux

This view is limited to 50 files because it contains too many changes. See raw diff.
- README.md +2 -3
- TangoFlux.py +57 -0
- app.py +9 -249
- audioldm/__init__.py +0 -8
- audioldm/__main__.py +0 -183
- audioldm/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/__pycache__/__init__.cpython-39.pyc +0 -0
- audioldm/__pycache__/ldm.cpython-310.pyc +0 -0
- audioldm/__pycache__/ldm.cpython-39.pyc +0 -0
- audioldm/__pycache__/pipeline.cpython-310.pyc +0 -0
- audioldm/__pycache__/pipeline.cpython-39.pyc +0 -0
- audioldm/__pycache__/utils.cpython-310.pyc +0 -0
- audioldm/__pycache__/utils.cpython-39.pyc +0 -0
- audioldm/audio/__init__.py +0 -2
- audioldm/audio/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/audio/__pycache__/__init__.cpython-39.pyc +0 -0
- audioldm/audio/__pycache__/audio_processing.cpython-310.pyc +0 -0
- audioldm/audio/__pycache__/audio_processing.cpython-39.pyc +0 -0
- audioldm/audio/__pycache__/mix.cpython-39.pyc +0 -0
- audioldm/audio/__pycache__/stft.cpython-310.pyc +0 -0
- audioldm/audio/__pycache__/stft.cpython-39.pyc +0 -0
- audioldm/audio/__pycache__/tools.cpython-310.pyc +0 -0
- audioldm/audio/__pycache__/tools.cpython-39.pyc +0 -0
- audioldm/audio/__pycache__/torch_tools.cpython-39.pyc +0 -0
- audioldm/audio/audio_processing.py +0 -100
- audioldm/audio/stft.py +0 -186
- audioldm/audio/tools.py +0 -85
- audioldm/hifigan/__init__.py +0 -7
- audioldm/hifigan/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/hifigan/__pycache__/__init__.cpython-39.pyc +0 -0
- audioldm/hifigan/__pycache__/models.cpython-310.pyc +0 -0
- audioldm/hifigan/__pycache__/models.cpython-39.pyc +0 -0
- audioldm/hifigan/__pycache__/utilities.cpython-310.pyc +0 -0
- audioldm/hifigan/__pycache__/utilities.cpython-39.pyc +0 -0
- audioldm/hifigan/models.py +0 -174
- audioldm/hifigan/utilities.py +0 -86
- audioldm/latent_diffusion/__init__.py +0 -0
- audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/__init__.cpython-39.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/attention.cpython-39.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ddim.cpython-39.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ddpm.cpython-39.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ema.cpython-39.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/openaimodel.cpython-39.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/util.cpython-39.pyc +0 -0
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: TangoFlux
 emoji: 🐠
 colorFrom: indigo
 colorTo: pink
@@ -7,7 +7,6 @@ sdk: gradio
 sdk_version: 4.26.0
 app_file: app.py
 pinned: false
-short_description: Fast Text to Audio Generator
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TangoFlux.py
ADDED
@@ -0,0 +1,57 @@
from diffusers import AutoencoderOobleck
import torch
from transformers import T5EncoderModel, T5TokenizerFast
from diffusers import FluxTransformer2DModel
from torch import nn
from typing import List
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.training_utils import compute_density_for_timestep_sampling
import copy
import torch.nn.functional as F
import numpy as np
from model import TangoFlux
from huggingface_hub import snapshot_download
from tqdm import tqdm
from typing import Optional, Union, List
from datasets import load_dataset, Audio
from math import pi
import json
import inspect
import yaml
from safetensors.torch import load_file


class TangoFluxInference:

    def __init__(self, name='declare-lab/TangoFlux', device="cuda"):

        self.vae = AutoencoderOobleck.from_pretrained("stabilityai/stable-audio-open-1.0", subfolder='vae')

        paths = snapshot_download(repo_id=name)
        weights = load_file("{}/tangoflux.safetensors".format(paths))

        with open('{}/config.json'.format(paths), 'r') as f:
            config = json.load(f)
        self.model = TangoFlux(config)
        self.model.load_state_dict(weights, strict=False)
        # _IncompatibleKeys(missing_keys=['text_encoder.encoder.embed_tokens.weight'], unexpected_keys=[]) -- this behaviour is expected
        self.vae.to(device)
        self.model.to(device)

    def generate(self, prompt, steps=25, duration=10, guidance_scale=4.5):

        with torch.no_grad():
            latents = self.model.inference_flow(prompt,
                                                duration=duration,
                                                num_inference_steps=steps,
                                                guidance_scale=guidance_scale)

            wave = self.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        return wave
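The wrapper above is what app.py consumes below. A minimal usage sketch, assuming the declare-lab/TangoFlux checkpoint and its bundled model module are available, and assuming the Oobleck VAE decodes at 44.1 kHz; the output path and sample rate are illustrative, not part of this commit:

import torchaudio
from TangoFlux import TangoFluxInference

# Build the inference wrapper; this downloads the TangoFlux weights and the stable-audio-open VAE.
tangoflux = TangoFluxInference(name='declare-lab/TangoFlux', device="cuda")

# generate() returns a (channels, samples) waveform tensor for a text prompt.
wave = tangoflux.generate("Hammer slowly hitting the wooden table", steps=25, duration=10)

# Assumed 44.1 kHz sample rate of the stable-audio-open VAE; adjust if the model config says otherwise.
torchaudio.save("output.wav", wave, 44100)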
app.py
CHANGED
@@ -13,245 +13,23 @@ from gradio import Markdown
 
 import torch
 #from diffusers.models.autoencoder_kl import AutoencoderKL
-from diffusers.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import DiffusionPipeline,AudioPipelineOutput
 from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
 from typing import Union
 from diffusers.utils.torch_utils import randn_tensor
 from tqdm import tqdm
+from TangoFlux import TangoFluxInference
 
 
 
-class Tango2Pipeline(DiffusionPipeline):
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: T5EncoderModel,
-        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
-        unet: UNet2DConditionModel,
-        scheduler: DDPMScheduler
-    ):
-
-        super().__init__()
-
-        self.register_modules(vae=vae,
-                              text_encoder=text_encoder,
-                              tokenizer=tokenizer,
-                              unet=unet,
-                              scheduler=scheduler
-                              )
-
-    def _encode_prompt(self, prompt):
-        device = self.text_encoder.device
-
-        batch = self.tokenizer(
-            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        )
-        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
-
-        encoder_hidden_states = self.text_encoder(
-            input_ids=input_ids, attention_mask=attention_mask
-        )[0]
-
-        boolean_encoder_mask = (attention_mask == 1).to(device)
-
-        return encoder_hidden_states, boolean_encoder_mask
-
-    def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
-        device = self.text_encoder.device
-        batch = self.tokenizer(
-            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        )
-        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
-
-        with torch.no_grad():
-            prompt_embeds = self.text_encoder(
-                input_ids=input_ids, attention_mask=attention_mask
-            )[0]
-
-        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        # get unconditional embeddings for classifier free guidance
-        uncond_tokens = [""] * len(prompt)
-
-        max_length = prompt_embeds.shape[1]
-        uncond_batch = self.tokenizer(
-            uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
-        )
-        uncond_input_ids = uncond_batch.input_ids.to(device)
-        uncond_attention_mask = uncond_batch.attention_mask.to(device)
-
-        with torch.no_grad():
-            negative_prompt_embeds = self.text_encoder(
-                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
-            )[0]
-
-        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-        uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        # For classifier free guidance, we need to do two forward passes.
-        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
-        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
-        boolean_prompt_mask = (prompt_mask == 1).to(device)
-
-        return prompt_embeds, boolean_prompt_mask
-
-    def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
-        shape = (batch_size, num_channels_latents, 256, 16)
-        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * inference_scheduler.init_noise_sigma
-        return latents
-
-    @torch.no_grad()
-    def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
-                  disable_progress=True):
-        device = self.text_encoder.device
-        classifier_free_guidance = guidance_scale > 1.0
-        batch_size = len(prompt) * num_samples_per_prompt
-
-        if classifier_free_guidance:
-            prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
-        else:
-            prompt_embeds, boolean_prompt_mask = self._encode_text(prompt)
-            prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-            boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        inference_scheduler.set_timesteps(num_steps, device=device)
-        timesteps = inference_scheduler.timesteps
-
-        num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
-
-        num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
-        progress_bar = tqdm(range(num_steps), disable=disable_progress)
-
-        for i, t in enumerate(timesteps):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
-            latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
-
-            noise_pred = self.unet(
-                latent_model_input, t, encoder_hidden_states=prompt_embeds,
-                encoder_attention_mask=boolean_prompt_mask
-            ).sample
-
-            # perform guidance
-            if classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
-
-            # call the callback, if provided
-            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
-                progress_bar.update(1)
-
-        return latents
-
-    @torch.no_grad()
-    def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
-        """ Generate audio for a single prompt string. """
-        with torch.no_grad():
-            latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-            mel = self.vae.decode_first_stage(latents)
-            wave = self.vae.decode_to_waveform(mel)
-
-        return AudioPipelineOutput(audios=wave)
-
-
-# Automatic device detection
-if torch.cuda.is_available():
-    device_type = "cuda"
-    device_selection = "cuda:0"
-else:
-    device_type = "cpu"
-    device_selection = "cpu"
-
-class Tango:
-    def __init__(self, name="declare-lab/tango2", device=device_selection):
-
-        path = snapshot_download(repo_id=name)
-
-        vae_config = json.load(open("{}/vae_config.json".format(path)))
-        stft_config = json.load(open("{}/stft_config.json".format(path)))
-        main_config = json.load(open("{}/main_config.json".format(path)))
-
-        self.vae = AutoencoderKL(**vae_config).to(device)
-        self.stft = TacotronSTFT(**stft_config).to(device)
-        self.model = AudioDiffusion(**main_config).to(device)
-
-        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
-        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
-        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)
-
-        self.vae.load_state_dict(vae_weights)
-        self.stft.load_state_dict(stft_weights)
-        self.model.load_state_dict(main_weights)
-
-        print("Successfully loaded checkpoint from:", name)
-
-        self.vae.eval()
-        self.stft.eval()
-        self.model.eval()
-
-        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")
-
-    def chunks(self, lst, n):
-        """ Yield successive n-sized chunks from a list. """
-        for i in range(0, len(lst), n):
-            yield lst[i:i + n]
-
-    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
-        """ Generate audio for a single prompt string. """
-        with torch.no_grad():
-            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-            mel = self.vae.decode_first_stage(latents)
-            wave = self.vae.decode_to_waveform(mel)
-        return wave[0]
-
-    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
-        """ Generate audio for a list of prompt strings. """
-        outputs = []
-        for k in tqdm(range(0, len(prompts), batch_size)):
-            batch = prompts[k: k+batch_size]
-            with torch.no_grad():
-                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-                mel = self.vae.decode_first_stage(latents)
-                wave = self.vae.decode_to_waveform(mel)
-                outputs += [item for item in wave]
-        if samples == 1:
-            return outputs
-        else:
-            return list(self.chunks(outputs, samples))
-
-# Initialize TANGO
-
-tango = Tango(device="cpu")
-tango.vae.to(device_type)
-tango.stft.to(device_type)
-tango.model.to(device_type)
-
-pipe = Tango2Pipeline(vae=tango.vae,
-                      text_encoder=tango.model.text_encoder,
-                      tokenizer=tango.model.tokenizer,
-                      unet=tango.model.unet,
-                      scheduler=tango.scheduler
-                      )
-
-
-@spaces.GPU(duration=60)
-def gradio_generate(prompt, output_format, steps, guidance):
+tangoflux = TangoFluxInference(path="declare-lab/TangoFlux")
+
+
+@spaces.GPU(duration=15)
+def gradio_generate(prompt, output_format, steps, guidance, duration=10):
+
+    output_wave = tangoflux.generate(prompt, steps=steps, guidance=guidance, duration=duration)
     output_wave = pipe(prompt,steps,guidance) ## Using the pipeline automatically uses flash attention for torch 2.0 and above
     #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
@@ -265,25 +43,6 @@ def gradio_generate(prompt, output_format, steps, guidance):
 
     return output_filename
 
-# description_text = """
-# <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
-# Generate audio using TANGO by providing a text prompt.
-# <br/><br/>Limitations: TANGO is trained on the small AudioCaps dataset so it may not generate good audio \
-# samples related to concepts that it has not seen in training (e.g. singing). For the same reason, TANGO \
-# is not always able to finely control its generations over textual control prompts. For example, \
-# the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes \
-# on a metal table are very similar. \
-# <br/><br/>We are currently training another version of TANGO on larger datasets to enhance its generalization, \
-# compositional and controllable generation ability.
-# <br/><br/>We recommend using a guidance scale of 3. The default number of steps is set to 100. More steps generally lead to better quality of generated audios but will take longer.
-# <br/><br/>
-# <h1> ChatGPT-enhanced audio generation</h1>
-# <br/>
-# As TANGO consists of an instruction-tuned LLM, it is able to process complex sound descriptions allowing us to provide more detailed instructions to improve the generation quality.
-# For example, ``A boat is moving on the sea'' vs ``The sound of the water lapping against the hull of the boat or splashing as you move through the waves''. The latter is obtained by prompting ChatGPT to explain the sound generated when a boat moves on the sea.
-# Using this ChatGPT-generated description of the sound, TANGO provides superior results.
-# <p/>
-# """
 description_text = """
 <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
@@ -294,15 +53,16 @@ Generate audio using Tango2 by providing a text prompt. Tango2 was built from Ta
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_format = gr.Radio(label="Output format", info="The file you can download", choices=["mp3", "wav"], value="wav")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
-denoising_steps = gr.Slider(minimum=
+denoising_steps = gr.Slider(minimum=10, maximum=100, value=25, step=1, label="Steps", interactive=True)
 guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
+duration_scale = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
 
 # Gradio interface
 gr_interface = gr.Interface(
     fn=gradio_generate,
-    inputs=[input_text, output_format, denoising_steps, guidance_scale],
+    inputs=[input_text, output_format, denoising_steps, guidance_scale, duration_scale],
     outputs=[output_audio],
-    title="
+    title="TangoFlux: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
     description=description_text,
     allow_flagging=False,
     examples=[
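The last hunk above is cut off before the body of the new gradio_generate finishes, so the file-writing step is not visible here. A hedged sketch of how the waveform returned by TangoFluxInference.generate could be turned into the file path that the gr.Audio output expects, assuming a 44.1 kHz output rate and using pydub for the optional mp3 conversion; the helper name, paths, and sample rate are illustrative, not code from this commit:

import torchaudio
from pydub import AudioSegment

def wave_to_file(wave, output_format="wav", sample_rate=44100):
    # Write the (channels, samples) tensor to a wav file, then optionally convert it to mp3.
    output_filename = "temp.wav"
    torchaudio.save(output_filename, wave, sample_rate)
    if output_format == "mp3":
        AudioSegment.from_wav(output_filename).export("temp.mp3", format="mp3")
        output_filename = "temp.mp3"
    return output_filename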
audioldm/__init__.py
DELETED
@@ -1,8 +0,0 @@
from .ldm import LatentDiffusion
from .utils import seed_everything, save_wave, get_time, get_duration
from .pipeline import *
audioldm/__main__.py
DELETED
@@ -1,183 +0,0 @@
#!/usr/bin/python3
import os
from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
import argparse

CACHE_DIR = os.getenv(
    "AUDIOLDM_CACHE_DIR",
    os.path.join(os.path.expanduser("~"), ".cache/audioldm"))

parser = argparse.ArgumentParser()

parser.add_argument("--mode", type=str, required=False, default="generation",
                    help="generation: text-to-audio generation; transfer: style transfer",
                    choices=["generation", "transfer"])
parser.add_argument("-t", "--text", type=str, required=False, default="",
                    help="Text prompt to the model for audio generation")
parser.add_argument("-f", "--file_path", type=str, required=False, default=None,
                    help="(--mode transfer): Original audio file for style transfer; Or (--mode generation): the guidance audio file for generating similar audio")
parser.add_argument("--transfer_strength", type=float, required=False, default=0.5,
                    help="A value between 0 and 1. 0 means original audio without transfer, 1 means completely transfer to the audio indicated by text")
parser.add_argument("-s", "--save_path", type=str, required=False, default="./output",
                    help="The path to save model output")
parser.add_argument("--model_name", type=str, required=False, default="audioldm-s-full",
                    help="The checkpoint you are going to use",
                    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"])
parser.add_argument("-ckpt", "--ckpt_path", type=str, required=False, default=None,
                    help="The path to the pretrained .ckpt model")
parser.add_argument("-b", "--batchsize", type=int, required=False, default=1,
                    help="Generate how many samples at the same time")
parser.add_argument("--ddim_steps", type=int, required=False, default=200,
                    help="The sampling step for DDIM")
parser.add_argument("-gs", "--guidance_scale", type=float, required=False, default=2.5,
                    help="Guidance scale (Large => better quality and relevancy to text; Small => better diversity)")
parser.add_argument("-dur", "--duration", type=float, required=False, default=10.0,
                    help="The duration of the samples")
parser.add_argument("-n", "--n_candidate_gen_per_text", type=int, required=False, default=3,
                    help="Automatic quality control. This number controls the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually leads to better quality with heavier computation")
parser.add_argument("--seed", type=int, required=False, default=42,
                    help="Changing this value (any integer number) will lead to a different generation result.")

args = parser.parse_args()

if(args.ckpt_path is not None):
    print("Warning: ckpt_path has no effect after version 0.0.20.")

assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"

mode = args.mode
if(mode == "generation" and args.file_path is not None):
    mode = "generation_audio_to_audio"
    if(len(args.text) > 0):
        print("Warning: You have specified the --file_path. --text will be ignored")
        args.text = ""

save_path = os.path.join(args.save_path, mode)

if(args.file_path is not None):
    save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))

text = args.text
random_seed = args.seed
duration = args.duration
guidance_scale = args.guidance_scale
n_candidate_gen_per_text = args.n_candidate_gen_per_text

os.makedirs(save_path, exist_ok=True)
audioldm = build_model(model_name=args.model_name)

if(args.mode == "generation"):
    waveform = text_to_audio(
        audioldm,
        text,
        args.file_path,
        random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        ddim_steps=args.ddim_steps,
        n_candidate_gen_per_text=n_candidate_gen_per_text,
        batchsize=args.batchsize,
    )

elif(args.mode == "transfer"):
    assert args.file_path is not None
    assert os.path.exists(args.file_path), "The original audio file '%s' for style transfer does not exist." % args.file_path
    waveform = style_transfer(
        audioldm,
        text,
        args.file_path,
        args.transfer_strength,
        random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        ddim_steps=args.ddim_steps,
        batchsize=args.batchsize,
    )
    waveform = waveform[:,None,:]

save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
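For reference, a hedged sketch of the programmatic equivalent of the command-line entry point removed above, using the same audioldm helpers the script imports; the prompt and argument values are illustrative defaults taken from the argument definitions:

from audioldm import build_model, text_to_audio, save_wave, get_time

# Roughly what `python -m audioldm --mode generation -t "..."` did before this commit.
audioldm = build_model(model_name="audioldm-s-full")
waveform = text_to_audio(
    audioldm,
    "A hammer is hitting a wooden surface",  # illustrative prompt
    None,                                    # no guidance audio file
    42,                                      # random seed
    duration=10.0,                           # must be a multiple of 2.5
    guidance_scale=2.5,
    ddim_steps=200,
    n_candidate_gen_per_text=3,
    batchsize=1,
)
save_wave(waveform, "./output/generation", name="%s_%s" % (get_time(), "hammer"))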
audioldm/__pycache__/__init__.cpython-310.pyc · DELETED · Binary file (315 Bytes)
audioldm/__pycache__/__init__.cpython-39.pyc · DELETED · Binary file (322 Bytes)
audioldm/__pycache__/ldm.cpython-310.pyc · DELETED · Binary file (16.1 kB)
audioldm/__pycache__/ldm.cpython-39.pyc · DELETED · Binary file (16 kB)
audioldm/__pycache__/pipeline.cpython-310.pyc · DELETED · Binary file (6.63 kB)
audioldm/__pycache__/pipeline.cpython-39.pyc · DELETED · Binary file (6.54 kB)
audioldm/__pycache__/utils.cpython-310.pyc · DELETED · Binary file (8.01 kB)
audioldm/__pycache__/utils.cpython-39.pyc · DELETED · Binary file (7.35 kB)
audioldm/audio/__init__.py
DELETED
@@ -1,2 +0,0 @@
from .tools import wav_to_fbank, read_wav_file
from .stft import TacotronSTFT
audioldm/audio/__pycache__/__init__.cpython-310.pyc · DELETED · Binary file (253 Bytes)
audioldm/audio/__pycache__/__init__.cpython-39.pyc · DELETED · Binary file (260 Bytes)
audioldm/audio/__pycache__/audio_processing.cpython-310.pyc · DELETED · Binary file (2.78 kB)
audioldm/audio/__pycache__/audio_processing.cpython-39.pyc · DELETED · Binary file (2.78 kB)
audioldm/audio/__pycache__/mix.cpython-39.pyc · DELETED · Binary file (1.7 kB)
audioldm/audio/__pycache__/stft.cpython-310.pyc · DELETED · Binary file (4.98 kB)
audioldm/audio/__pycache__/stft.cpython-39.pyc · DELETED · Binary file (4.99 kB)
audioldm/audio/__pycache__/tools.cpython-310.pyc · DELETED · Binary file (2.18 kB)
audioldm/audio/__pycache__/tools.cpython-39.pyc · DELETED · Binary file (2.19 kB)
audioldm/audio/__pycache__/torch_tools.cpython-39.pyc · DELETED · Binary file (3.79 kB)
audioldm/audio/audio_processing.py
DELETED
@@ -1,100 +0,0 @@
import torch
import numpy as np
import librosa.util as librosa_util
from scipy.signal import get_window


def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time Fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return normalize_fun(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
audioldm/audio/stft.py
DELETED
@@ -1,186 +0,0 @@
import torch
import torch.nn.functional as F
import numpy as np
from scipy.signal import get_window
from librosa.util import pad_center, tiny
from librosa.filters import mel as librosa_mel_fn

from audioldm.audio.audio_processing import (
    dynamic_range_compression,
    dynamic_range_decompression,
    window_sumsquare,
)


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""

    def __init__(self, filter_length, hop_length, win_length, window="hann"):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack(
            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
        )

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert filter_length >= win_length
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer("forward_basis", forward_basis.float())
        self.register_buffer("inverse_basis", inverse_basis.float())

    def transform(self, input_data):
        device = self.forward_basis.device
        input_data = input_data.to(device)

        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode="reflect",
        )
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            torch.autograd.Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )  # .cpu()

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        device = self.forward_basis.device
        magnitude, phase = magnitude.to(device), phase.to(device)

        recombine_magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            torch.autograd.Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window,
                magnitude.size(-1),
                hop_length=self.hop_length,
                win_length=self.win_length,
                n_fft=self.filter_length,
                dtype=np.float32,
            )
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(torch.from_numpy(window_sum), requires_grad=False)
            window_sum = window_sum
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction


class TacotronSTFT(torch.nn.Module):
    def __init__(
        self,
        filter_length,
        hop_length,
        win_length,
        n_mel_channels,
        sampling_rate,
        mel_fmin,
        mel_fmax,
    ):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)

    def spectral_normalize(self, magnitudes, normalize_fun):
        output = dynamic_range_compression(magnitudes, normalize_fun)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, normalize_fun=torch.log):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert torch.min(y.data) >= -1, torch.min(y.data)
        assert torch.max(y.data) <= 1, torch.max(y.data)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output, normalize_fun)
        energy = torch.norm(magnitudes, dim=1)

        log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)

        return mel_output, log_magnitudes, energy
audioldm/audio/tools.py
DELETED
@@ -1,85 +0,0 @@
import torch
import numpy as np
import torchaudio


def get_mel_from_wav(audio, _stft):
    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    audio = torch.autograd.Variable(audio, requires_grad=False)
    melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
    melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
    log_magnitudes_stft = torch.squeeze(log_magnitudes_stft, 0).numpy().astype(np.float32)
    energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
    return melspec, log_magnitudes_stft, energy


def _pad_spec(fbank, target_length=1024):
    n_frames = fbank.shape[0]
    p = target_length - n_frames
    # cut and pad
    if p > 0:
        m = torch.nn.ZeroPad2d((0, 0, 0, p))
        fbank = m(fbank)
    elif p < 0:
        fbank = fbank[0:target_length, :]

    if fbank.size(-1) % 2 != 0:
        fbank = fbank[..., :-1]

    return fbank


def pad_wav(waveform, segment_length):
    waveform_length = waveform.shape[-1]
    assert waveform_length > 100, "Waveform is too short, %s" % waveform_length
    if segment_length is None or waveform_length == segment_length:
        return waveform
    elif waveform_length > segment_length:
        return waveform[:segment_length]
    elif waveform_length < segment_length:
        temp_wav = np.zeros((1, segment_length))
        temp_wav[:, :waveform_length] = waveform
        return temp_wav


def normalize_wav(waveform):
    waveform = waveform - np.mean(waveform)
    waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
    return waveform * 0.5


def read_wav_file(filename, segment_length):
    # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower
    waveform, sr = torchaudio.load(filename)  # Faster!!!
    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
    waveform = waveform.numpy()[0, ...]
    waveform = normalize_wav(waveform)
    waveform = waveform[None, ...]
    waveform = pad_wav(waveform, segment_length)

    waveform = waveform / np.max(np.abs(waveform))
    waveform = 0.5 * waveform

    return waveform


def wav_to_fbank(filename, target_length=1024, fn_STFT=None):
    assert fn_STFT is not None

    # mixup
    waveform = read_wav_file(filename, target_length * 160)  # hop size is 160

    waveform = waveform[0, ...]
    waveform = torch.FloatTensor(waveform)

    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)

    fbank = torch.FloatTensor(fbank.T)
    log_magnitudes_stft = torch.FloatTensor(log_magnitudes_stft.T)

    fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(log_magnitudes_stft, target_length)

    return fbank, log_magnitudes_stft, waveform
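A hedged sketch of how wav_to_fbank above was typically driven, pairing it with the TacotronSTFT from stft.py; the STFT settings mirror the 16 kHz / 64-mel configuration in hifigan/utilities.py further below, and the file name is illustrative:

from audioldm.audio import TacotronSTFT, wav_to_fbank

# 16 kHz analysis settings (n_fft 1024, hop 160, win 1024, 64 mel bands), matching HIFIGAN_16K_64.
fn_STFT = TacotronSTFT(1024, 160, 1024, 64, 16000, 0, 8000)

# Returns a padded mel filterbank, log magnitudes, and the loaded waveform.
fbank, log_magnitudes, waveform = wav_to_fbank("example.wav", target_length=1024, fn_STFT=fn_STFT)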
audioldm/hifigan/__init__.py
DELETED
@@ -1,7 +0,0 @@
from .models import Generator


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
audioldm/hifigan/__pycache__/__init__.cpython-310.pyc · DELETED · Binary file (569 Bytes)
audioldm/hifigan/__pycache__/__init__.cpython-39.pyc · DELETED · Binary file (574 Bytes)
audioldm/hifigan/__pycache__/models.cpython-310.pyc · DELETED · Binary file (3.73 kB)
audioldm/hifigan/__pycache__/models.cpython-39.pyc · DELETED · Binary file (3.73 kB)
audioldm/hifigan/__pycache__/utilities.cpython-310.pyc · DELETED · Binary file (2.48 kB)
audioldm/hifigan/__pycache__/utilities.cpython-39.pyc · DELETED · Binary file (2.37 kB)
audioldm/hifigan/models.py
DELETED
@@ -1,174 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import weight_norm, remove_weight_norm

LRELU_SLOPE = 0.1


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


class ResBlock(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList(
            [
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                                   padding=get_padding(kernel_size, dilation[0]))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                                   padding=get_padding(kernel_size, dilation[1]))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                                   padding=get_padding(kernel_size, dilation[2]))),
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                                   padding=get_padding(kernel_size, 1))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                                   padding=get_padding(kernel_size, 1))),
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                                   padding=get_padding(kernel_size, 1))),
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2**i),
                                            h.upsample_initial_channel // (2 ** (i + 1)),
                                            k, u, padding=(k - u) // 2))
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        # print("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
audioldm/hifigan/utilities.py
DELETED
@@ -1,86 +0,0 @@
import os
import json

import torch
import numpy as np

import audioldm.hifigan as hifigan

HIFIGAN_16K_64 = {
    "resblock": "1",
    "num_gpus": 6,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,
    "upsample_rates": [5, 4, 2, 2, 2],
    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
    "upsample_initial_channel": 1024,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "segment_size": 8192,
    "num_mels": 64,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 160,
    "win_size": 1024,
    "sampling_rate": 16000,
    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": None,
    "num_workers": 4,
    "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1},
}


def get_available_checkpoint_keys(model, ckpt):
    print("==> Attempt to reload from %s" % ckpt)
    state_dict = torch.load(ckpt)["state_dict"]
    current_state_dict = model.state_dict()
    new_state_dict = {}
    for k in state_dict.keys():
        if (
            k in current_state_dict.keys()
            and current_state_dict[k].size() == state_dict[k].size()
        ):
            new_state_dict[k] = state_dict[k]
        else:
            print("==> WARNING: Skipping %s" % k)
    print(
        "%s out of %s keys are matched"
        % (len(new_state_dict.keys()), len(state_dict.keys()))
    )
    return new_state_dict


def get_param_num(model):
    num_param = sum(param.numel() for param in model.parameters())
    return num_param


def get_vocoder(config, device):
    config = hifigan.AttrDict(HIFIGAN_16K_64)
    vocoder = hifigan.Generator(config)
    vocoder.eval()
    vocoder.remove_weight_norm()
    vocoder.to(device)
    return vocoder


def vocoder_infer(mels, vocoder, lengths=None):
    vocoder.eval()
    with torch.no_grad():
        wavs = vocoder(mels).squeeze(1)

    wavs = (wavs.cpu().numpy() * 32768).astype("int16")

    if lengths is not None:
        wavs = wavs[:, :lengths]

    return wavs
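A hedged sketch of the vocoder path these utilities provided: build the fixed 16 kHz HiFi-GAN generator and decode a batch of mel spectrograms. The tensor shapes are illustrative, and without a loaded checkpoint the generator only produces untrained output:

import torch
from audioldm.hifigan.utilities import get_vocoder, vocoder_infer

# get_vocoder ignores its config argument and always builds the HIFIGAN_16K_64 generator above.
vocoder = get_vocoder(None, "cpu")

# One 64-band mel spectrogram, 100 frames long (shape B x num_mels x T, illustrative).
mels = torch.randn(1, 64, 100)
wavs = vocoder_infer(mels, vocoder)  # int16 numpy waveforms at 16 kHz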
audioldm/latent_diffusion/__init__.py · DELETED · File without changes
audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc · DELETED · Binary file (157 Bytes)
audioldm/latent_diffusion/__pycache__/__init__.cpython-39.pyc · DELETED · Binary file (164 Bytes)
audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc · DELETED · Binary file (11.4 kB)
audioldm/latent_diffusion/__pycache__/attention.cpython-39.pyc · DELETED · Binary file (11.4 kB)
audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc · DELETED · Binary file (7.2 kB)
audioldm/latent_diffusion/__pycache__/ddim.cpython-39.pyc · DELETED · Binary file (7.11 kB)
audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc · DELETED · Binary file (11.1 kB)
audioldm/latent_diffusion/__pycache__/ddpm.cpython-39.pyc · DELETED · Binary file (11 kB)
audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc · DELETED · Binary file (3.01 kB)
audioldm/latent_diffusion/__pycache__/ema.cpython-39.pyc · DELETED · Binary file (3 kB)
audioldm/latent_diffusion/__pycache__/openaimodel.cpython-39.pyc · DELETED · Binary file (23.7 kB)
audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc · DELETED · Binary file (9.53 kB)
audioldm/latent_diffusion/__pycache__/util.cpython-39.pyc · DELETED · Binary file (9.6 kB)