hungchiayu1 committed
Commit 838c300 · 1 Parent(s): 348f0d7

update to tangoflux

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +2 -3
  2. TangoFlux.py +57 -0
  3. app.py +9 -249
  4. audioldm/__init__.py +0 -8
  5. audioldm/__main__.py +0 -183
  6. audioldm/__pycache__/__init__.cpython-310.pyc +0 -0
  7. audioldm/__pycache__/__init__.cpython-39.pyc +0 -0
  8. audioldm/__pycache__/ldm.cpython-310.pyc +0 -0
  9. audioldm/__pycache__/ldm.cpython-39.pyc +0 -0
  10. audioldm/__pycache__/pipeline.cpython-310.pyc +0 -0
  11. audioldm/__pycache__/pipeline.cpython-39.pyc +0 -0
  12. audioldm/__pycache__/utils.cpython-310.pyc +0 -0
  13. audioldm/__pycache__/utils.cpython-39.pyc +0 -0
  14. audioldm/audio/__init__.py +0 -2
  15. audioldm/audio/__pycache__/__init__.cpython-310.pyc +0 -0
  16. audioldm/audio/__pycache__/__init__.cpython-39.pyc +0 -0
  17. audioldm/audio/__pycache__/audio_processing.cpython-310.pyc +0 -0
  18. audioldm/audio/__pycache__/audio_processing.cpython-39.pyc +0 -0
  19. audioldm/audio/__pycache__/mix.cpython-39.pyc +0 -0
  20. audioldm/audio/__pycache__/stft.cpython-310.pyc +0 -0
  21. audioldm/audio/__pycache__/stft.cpython-39.pyc +0 -0
  22. audioldm/audio/__pycache__/tools.cpython-310.pyc +0 -0
  23. audioldm/audio/__pycache__/tools.cpython-39.pyc +0 -0
  24. audioldm/audio/__pycache__/torch_tools.cpython-39.pyc +0 -0
  25. audioldm/audio/audio_processing.py +0 -100
  26. audioldm/audio/stft.py +0 -186
  27. audioldm/audio/tools.py +0 -85
  28. audioldm/hifigan/__init__.py +0 -7
  29. audioldm/hifigan/__pycache__/__init__.cpython-310.pyc +0 -0
  30. audioldm/hifigan/__pycache__/__init__.cpython-39.pyc +0 -0
  31. audioldm/hifigan/__pycache__/models.cpython-310.pyc +0 -0
  32. audioldm/hifigan/__pycache__/models.cpython-39.pyc +0 -0
  33. audioldm/hifigan/__pycache__/utilities.cpython-310.pyc +0 -0
  34. audioldm/hifigan/__pycache__/utilities.cpython-39.pyc +0 -0
  35. audioldm/hifigan/models.py +0 -174
  36. audioldm/hifigan/utilities.py +0 -86
  37. audioldm/latent_diffusion/__init__.py +0 -0
  38. audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
  39. audioldm/latent_diffusion/__pycache__/__init__.cpython-39.pyc +0 -0
  40. audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc +0 -0
  41. audioldm/latent_diffusion/__pycache__/attention.cpython-39.pyc +0 -0
  42. audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc +0 -0
  43. audioldm/latent_diffusion/__pycache__/ddim.cpython-39.pyc +0 -0
  44. audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
  45. audioldm/latent_diffusion/__pycache__/ddpm.cpython-39.pyc +0 -0
  46. audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc +0 -0
  47. audioldm/latent_diffusion/__pycache__/ema.cpython-39.pyc +0 -0
  48. audioldm/latent_diffusion/__pycache__/openaimodel.cpython-39.pyc +0 -0
  49. audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc +0 -0
  50. audioldm/latent_diffusion/__pycache__/util.cpython-39.pyc +0 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Tango2
+title: TangoFlux
 emoji: 🐠
 colorFrom: indigo
 colorTo: pink
@@ -7,7 +7,6 @@ sdk: gradio
 sdk_version: 4.26.0
 app_file: app.py
 pinned: false
-short_description: Fast Text to Audio Generator
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TangoFlux.py ADDED
@@ -0,0 +1,57 @@
+from diffusers import AutoencoderOobleck
+import torch
+from transformers import T5EncoderModel,T5TokenizerFast
+from diffusers import FluxTransformer2DModel
+from torch import nn
+from typing import List
+from diffusers import FlowMatchEulerDiscreteScheduler
+from diffusers.training_utils import compute_density_for_timestep_sampling
+import copy
+import torch.nn.functional as F
+import numpy as np
+from model import TangoFlux
+from huggingface_hub import snapshot_download
+from tqdm import tqdm
+from typing import Optional,Union,List
+from datasets import load_dataset, Audio
+from math import pi
+import json
+import inspect
+import yaml
+from safetensors.torch import load_file
+
+
+class TangoFluxInference:
+
+    def __init__(self,name='declare-lab/TangoFlux',device="cuda"):
+
+
+        self.vae = AutoencoderOobleck.from_pretrained("stabilityai/stable-audio-open-1.0",subfolder='vae')
+
+        paths = snapshot_download(repo_id=name)
+        weights = load_file("{}/tangoflux.safetensors".format(paths))
+
+        with open('{}/config.json'.format(paths),'r') as f:
+            config = json.load(f)
+        self.model = TangoFlux(config)
+        self.model.load_state_dict(weights,strict=False)
+        # _IncompatibleKeys(missing_keys=['text_encoder.encoder.embed_tokens.weight'], unexpected_keys=[]) this behaviour is expected
+        self.vae.to(device)
+        self.model.to(device)
+
+    def generate(self,prompt,steps=25,duration=10,guidance_scale=4.5):
+
+        with torch.no_grad():
+            latents = self.model.inference_flow(prompt,
+                                                duration=duration,
+                                                num_inference_steps=steps,
+                                                guidance_scale=guidance_scale)
+
+
+
+            wave = self.vae.decode(latents.transpose(2,1)).sample.cpu()[0]
+        return wave
+
+
+
+
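For orientation, a minimal, hypothetical usage sketch of the TangoFluxInference wrapper added above. It assumes the bundled `model` module (providing `TangoFlux`) is importable, a CUDA device is available, and that the decoded waveform is stereo at 44.1 kHz (the Stable Audio Open VAE's usual rate; not stated in this diff). The prompt text and output filename are placeholders.

# Hypothetical usage of the wrapper defined in TangoFlux.py above.
import torchaudio
from TangoFlux import TangoFluxInference

tangoflux = TangoFluxInference(name="declare-lab/TangoFlux", device="cuda")

# generate() returns a decoded waveform tensor of shape (channels, samples).
wave = tangoflux.generate("light rain with distant thunder",
                          steps=25, duration=10, guidance_scale=4.5)

# 44100 Hz is an assumption based on the Stable Audio Open VAE; the diff does not state the rate.
torchaudio.save("sample.wav", wave, sample_rate=44100)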
app.py CHANGED
@@ -13,245 +13,23 @@ from gradio import Markdown
 
 import torch
 #from diffusers.models.autoencoder_kl import AutoencoderKL
-from diffusers.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import DiffusionPipeline,AudioPipelineOutput
 from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
 from typing import Union
 from diffusers.utils.torch_utils import randn_tensor
 from tqdm import tqdm
+from TangoFlux import TangoFluxInference
 
 
 
+tangoflux = TangoFluxInference(path="declare-lab/TangoFlux")
 
 
-class Tango2Pipeline(DiffusionPipeline):
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: T5EncoderModel,
-        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
-        unet: UNet2DConditionModel,
-        scheduler: DDPMScheduler
-    ):
-
-        super().__init__()
-
-        self.register_modules(vae=vae,
-                              text_encoder=text_encoder,
-                              tokenizer=tokenizer,
-                              unet=unet,
-                              scheduler=scheduler
-                              )
-
-    def _encode_prompt(self, prompt):
-        device = self.text_encoder.device
-
-        batch = self.tokenizer(
-            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        )
-        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
-
-        encoder_hidden_states = self.text_encoder(
-            input_ids=input_ids, attention_mask=attention_mask
-        )[0]
-
-        boolean_encoder_mask = (attention_mask == 1).to(device)
-
-        return encoder_hidden_states, boolean_encoder_mask
-
-    def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
-        device = self.text_encoder.device
-        batch = self.tokenizer(
-            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        )
-        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
-
-        with torch.no_grad():
-            prompt_embeds = self.text_encoder(
-                input_ids=input_ids, attention_mask=attention_mask
-            )[0]
-
-        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        # get unconditional embeddings for classifier free guidance
-        uncond_tokens = [""] * len(prompt)
-
-        max_length = prompt_embeds.shape[1]
-        uncond_batch = self.tokenizer(
-            uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
-        )
-        uncond_input_ids = uncond_batch.input_ids.to(device)
-        uncond_attention_mask = uncond_batch.attention_mask.to(device)
-
-        with torch.no_grad():
-            negative_prompt_embeds = self.text_encoder(
-                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
-            )[0]
-
-        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-        uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        # For classifier free guidance, we need to do two forward passes.
-        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
-        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
-        boolean_prompt_mask = (prompt_mask == 1).to(device)
-
-        return prompt_embeds, boolean_prompt_mask
-
-    def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
-        shape = (batch_size, num_channels_latents, 256, 16)
-        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * inference_scheduler.init_noise_sigma
-        return latents
-
-    @torch.no_grad()
-    def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
-                  disable_progress=True):
-        device = self.text_encoder.device
-        classifier_free_guidance = guidance_scale > 1.0
-        batch_size = len(prompt) * num_samples_per_prompt
-
-        if classifier_free_guidance:
-            prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
-        else:
-            prompt_embeds, boolean_prompt_mask = self._encode_text(prompt)
-            prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
-            boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
-
-        inference_scheduler.set_timesteps(num_steps, device=device)
-        timesteps = inference_scheduler.timesteps
-
-        num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
-
-        num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
-        progress_bar = tqdm(range(num_steps), disable=disable_progress)
-
-        for i, t in enumerate(timesteps):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
-            latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
-
-            noise_pred = self.unet(
-                latent_model_input, t, encoder_hidden_states=prompt_embeds,
-                encoder_attention_mask=boolean_prompt_mask
-            ).sample
-
-            # perform guidance
-            if classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
-
-            # call the callback, if provided
-            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
-                progress_bar.update(1)
-
-        return latents
-
-    @torch.no_grad()
-    def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
-        """ Genrate audio for a single prompt string. """
-        with torch.no_grad():
-            latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-            mel = self.vae.decode_first_stage(latents)
-            wave = self.vae.decode_to_waveform(mel)
-
-        return AudioPipelineOutput(audios=wave)
-
-
-# Automatic device detection
-if torch.cuda.is_available():
-    device_type = "cuda"
-    device_selection = "cuda:0"
-else:
-    device_type = "cpu"
-    device_selection = "cpu"
-
-class Tango:
-    def __init__(self, name="declare-lab/tango2", device=device_selection):
-
-        path = snapshot_download(repo_id=name)
-
-        vae_config = json.load(open("{}/vae_config.json".format(path)))
-        stft_config = json.load(open("{}/stft_config.json".format(path)))
-        main_config = json.load(open("{}/main_config.json".format(path)))
-
-        self.vae = AutoencoderKL(**vae_config).to(device)
-        self.stft = TacotronSTFT(**stft_config).to(device)
-        self.model = AudioDiffusion(**main_config).to(device)
-
-        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
-        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
-        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)
-
-        self.vae.load_state_dict(vae_weights)
-        self.stft.load_state_dict(stft_weights)
-        self.model.load_state_dict(main_weights)
-
-        print ("Successfully loaded checkpoint from:", name)
-
-        self.vae.eval()
-        self.stft.eval()
-        self.model.eval()
-
-        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")
-
-    def chunks(self, lst, n):
-        """ Yield successive n-sized chunks from a list. """
-        for i in range(0, len(lst), n):
-            yield lst[i:i + n]
-
-    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
-        """ Genrate audio for a single prompt string. """
-        with torch.no_grad():
-            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-            mel = self.vae.decode_first_stage(latents)
-            wave = self.vae.decode_to_waveform(mel)
-        return wave[0]
-
-    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
-        """ Genrate audio for a list of prompt strings. """
-        outputs = []
-        for k in tqdm(range(0, len(prompts), batch_size)):
-            batch = prompts[k: k+batch_size]
-            with torch.no_grad():
-                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
-                mel = self.vae.decode_first_stage(latents)
-                wave = self.vae.decode_to_waveform(mel)
-                outputs += [item for item in wave]
-        if samples == 1:
-            return outputs
-        else:
-            return list(self.chunks(outputs, samples))
-
-# Initialize TANGO
-
-tango = Tango(device="cpu")
-tango.vae.to(device_type)
-tango.stft.to(device_type)
-tango.model.to(device_type)
-
-pipe = Tango2Pipeline(vae=tango.vae,
-                      text_encoder=tango.model.text_encoder,
-                      tokenizer=tango.model.tokenizer,
-                      unet=tango.model.unet,
-                      scheduler=tango.scheduler
-                      )
-
-
-@spaces.GPU(duration=60)
-def gradio_generate(prompt, output_format, steps, guidance):
+@spaces.GPU(duration=15)
+def gradio_generate(prompt, output_format, steps, guidance,duration=10):
+
+    output_wave = tangoflux.generate(prompt,steps=steps,guidance=guidance,duration=duration)
+
     output_wave = pipe(prompt,steps,guidance) ## Using pipeliine automatically uses flash attention for torch2.0 above
     #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
@@ -265,25 +43,6 @@ def gradio_generate(prompt, output_format, steps, guidance):
 
     return output_filename
 
-# description_text = """
-# <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
-# Generate audio using TANGO by providing a text prompt.
-# <br/><br/>Limitations: TANGO is trained on the small AudioCaps dataset so it may not generate good audio \
-# samples related to concepts that it has not seen in training (e.g. singing). For the same reason, TANGO \
-# is not always able to finely control its generations over textual control prompts. For example, \
-# the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes \
-# on a metal table are very similar. \
-# <br/><br/>We are currently training another version of TANGO on larger datasets to enhance its generalization, \
-# compositional and controllable generation ability.
-# <br/><br/>We recommend using a guidance scale of 3. The default number of steps is set to 100. More steps generally lead to better quality of generated audios but will take longer.
-# <br/><br/>
-# <h1> ChatGPT-enhanced audio generation</h1>
-# <br/>
-# As TANGO consists of an instruction-tuned LLM, it is able to process complex sound descriptions allowing us to provide more detailed instructions to improve the generation quality.
-# For example, ``A boat is moving on the sea'' vs ``The sound of the water lapping against the hull of the boat or splashing as you move through the waves''. The latter is obtained by prompting ChatGPT to explain the sound generated when a boat moves on the sea.
-# Using this ChatGPT-generated description of the sound, TANGO provides superior results.
-# <p/>
-# """
 description_text = """
 <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
@@ -294,15 +53,16 @@ Generate audio using Tango2 by providing a text prompt. Tango2 was built from Ta
 input_text = gr.Textbox(lines=2, label="Prompt")
 output_format = gr.Radio(label = "Output format", info = "The file you can dowload", choices = ["mp3", "wav"], value = "wav")
 output_audio = gr.Audio(label="Generated Audio", type="filepath")
-denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
+denoising_steps = gr.Slider(minimum=10, maximum=100, value=25, step=1, label="Steps", interactive=True)
 guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
+duration_scale = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
 
 # Gradio interface
 gr_interface = gr.Interface(
     fn=gradio_generate,
-    inputs=[input_text, output_format, denoising_steps, guidance_scale],
+    inputs=[input_text, output_format, denoising_steps, guidance_scale,duration_scale],
     outputs=[output_audio],
-    title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
+    title="TangoFlux: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
     description=description_text,
     allow_flagging=False,
     examples=[
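The diff above wires the new sliders into the Gradio interface, but the truncated hunk does not show how `output_wave` becomes the filepath returned to `gr.Audio(type="filepath")`. Below is a minimal, hypothetical sketch of that glue. The function name, the use of soundfile and ffmpeg, the temp filenames, and the 44.1 kHz rate are all assumptions; only `output_wave`, `output_format`, and `output_filename` come from the diff.

# Hypothetical glue (not shown in this commit): turning the generated waveform into the
# filepath that gr.Audio(type="filepath") expects.
import os
import soundfile as sf

def waveform_to_file(output_wave, output_format="wav", sample_rate=44100):
    # The VAE returns (channels, samples); soundfile expects (samples, channels).
    output_filename = "temp.wav"
    sf.write(output_filename, output_wave.numpy().T, samplerate=sample_rate)
    if output_format == "mp3":
        # One common way to convert; the actual Space may do this differently.
        os.system("ffmpeg -y -i temp.wav temp.mp3")
        output_filename = "temp.mp3"
    return output_filename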
audioldm/__init__.py DELETED
@@ -1,8 +0,0 @@
-from .ldm import LatentDiffusion
-from .utils import seed_everything, save_wave, get_time, get_duration
-from .pipeline import *
-
-
-
-
-
audioldm/__main__.py DELETED
@@ -1,183 +0,0 @@
-#!/usr/bin/python3
-import os
-from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
-import argparse
-
-CACHE_DIR = os.getenv(
-    "AUDIOLDM_CACHE_DIR",
-    os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument(
-    "--mode",
-    type=str,
-    required=False,
-    default="generation",
-    help="generation: text-to-audio generation; transfer: style transfer",
-    choices=["generation", "transfer"]
-)
-
-parser.add_argument(
-    "-t",
-    "--text",
-    type=str,
-    required=False,
-    default="",
-    help="Text prompt to the model for audio generation",
-)
-
-parser.add_argument(
-    "-f",
-    "--file_path",
-    type=str,
-    required=False,
-    default=None,
-    help="(--mode transfer): Original audio file for style transfer; Or (--mode generation): the guidance audio file for generating simialr audio",
-)
-
-parser.add_argument(
-    "--transfer_strength",
-    type=float,
-    required=False,
-    default=0.5,
-    help="A value between 0 and 1. 0 means original audio without transfer, 1 means completely transfer to the audio indicated by text",
-)
-
-parser.add_argument(
-    "-s",
-    "--save_path",
-    type=str,
-    required=False,
-    help="The path to save model output",
-    default="./output",
-)
-
-parser.add_argument(
-    "--model_name",
-    type=str,
-    required=False,
-    help="The checkpoint you gonna use",
-    default="audioldm-s-full",
-    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
-)
-
-parser.add_argument(
-    "-ckpt",
-    "--ckpt_path",
-    type=str,
-    required=False,
-    help="The path to the pretrained .ckpt model",
-    default=None,
-)
-
-parser.add_argument(
-    "-b",
-    "--batchsize",
-    type=int,
-    required=False,
-    default=1,
-    help="Generate how many samples at the same time",
-)
-
-parser.add_argument(
-    "--ddim_steps",
-    type=int,
-    required=False,
-    default=200,
-    help="The sampling step for DDIM",
-)
-
-parser.add_argument(
-    "-gs",
-    "--guidance_scale",
-    type=float,
-    required=False,
-    default=2.5,
-    help="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)",
-)
-
-parser.add_argument(
-    "-dur",
-    "--duration",
-    type=float,
-    required=False,
-    default=10.0,
-    help="The duration of the samples",
-)
-
-parser.add_argument(
-    "-n",
-    "--n_candidate_gen_per_text",
-    type=int,
-    required=False,
-    default=3,
-    help="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
-)
-
-parser.add_argument(
-    "--seed",
-    type=int,
-    required=False,
-    default=42,
-    help="Change this value (any integer number) will lead to a different generation result.",
-)
-
-args = parser.parse_args()
-
-if(args.ckpt_path is not None):
-    print("Warning: ckpt_path has no effect after version 0.0.20.")
-
-assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"
-
-mode = args.mode
-if(mode == "generation" and args.file_path is not None):
-    mode = "generation_audio_to_audio"
-    if(len(args.text) > 0):
-        print("Warning: You have specified the --file_path. --text will be ignored")
-        args.text = ""
-
-save_path = os.path.join(args.save_path, mode)
-
-if(args.file_path is not None):
-    save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))
-
-text = args.text
-random_seed = args.seed
-duration = args.duration
-guidance_scale = args.guidance_scale
-n_candidate_gen_per_text = args.n_candidate_gen_per_text
-
-os.makedirs(save_path, exist_ok=True)
-audioldm = build_model(model_name=args.model_name)
-
-if(args.mode == "generation"):
-    waveform = text_to_audio(
-        audioldm,
-        text,
-        args.file_path,
-        random_seed,
-        duration=duration,
-        guidance_scale=guidance_scale,
-        ddim_steps=args.ddim_steps,
-        n_candidate_gen_per_text=n_candidate_gen_per_text,
-        batchsize=args.batchsize,
-    )
-
-elif(args.mode == "transfer"):
-    assert args.file_path is not None
-    assert os.path.exists(args.file_path), "The original audio file \'%s\' for style transfer does not exist." % args.file_path
-    waveform = style_transfer(
-        audioldm,
-        text,
-        args.file_path,
-        args.transfer_strength,
-        random_seed,
-        duration=duration,
-        guidance_scale=guidance_scale,
-        ddim_steps=args.ddim_steps,
-        batchsize=args.batchsize,
-    )
-    waveform = waveform[:,None,:]
-
-save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
audioldm/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (315 Bytes)
 
audioldm/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (322 Bytes)
 
audioldm/__pycache__/ldm.cpython-310.pyc DELETED
Binary file (16.1 kB)
 
audioldm/__pycache__/ldm.cpython-39.pyc DELETED
Binary file (16 kB)
 
audioldm/__pycache__/pipeline.cpython-310.pyc DELETED
Binary file (6.63 kB)
 
audioldm/__pycache__/pipeline.cpython-39.pyc DELETED
Binary file (6.54 kB)
 
audioldm/__pycache__/utils.cpython-310.pyc DELETED
Binary file (8.01 kB)
 
audioldm/__pycache__/utils.cpython-39.pyc DELETED
Binary file (7.35 kB)
 
audioldm/audio/__init__.py DELETED
@@ -1,2 +0,0 @@
-from .tools import wav_to_fbank, read_wav_file
-from .stft import TacotronSTFT
audioldm/audio/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (253 Bytes)
 
audioldm/audio/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (260 Bytes)
 
audioldm/audio/__pycache__/audio_processing.cpython-310.pyc DELETED
Binary file (2.78 kB)
 
audioldm/audio/__pycache__/audio_processing.cpython-39.pyc DELETED
Binary file (2.78 kB)
 
audioldm/audio/__pycache__/mix.cpython-39.pyc DELETED
Binary file (1.7 kB)
 
audioldm/audio/__pycache__/stft.cpython-310.pyc DELETED
Binary file (4.98 kB)
 
audioldm/audio/__pycache__/stft.cpython-39.pyc DELETED
Binary file (4.99 kB)
 
audioldm/audio/__pycache__/tools.cpython-310.pyc DELETED
Binary file (2.18 kB)
 
audioldm/audio/__pycache__/tools.cpython-39.pyc DELETED
Binary file (2.19 kB)
 
audioldm/audio/__pycache__/torch_tools.cpython-39.pyc DELETED
Binary file (3.79 kB)
 
audioldm/audio/audio_processing.py DELETED
@@ -1,100 +0,0 @@
-import torch
-import numpy as np
-import librosa.util as librosa_util
-from scipy.signal import get_window
-
-
-def window_sumsquare(
-    window,
-    n_frames,
-    hop_length,
-    win_length,
-    n_fft,
-    dtype=np.float32,
-    norm=None,
-):
-    """
-    # from librosa 0.6
-    Compute the sum-square envelope of a window function at a given hop length.
-
-    This is used to estimate modulation effects induced by windowing
-    observations in short-time fourier transforms.
-
-    Parameters
-    ----------
-    window : string, tuple, number, callable, or list-like
-        Window specification, as in `get_window`
-
-    n_frames : int > 0
-        The number of analysis frames
-
-    hop_length : int > 0
-        The number of samples to advance between frames
-
-    win_length : [optional]
-        The length of the window function. By default, this matches `n_fft`.
-
-    n_fft : int > 0
-        The length of each analysis frame.
-
-    dtype : np.dtype
-        The data type of the output
-
-    Returns
-    -------
-    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
-        The sum-squared envelope of the window function
-    """
-    if win_length is None:
-        win_length = n_fft
-
-    n = n_fft + hop_length * (n_frames - 1)
-    x = np.zeros(n, dtype=dtype)
-
-    # Compute the squared window at the desired length
-    win_sq = get_window(window, win_length, fftbins=True)
-    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
-
-    # Fill the envelope
-    for i in range(n_frames):
-        sample = i * hop_length
-        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
-    return x
-
-
-def griffin_lim(magnitudes, stft_fn, n_iters=30):
-    """
-    PARAMS
-    ------
-    magnitudes: spectrogram magnitudes
-    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
-    """
-
-    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
-    angles = angles.astype(np.float32)
-    angles = torch.autograd.Variable(torch.from_numpy(angles))
-    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-
-    for i in range(n_iters):
-        _, angles = stft_fn.transform(signal)
-        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-    return signal
-
-
-def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
-    """
-    PARAMS
-    ------
-    C: compression factor
-    """
-    return normalize_fun(torch.clamp(x, min=clip_val) * C)
-
-
-def dynamic_range_decompression(x, C=1):
-    """
-    PARAMS
-    ------
-    C: compression factor used to compress
-    """
-    return torch.exp(x) / C
audioldm/audio/stft.py DELETED
@@ -1,186 +0,0 @@
-import torch
-import torch.nn.functional as F
-import numpy as np
-from scipy.signal import get_window
-from librosa.util import pad_center, tiny
-from librosa.filters import mel as librosa_mel_fn
-
-from audioldm.audio.audio_processing import (
-    dynamic_range_compression,
-    dynamic_range_decompression,
-    window_sumsquare,
-)
-
-
-class STFT(torch.nn.Module):
-    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
-
-    def __init__(self, filter_length, hop_length, win_length, window="hann"):
-        super(STFT, self).__init__()
-        self.filter_length = filter_length
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.window = window
-        self.forward_transform = None
-        scale = self.filter_length / self.hop_length
-        fourier_basis = np.fft.fft(np.eye(self.filter_length))
-
-        cutoff = int((self.filter_length / 2 + 1))
-        fourier_basis = np.vstack(
-            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
-        )
-
-        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
-        inverse_basis = torch.FloatTensor(
-            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
-        )
-
-        if window is not None:
-            assert filter_length >= win_length
-            # get window and zero center pad it to filter_length
-            fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
-            fft_window = torch.from_numpy(fft_window).float()
-
-            # window the bases
-            forward_basis *= fft_window
-            inverse_basis *= fft_window
-
-        self.register_buffer("forward_basis", forward_basis.float())
-        self.register_buffer("inverse_basis", inverse_basis.float())
-
-    def transform(self, input_data):
-        device = self.forward_basis.device
-        input_data = input_data.to(device)
-
-        num_batches = input_data.size(0)
-        num_samples = input_data.size(1)
-
-        self.num_samples = num_samples
-
-        # similar to librosa, reflect-pad the input
-        input_data = input_data.view(num_batches, 1, num_samples)
-        input_data = F.pad(
-            input_data.unsqueeze(1),
-            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
-            mode="reflect",
-        )
-        input_data = input_data.squeeze(1)
-
-        forward_transform = F.conv1d(
-            input_data,
-            torch.autograd.Variable(self.forward_basis, requires_grad=False),
-            stride=self.hop_length,
-            padding=0,
-        )#.cpu()
-
-        cutoff = int((self.filter_length / 2) + 1)
-        real_part = forward_transform[:, :cutoff, :]
-        imag_part = forward_transform[:, cutoff:, :]
-
-        magnitude = torch.sqrt(real_part**2 + imag_part**2)
-        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
-
-        return magnitude, phase
-
-    def inverse(self, magnitude, phase):
-        device = self.forward_basis.device
-        magnitude, phase = magnitude.to(device), phase.to(device)
-
-        recombine_magnitude_phase = torch.cat(
-            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
-        )
-
-        inverse_transform = F.conv_transpose1d(
-            recombine_magnitude_phase,
-            torch.autograd.Variable(self.inverse_basis, requires_grad=False),
-            stride=self.hop_length,
-            padding=0,
-        )
-
-        if self.window is not None:
-            window_sum = window_sumsquare(
-                self.window,
-                magnitude.size(-1),
-                hop_length=self.hop_length,
-                win_length=self.win_length,
-                n_fft=self.filter_length,
-                dtype=np.float32,
-            )
-            # remove modulation effects
-            approx_nonzero_indices = torch.from_numpy(
-                np.where(window_sum > tiny(window_sum))[0]
-            )
-            window_sum = torch.autograd.Variable(
-                torch.from_numpy(window_sum), requires_grad=False
-            )
-            window_sum = window_sum
-            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
-                approx_nonzero_indices
-            ]
-
-            # scale by hop ratio
-            inverse_transform *= float(self.filter_length) / self.hop_length
-
-        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
-        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
-
-        return inverse_transform
-
-    def forward(self, input_data):
-        self.magnitude, self.phase = self.transform(input_data)
-        reconstruction = self.inverse(self.magnitude, self.phase)
-        return reconstruction
-
-
-class TacotronSTFT(torch.nn.Module):
-    def __init__(
-        self,
-        filter_length,
-        hop_length,
-        win_length,
-        n_mel_channels,
-        sampling_rate,
-        mel_fmin,
-        mel_fmax,
-    ):
-        super(TacotronSTFT, self).__init__()
-        self.n_mel_channels = n_mel_channels
-        self.sampling_rate = sampling_rate
-        self.stft_fn = STFT(filter_length, hop_length, win_length)
-        mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
-        )
-        mel_basis = torch.from_numpy(mel_basis).float()
-        self.register_buffer("mel_basis", mel_basis)
-
-    def spectral_normalize(self, magnitudes, normalize_fun):
-        output = dynamic_range_compression(magnitudes, normalize_fun)
-        return output
-
-    def spectral_de_normalize(self, magnitudes):
-        output = dynamic_range_decompression(magnitudes)
-        return output
-
-    def mel_spectrogram(self, y, normalize_fun=torch.log):
-        """Computes mel-spectrograms from a batch of waves
-        PARAMS
-        ------
-        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
-
-        RETURNS
-        -------
-        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
-        """
-        assert torch.min(y.data) >= -1, torch.min(y.data)
-        assert torch.max(y.data) <= 1, torch.max(y.data)
-
-        magnitudes, phases = self.stft_fn.transform(y)
-        magnitudes = magnitudes.data
-        mel_output = torch.matmul(self.mel_basis, magnitudes)
-        mel_output = self.spectral_normalize(mel_output, normalize_fun)
-        energy = torch.norm(magnitudes, dim=1)
-
-        log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)
-
-        return mel_output, log_magnitudes, energy
audioldm/audio/tools.py DELETED
@@ -1,85 +0,0 @@
-import torch
-import numpy as np
-import torchaudio
-
-
-def get_mel_from_wav(audio, _stft):
-    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
-    audio = torch.autograd.Variable(audio, requires_grad=False)
-    melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
-    melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
-    log_magnitudes_stft = (
-        torch.squeeze(log_magnitudes_stft, 0).numpy().astype(np.float32)
-    )
-    energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
-    return melspec, log_magnitudes_stft, energy
-
-
-def _pad_spec(fbank, target_length=1024):
-    n_frames = fbank.shape[0]
-    p = target_length - n_frames
-    # cut and pad
-    if p > 0:
-        m = torch.nn.ZeroPad2d((0, 0, 0, p))
-        fbank = m(fbank)
-    elif p < 0:
-        fbank = fbank[0:target_length, :]
-
-    if fbank.size(-1) % 2 != 0:
-        fbank = fbank[..., :-1]
-
-    return fbank
-
-
-def pad_wav(waveform, segment_length):
-    waveform_length = waveform.shape[-1]
-    assert waveform_length > 100, "Waveform is too short, %s" % waveform_length
-    if segment_length is None or waveform_length == segment_length:
-        return waveform
-    elif waveform_length > segment_length:
-        return waveform[:segment_length]
-    elif waveform_length < segment_length:
-        temp_wav = np.zeros((1, segment_length))
-        temp_wav[:, :waveform_length] = waveform
-    return temp_wav
-
-def normalize_wav(waveform):
-    waveform = waveform - np.mean(waveform)
-    waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
-    return waveform * 0.5
-
-
-def read_wav_file(filename, segment_length):
-    # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower
-    waveform, sr = torchaudio.load(filename)  # Faster!!!
-    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
-    waveform = waveform.numpy()[0, ...]
-    waveform = normalize_wav(waveform)
-    waveform = waveform[None, ...]
-    waveform = pad_wav(waveform, segment_length)
-
-    waveform = waveform / np.max(np.abs(waveform))
-    waveform = 0.5 * waveform
-
-    return waveform
-
-
-def wav_to_fbank(filename, target_length=1024, fn_STFT=None):
-    assert fn_STFT is not None
-
-    # mixup
-    waveform = read_wav_file(filename, target_length * 160)  # hop size is 160
-
-    waveform = waveform[0, ...]
-    waveform = torch.FloatTensor(waveform)
-
-    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
-
-    fbank = torch.FloatTensor(fbank.T)
-    log_magnitudes_stft = torch.FloatTensor(log_magnitudes_stft.T)
-
-    fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
-        log_magnitudes_stft, target_length
-    )
-
-    return fbank, log_magnitudes_stft, waveform
audioldm/hifigan/__init__.py DELETED
@@ -1,7 +0,0 @@
-from .models import Generator
-
-
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
audioldm/hifigan/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (569 Bytes)
 
audioldm/hifigan/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (574 Bytes)
 
audioldm/hifigan/__pycache__/models.cpython-310.pyc DELETED
Binary file (3.73 kB)
 
audioldm/hifigan/__pycache__/models.cpython-39.pyc DELETED
Binary file (3.73 kB)
 
audioldm/hifigan/__pycache__/utilities.cpython-310.pyc DELETED
Binary file (2.48 kB)
 
audioldm/hifigan/__pycache__/utilities.cpython-39.pyc DELETED
Binary file (2.37 kB)
 
audioldm/hifigan/models.py DELETED
@@ -1,174 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn import Conv1d, ConvTranspose1d
-from torch.nn.utils import weight_norm, remove_weight_norm
-
-LRELU_SLOPE = 0.1
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-class ResBlock(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock, self).__init__()
-        self.h = h
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
-
-
-class Generator(torch.nn.Module):
-    def __init__(self, h):
-        super(Generator, self).__init__()
-        self.h = h
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-        self.conv_pre = weight_norm(
-            Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
-        )
-        resblock = ResBlock
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        h.upsample_initial_channel // (2**i),
-                        h.upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(h, ch, k, d))
-
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-        self.ups.apply(init_weights)
-        self.conv_post.apply(init_weights)
-
-    def forward(self, x):
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            x = self.ups[i](x)
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        # print("Removing weight norm...")
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
audioldm/hifigan/utilities.py DELETED
@@ -1,86 +0,0 @@
-import os
-import json
-
-import torch
-import numpy as np
-
-import audioldm.hifigan as hifigan
-
-HIFIGAN_16K_64 = {
-    "resblock": "1",
-    "num_gpus": 6,
-    "batch_size": 16,
-    "learning_rate": 0.0002,
-    "adam_b1": 0.8,
-    "adam_b2": 0.99,
-    "lr_decay": 0.999,
-    "seed": 1234,
-    "upsample_rates": [5, 4, 2, 2, 2],
-    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
-    "upsample_initial_channel": 1024,
-    "resblock_kernel_sizes": [3, 7, 11],
-    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-    "segment_size": 8192,
-    "num_mels": 64,
-    "num_freq": 1025,
-    "n_fft": 1024,
-    "hop_size": 160,
-    "win_size": 1024,
-    "sampling_rate": 16000,
-    "fmin": 0,
-    "fmax": 8000,
-    "fmax_for_loss": None,
-    "num_workers": 4,
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321",
-        "world_size": 1,
-    },
-}
-
-
-def get_available_checkpoint_keys(model, ckpt):
-    print("==> Attemp to reload from %s" % ckpt)
-    state_dict = torch.load(ckpt)["state_dict"]
-    current_state_dict = model.state_dict()
-    new_state_dict = {}
-    for k in state_dict.keys():
-        if (
-            k in current_state_dict.keys()
-            and current_state_dict[k].size() == state_dict[k].size()
-        ):
-            new_state_dict[k] = state_dict[k]
-        else:
-            print("==> WARNING: Skipping %s" % k)
-    print(
-        "%s out of %s keys are matched"
-        % (len(new_state_dict.keys()), len(state_dict.keys()))
-    )
-    return new_state_dict
-
-
-def get_param_num(model):
-    num_param = sum(param.numel() for param in model.parameters())
-    return num_param
-
-
-def get_vocoder(config, device):
-    config = hifigan.AttrDict(HIFIGAN_16K_64)
-    vocoder = hifigan.Generator(config)
-    vocoder.eval()
-    vocoder.remove_weight_norm()
-    vocoder.to(device)
-    return vocoder
-
-
-def vocoder_infer(mels, vocoder, lengths=None):
-    vocoder.eval()
-    with torch.no_grad():
-        wavs = vocoder(mels).squeeze(1)
-
-    wavs = (wavs.cpu().numpy() * 32768).astype("int16")
-
-    if lengths is not None:
-        wavs = wavs[:, :lengths]
-
-    return wavs
audioldm/latent_diffusion/__init__.py DELETED
File without changes
audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (157 Bytes)
 
audioldm/latent_diffusion/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (164 Bytes)
 
audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc DELETED
Binary file (11.4 kB)
 
audioldm/latent_diffusion/__pycache__/attention.cpython-39.pyc DELETED
Binary file (11.4 kB)
 
audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc DELETED
Binary file (7.2 kB)
 
audioldm/latent_diffusion/__pycache__/ddim.cpython-39.pyc DELETED
Binary file (7.11 kB)
 
audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc DELETED
Binary file (11.1 kB)
 
audioldm/latent_diffusion/__pycache__/ddpm.cpython-39.pyc DELETED
Binary file (11 kB)
 
audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc DELETED
Binary file (3.01 kB)
 
audioldm/latent_diffusion/__pycache__/ema.cpython-39.pyc DELETED
Binary file (3 kB)
 
audioldm/latent_diffusion/__pycache__/openaimodel.cpython-39.pyc DELETED
Binary file (23.7 kB)
 
audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc DELETED
Binary file (9.53 kB)
 
audioldm/latent_diffusion/__pycache__/util.cpython-39.pyc DELETED
Binary file (9.6 kB)