Alexandre Défossez
committed on

Improve demo (#51)

* allowing sharing directly, changelog, reduce volume.
* activate
* plop

- CHANGELOG.md +11 -2
- README.md +1 -1
- app.py +11 -9
- app_batched.py +3 -1
- audiocraft/__init__.py +1 -1
- audiocraft/data/audio.py +3 -1
- audiocraft/data/audio_utils.py +9 -4
- audiocraft/models/musicgen.py +2 -0
- audiocraft/modules/conditioners.py +6 -2
    	
CHANGELOG.md CHANGED

````diff
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [0.0.1] - TBD
+## [0.0.2a] - TBD
 
-Initial release, with model evaluation only.
+Improved demo, fixed top p (thanks @jnordberg).
+
+Compressor tanh on output to avoid clipping with some style (especially piano).
+Now repeating the conditioning periodically if it is too short.
+
+More options when launching Gradio app locally (thanks @ashleykleynhans).
+
+## [0.0.1] - 2023-06-09
+
+Initial release, with model evaluation only.
````
README.md CHANGED

````diff
@@ -80,7 +80,7 @@ wav = model.generate_with_chroma(descriptions, melody[None].expand(3, -1, -1), s
 
 for idx, one_wav in enumerate(wav):
     # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
-    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness")
+    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
 ```
 
 
````
app.py CHANGED

````diff
@@ -13,7 +13,6 @@ import gradio as gr
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
 
-
 MODEL = None
 
 
@@ -56,7 +55,9 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
 
     output = output.detach().cpu().float()[0]
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-        audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
+        audio_write(
+            file.name, output, MODEL.sample_rate, strategy="loudness",
+            loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
         waveform_video = gr.make_waveform(file.name)
     return waveform_video
 
@@ -66,7 +67,7 @@ def ui(**kwargs):
         gr.Markdown(
             """
             # MusicGen
-
+
             This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
             <br/>
@@ -129,19 +130,19 @@ def ui(**kwargs):
         gr.Markdown(
             """
             ### More details
-
+
             The model will generate a short music extract based on the description you provided.
             You can generate up to 30 seconds of audio.
-
+
             We present 4 model variations:
             1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
             2. Small -- a 300M transformer decoder conditioned on text only.
             3. Medium -- a 1.5B transformer decoder conditioned on text only.
             4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences).
-
+
             When using `melody`, you can optionally provide a reference audio from
             which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
-
+
             You can also use your own GPU or a Google Colab by following the instructions on our repo.
             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
             for more details.
@@ -168,7 +169,8 @@ def ui(**kwargs):
         if share:
             launch_kwargs['share'] = share
 
-    interface.launch(**launch_kwargs)
+    interface.queue().launch(**launch_kwargs, max_threads=1)
+
 
 if __name__ == "__main__":
     # torch.cuda.set_per_process_memory_fraction(0.48)
@@ -207,4 +209,4 @@ if __name__ == "__main__":
         server_port=args.server_port,
         share=args.share,
         listen=args.listen
-    )
+    )
````
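The launch change is the mechanism behind "allowing sharing directly": the app now queues requests and forwards the CLI flags into Gradio's `launch()`. A minimal standalone sketch of the same pattern; `demo_fn` and the flag values here are placeholders, not the demo's real code:

```python
import gradio as gr

def demo_fn(text: str) -> str:
    # Placeholder for a long-running generation call (e.g. MusicGen predict).
    return f"generated: {text}"

with gr.Blocks() as interface:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")
    gr.Button("Generate").click(demo_fn, inputs=prompt, outputs=result)

launch_kwargs = {
    "server_name": "0.0.0.0",  # what a --listen flag typically maps to
    "server_port": 7860,       # --server_port
    "share": True,             # ask Gradio for a public share link
}
# queue() serializes requests so long generations don't hit connection
# timeouts; max_threads=1 keeps a single worker, matching a single-GPU demo.
interface.queue().launch(**launch_kwargs, max_threads=1)
```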
    	
app_batched.py CHANGED

````diff
@@ -57,7 +57,9 @@ def predict(texts, melodies):
     out_files = []
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-            audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
+            audio_write(
+                file.name, output, MODEL.sample_rate, strategy="loudness",
+                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
             waveform_video = gr.make_waveform(file.name)
             out_files.append(waveform_video)
     return [out_files]
````
    	
audiocraft/__init__.py CHANGED

````diff
@@ -7,4 +7,4 @@
 # flake8: noqa
 from . import data, modules, models
 
-__version__ = '0.0.1'
+__version__ = '0.0.2a1'
````
    	
audiocraft/data/audio.py CHANGED

````diff
@@ -155,6 +155,7 @@ def audio_write(stem_name: tp.Union[str, Path],
                 format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
                 strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
+                loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
                 add_suffix: bool = True) -> Path:
     """Convenience function for saving audio to disk. Returns the filename the audio was written to.
@@ -173,7 +174,8 @@ def audio_write(stem_name: tp.Union[str, Path],
         rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
             than the `peak_clip` one to avoid further clipping.
         loudness_headroom_db (float): Target loudness for loudness normalization.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
+        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
+        log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         make_parent_dir (bool): Make parent directory if it doesn't exist.
     Returns:
````
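For reference, a minimal usage sketch of the extended signature, on a synthetic input; the parameters are exactly the ones documented above:

```python
import math
import torch
from audiocraft.data.audio import audio_write

# One second of a deliberately hot 440 Hz sine at 32 kHz, shaped [channels, time].
sr = 32000
t = torch.arange(sr) / sr
wav = 0.99 * torch.sin(2 * math.pi * 440 * t).unsqueeze(0)

# Writes sine.wav, loudness-normalized to -14 dB LUFS; the new tanh soft
# clipping catches any peaks the loudness gain pushes past full scale.
audio_write('sine', wav, sr, strategy="loudness",
            loudness_headroom_db=14, loudness_compressor=True)
```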
    	
audiocraft/data/audio_utils.py CHANGED

````diff
@@ -54,8 +54,8 @@ def convert_audio(wav: torch.Tensor, from_rate: float,
     return wav
 
 
-def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
-                       energy_floor: float = 2e-3):
+def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
+                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
     """Normalize an input signal to a user loudness in dB LKFS.
     Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
 
@@ -63,6 +63,7 @@ def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db
         wav (torch.Tensor): Input multichannel audio data.
         sample_rate (int): Sample rate.
         loudness_headroom_db (float): Target loudness of the output in dB LUFS.
+        loudness_compressor (bool): Uses tanh for soft clipping.
         energy_floor (float): anything below that RMS level will not be rescaled.
     Returns:
         output (torch.Tensor): Loudness normalized output data.
@@ -76,6 +77,8 @@ def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db
     delta_loudness = -loudness_headroom_db - input_loudness_db
     gain = 10.0 ** (delta_loudness / 20.0)
     output = gain * wav
+    if loudness_compressor:
+        output = torch.tanh(output)
     assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
     return output
 
@@ -93,7 +96,8 @@ def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None):
 def normalize_audio(wav: torch.Tensor, normalize: bool = True,
                     strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                     rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                    log_clipping: bool = False, sample_rate: tp.Optional[int] = None,
+                    loudness_compressor: bool = False, log_clipping: bool = False,
+                    sample_rate: tp.Optional[int] = None,
                     stem_name: tp.Optional[str] = None) -> torch.Tensor:
     """Normalize the audio according to the prescribed strategy (see after).
 
@@ -109,6 +113,7 @@ def normalize_audio(wav: torch.Tensor, normalize: bool = True,
         rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
             than the `peak_clip` one to avoid further clipping.
         loudness_headroom_db (float): Target loudness for loudness normalization.
+        loudness_compressor (bool): If True, uses tanh based soft clipping.
         log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         sample_rate (int): Sample rate for the audio data (required for loudness).
@@ -132,7 +137,7 @@ def normalize_audio(wav: torch.Tensor, normalize: bool = True,
         _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
     elif strategy == 'loudness':
         assert sample_rate is not None, "Loudness normalization requires sample rate."
-        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db)
+        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
         _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
     else:
         assert wav.abs().max() < 1
````
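The arithmetic here is a plain dB-domain gain: with a measured integrated loudness of, say, -20 dB LUFS and a 14 dB target, delta_loudness = -14 - (-20) = 6 dB, i.e. a linear gain of 10^(6/20) ≈ 2.0. A self-contained sketch of that step plus the new tanh stage; the measured loudness is hard-coded here instead of calling a BS.1770 loudness meter:

```python
import torch

def loudness_gain_demo(wav: torch.Tensor,
                       input_loudness_db: float = -20.0,
                       loudness_headroom_db: float = 14.0,
                       loudness_compressor: bool = False) -> torch.Tensor:
    # Same gain computation as normalize_loudness above.
    delta_loudness = -loudness_headroom_db - input_loudness_db  # -14 - (-20) = 6 dB
    gain = 10.0 ** (delta_loudness / 20.0)                      # 10**(6/20) ~= 2.0
    output = gain * wav
    if loudness_compressor:
        # tanh is ~linear for small samples but saturates smoothly toward
        # +/-1, so boosted peaks are soft-clipped instead of hard-clipped.
        output = torch.tanh(output)
    return output

wav = 0.6 * torch.randn(1, 32000)  # synthetic mono signal with hot peaks
print(loudness_gain_demo(wav).abs().max().item())                             # can exceed 1.0
print(loudness_gain_demo(wav, loudness_compressor=True).abs().max().item())   # always < 1.0
```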
    	
audiocraft/models/musicgen.py CHANGED

````diff
@@ -88,6 +88,8 @@ class MusicGen:
         cache_dir = os.environ.get('MUSICGEN_ROOT', None)
         compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
         lm = load_lm_model(name, device=device, cache_dir=cache_dir)
+        if name == 'melody':
+            lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
 
         return MusicGen(name, compression_model, lm)
 
````
    	
audiocraft/modules/conditioners.py CHANGED

````diff
@@ -9,6 +9,7 @@ from copy import deepcopy
 from dataclasses import dataclass, field
 from itertools import chain
 import logging
+import math
 import random
 import re
 import typing as tp
@@ -484,7 +485,7 @@ class ChromaStemConditioner(WaveformConditioner):
         **kwargs: Additional parameters for the chroma extractor.
     """
     def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int,
-                 duration: float, match_len_on_eval: bool = False, eval_wavs: tp.Optional[str] = None,
+                 duration: float, match_len_on_eval: bool = True, eval_wavs: tp.Optional[str] = None,
                  n_eval_wavs: int = 0, device: tp.Union[torch.device, str] = "cpu", **kwargs):
         from demucs import pretrained
         super().__init__(dim=n_chroma, output_dim=output_dim, device=device)
@@ -535,7 +536,10 @@ class ChromaStemConditioner(WaveformConditioner):
             chroma = chroma[:, :self.chroma_len]
             logger.debug(f'chroma was truncated! ({t} -> {chroma.shape[1]})')
         elif t < self.chroma_len:
-            chroma = F.pad(chroma, (0, 0, 0, self.chroma_len - t))
+            # chroma = F.pad(chroma, (0, 0, 0, self.chroma_len - t))
+            n_repeat = int(math.ceil(self.chroma_len / t))
+            chroma = chroma.repeat(1, n_repeat, 1)
+            chroma = chroma[:, :self.chroma_len]
             logger.debug(f'chroma was zero-padded! ({t} -> {chroma.shape[1]})')
         return chroma
 
````
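This is the "repeating the conditioning periodically" change from the changelog, and it is easiest to see on a dummy chroma tensor shaped [batch, time, n_chroma]: a melody shorter than chroma_len is now tiled along the time axis and truncated, instead of zero-padded. A standalone sketch with made-up sizes:

```python
import math
import torch

chroma_len = 235                   # frames the model expects (made-up value)
chroma = torch.randn(2, 100, 12)   # [batch, t=100 frames, n_chroma=12]

t = chroma.shape[1]
if t < chroma_len:
    # Tile the short conditioning along time, then cut to the exact length,
    # mirroring the new branch in ChromaStemConditioner above.
    n_repeat = int(math.ceil(chroma_len / t))
    chroma = chroma.repeat(1, n_repeat, 1)
    chroma = chroma[:, :chroma_len]

print(chroma.shape)  # torch.Size([2, 235, 12])
```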
