	Testing Seed Values
Allow loading from file
- app.py +19 -7
- audiocraft/models/loaders.py +4 -0
- audiocraft/models/musicgen.py +5 -4
- audiocraft/utils/extend.py +22 -61
    	
app.py CHANGED

@@ -15,6 +15,7 @@ from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
 from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, sanitize_file_name
 import numpy as np
+import random
 
 MODEL = None
 IS_SHARED_SPACE = "musicgen/MusicGen" in os.environ.get('SPACE_ID', '')
@@ -25,7 +26,7 @@ def load_model(version):
     return MusicGen.get_pretrained(version)
 
 
-def predict(model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color):
+def predict(model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap=1):
     global MODEL
     output_segments = None
     topk = int(topk)
@@ -36,6 +37,10 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
         segment_duration = MODEL.lm.cfg.dataset.segment_duration
     else:
         segment_duration = duration
+    # implement seed
+    if seed < 0:
+        seed = random.randint(0, 0xffff_ffff_ffff)
+    torch.manual_seed(seed)
     MODEL.set_generation_params(
         use_sampling=True,
         top_k=topk,
@@ -47,7 +52,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
 
     if melody:
         if duration > MODEL.lm.cfg.dataset.segment_duration:
-            output_segments = generate_music_segments(text, melody, MODEL, duration, MODEL.lm.cfg.dataset.segment_duration)
+            output_segments = generate_music_segments(text, melody, MODEL, seed, duration, overlap, MODEL.lm.cfg.dataset.segment_duration)
        else:
            # pure original code
            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
@@ -76,14 +81,13 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
         output = output.detach().cpu().float()[0]
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         if include_settings:
-            video_description = f"{text}\n Duration: {str(duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef}"
+            video_description = f"{text}\n Duration: {str(duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}"
             background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
-        #filename = sanitize_file_name(title) if title != "" else file.name
         audio_write(
             file.name, output, MODEL.sample_rate, strategy="loudness",
             loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
         waveform_video = gr.make_waveform(file.name,bg_image=background, bar_count=40)
-    return waveform_video
+    return waveform_video, seed
 
 
 def ui(**kwargs):
@@ -121,15 +125,23 @@ def ui(**kwargs):
                     model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                 with gr.Row():
                     duration = gr.Slider(minimum=1, maximum=1000, value=10, label="Duration", interactive=True)
+                    overlap = gr.Slider(minimum=1, maximum=29, value=5, step=1, label="Overlap", interactive=True)
                     dimension = gr.Slider(minimum=-2, maximum=1, value=1, step=1, label="Dimension", info="determines which direction to add new segements of audio. (0 = stack tracks, 1 = lengthen, -1 = ?)", interactive=True)
                 with gr.Row():
                     topk = gr.Number(label="Top-k", value=250, interactive=True)
                     topp = gr.Number(label="Top-p", value=0, interactive=True)
                     temperature = gr.Number(label="Randomness Temperature", value=1.0, precision=2, interactive=True)
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, precision=2, interactive=True)
-
+                with gr.Row():
+                    seed = gr.Number(label="Seed", value=-1, precision=0, interactive=True)
+                    gr.Button('\U0001f3b2\ufe0f').style(full_width=False).click(fn=lambda: -1, outputs=[seed], queue=False)
+                    reuse_seed = gr.Button('\u267b\ufe0f').style(full_width=False)
+            with gr.Column() as c:
                 output = gr.Video(label="Generated Music")
-
+                seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
+
+        reuse_seed.click(fn=lambda x: x, inputs=[seed_used], outputs=[seed], queue=False)
+        submit.click(predict, inputs=[model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap], outputs=[output, seed_used])
        gr.Examples(
            fn=predict,
            examples=[
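For context on the new seed plumbing: a negative seed means "pick one at random", the resolved value is passed to torch.manual_seed before generation, and it is returned next to the video so the UI can show it in "Seed used" and feed it back in via the reuse button. Below is a minimal sketch of that pattern outside the Gradio app; only the seeding logic mirrors the commit, and the generate stub is a hypothetical stand-in for the model call.

import random
import torch

def resolve_and_apply_seed(seed: int) -> int:
    # Negative seed requests a fresh random one, as in predict().
    if seed < 0:
        seed = random.randint(0, 0xffff_ffff_ffff)
    torch.manual_seed(seed)
    return seed

def generate(seed: int = -1):
    used_seed = resolve_and_apply_seed(seed)
    # ... run the model here; results are reproducible for a fixed used_seed ...
    return torch.rand(4), used_seed  # placeholder output instead of real audio

audio, used_seed = generate(-1)           # random seed chosen and reported back
audio_again, _ = generate(used_seed)      # same seed -> same tensor
assert torch.equal(audio, audio_again)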
    	
audiocraft/models/loaders.py CHANGED

@@ -50,6 +50,10 @@ def _get_state_dict(
 
     if os.path.isfile(file_or_url_or_id):
         return torch.load(file_or_url_or_id, map_location=device)
+
+    if os.path.isdir(file_or_url_or_id):
+        file = f"{file_or_url_or_id}/{filename}"
+        return torch.load(file, map_location=device)
 
     elif file_or_url_or_id.startswith('https://'):
         return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
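The added branch resolves a directory plus the requested filename to a concrete path and hands it to torch.load, alongside the existing file and URL branches. A small self-contained check of that behaviour follows; the checkpoint filename here is illustrative, audiocraft's loaders pass their own filenames into _get_state_dict.

import os
import tempfile
import torch

def load_from_dir(file_or_url_or_id: str, filename: str, device: str = "cpu"):
    # Mirrors the new directory branch: join the directory and filename, then torch.load.
    if os.path.isdir(file_or_url_or_id):
        return torch.load(f"{file_or_url_or_id}/{filename}", map_location=device)
    raise FileNotFoundError(file_or_url_or_id)

with tempfile.TemporaryDirectory() as ckpt_dir:
    torch.save({"weight": torch.zeros(2, 2)}, os.path.join(ckpt_dir, "state_dict.bin"))
    state = load_from_dir(ckpt_dir, "state_dict.bin")
    print(sorted(state))  # ['weight']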
    	
audiocraft/models/musicgen.py CHANGED

@@ -80,10 +80,11 @@ class MusicGen:
             return MusicGen(name, compression_model, lm)
 
         if name not in HF_MODEL_CHECKPOINTS_MAP:
-            raise ValueError(
-                f"{name} is not a valid checkpoint name. "
-                f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
-            )
+            if not os.path.isfile(name) and not os.path.isdir(name):
+                raise ValueError(
+                    f"{name} is not a valid checkpoint name. "
+                    f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
+                )
 
         cache_dir = os.environ.get('MUSICGEN_ROOT', None)
         compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
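With the relaxed check, get_pretrained only raises for names that are neither a known checkpoint nor an existing local file or directory. A hedged usage sketch; the local path is purely illustrative, and that directory still has to contain whatever checkpoint files the loaders look for.

from audiocraft.models import MusicGen

# Known names from HF_MODEL_CHECKPOINTS_MAP behave exactly as before.
model = MusicGen.get_pretrained("melody")

# New in this commit: a path to a local checkpoint file or directory is no longer rejected.
# "/data/checkpoints/my_musicgen" is a hypothetical example path.
local_model = MusicGen.get_pretrained("/data/checkpoints/my_musicgen")

# Anything else still raises ValueError("... is not a valid checkpoint name ...").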
    	
audiocraft/utils/extend.py CHANGED

@@ -8,29 +8,34 @@ import tempfile
 import os
 import textwrap
 
-def separate_audio_segments(audio, segment_duration=30):
+def separate_audio_segments(audio, segment_duration=30, overlap=1):
     sr, audio_data = audio[0], audio[1]
-
+
     total_samples = len(audio_data)
     segment_samples = sr * segment_duration
-
-
-
+    overlap_samples = sr * overlap
+
     segments = []
-
-
-
-
-        end_sample = (segment_idx + 1) * segment_samples
-
+    start_sample = 0
+
+    while total_samples >= segment_samples:
+        end_sample = start_sample + segment_samples
         segment = audio_data[start_sample:end_sample]
         segments.append((sr, segment))
-
+
+        start_sample += segment_samples - overlap_samples
+        total_samples -= segment_samples - overlap_samples
+
+    # Collect the final segment
+    if total_samples > 0:
+        segment = audio_data[-segment_samples:]
+        segments.append((sr, segment))
+
     return segments
 
-def generate_music_segments(text, melody, MODEL, duration:int=10, segment_duration:int=30):
+def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:int=1, segment_duration:int=30):
     # generate audio segments
-    melody_segments = separate_audio_segments(melody, segment_duration)
+    melody_segments = separate_audio_segments(melody, segment_duration, overlap)
 
     # Create a list to store the melody tensors for each segment
     melodys = []
@@ -40,7 +45,7 @@ def generate_music_segments(text, melody, MODEL, duration:int=10, segment_durati
     total_segments = max(math.ceil(duration / segment_duration),1)
     print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds")
 
-    # If melody_segments is shorter than total_segments, repeat the segments until the 
+    # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
     if len(melody_segments) < total_segments:
         for i in range(total_segments - len(melody_segments)):
             segment = melody_segments[i]
@@ -59,6 +64,7 @@ def generate_music_segments(text, melody, MODEL, duration:int=10, segment_durati
         # Append the segment to the melodys list
         melodys.append(verse)
 
+    torch.manual_seed(seed)
     for idx, verse in enumerate(melodys):
         print(f"Generating New Melody Segment {idx + 1}: {text}\r")
         output = MODEL.generate_with_chroma(
@@ -74,42 +80,6 @@ def generate_music_segments(text, melody, MODEL, duration:int=10, segment_durati
         print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
     return output_segments
 
-#def generate_music_segments(text, melody, duration, MODEL, segment_duration=30):
-#    sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
-
-#    # Create a list to store the melody tensors for each segment
-#    melodys = []
-
-#    # Calculate the total number of segments
-#    total_segments = math.ceil(melody.shape[1] / (sr * segment_duration))
-
-#    # Iterate over the segments
-#    for segment_idx in range(total_segments):
-#        print(f"segment {segment_idx + 1} / {total_segments + 1} \r")
-#        start_frame = segment_idx * sr * segment_duration
-#        end_frame = (segment_idx + 1) * sr * segment_duration
-
-#        # Extract the segment from the melody tensor
-#        segment = melody[:, start_frame:end_frame]
-
-#        # Append the segment to the melodys list
-#        melodys.append(segment)
-
-#    output_segments = []
-
-#    for segment in melodys:
-#        output = MODEL.generate_with_chroma(
-#            descriptions=[text],
-#            melody_wavs=segment,
-#            melody_sample_rate=sr,
-#            progress=False
-#        )
-
-#        # Append the generated output to the list of segments
-#        output_segments.append(output[:, :segment_duration])
-
-#    return output_segments
-
 def save_image(image):
     """
     Saves a PIL image to a temporary file and returns the file path.
@@ -184,13 +154,4 @@ def add_settings_to_image(title: str = "title", description: str = "", width: in
     background.paste(image, offset, mask=image)
 
     # Save the image and return the file path
-    return save_image(background)
-
-
-def sanitize_file_name(filename):
-    valid_chars = "-_.() " + string.ascii_letters + string.digits
-    sanitized_filename = ''.join(c for c in filename if c in valid_chars)
-    return sanitized_filename
-
-
-
+    return save_image(background)
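To make the overlap arithmetic concrete: the rewritten separate_audio_segments slides a window of segment_duration seconds across the waveform and advances it by segment_duration - overlap seconds, so consecutive segments share overlap seconds of audio, and a final full-length window covers whatever tail remains. Below is a self-contained copy of that windowing logic plus a toy trace; the tiny sample rate and durations are stand-ins chosen only to make the numbers easy to follow.

import numpy as np

def separate_audio_segments(audio, segment_duration=30, overlap=1):
    # Same windowing as the updated helper, operating on a (sample_rate, samples) tuple.
    sr, audio_data = audio[0], audio[1]
    total_samples = len(audio_data)
    segment_samples = sr * segment_duration
    overlap_samples = sr * overlap

    segments = []
    start_sample = 0
    while total_samples >= segment_samples:
        end_sample = start_sample + segment_samples
        segments.append((sr, audio_data[start_sample:end_sample]))
        start_sample += segment_samples - overlap_samples
        total_samples -= segment_samples - overlap_samples
    if total_samples > 0:
        # Final segment: a full-length window ending at the last sample.
        segments.append((sr, audio_data[-segment_samples:]))
    return segments

# Toy trace: sr=4, 22 "seconds" of audio, 8-second windows, 2 seconds of overlap.
sr, audio = 4, np.arange(88)
windows = [(int(seg[0]), int(seg[-1])) for _, seg in separate_audio_segments((sr, audio), 8, 2)]
print(windows)  # [(0, 31), (24, 55), (48, 79), (56, 87)] -> window starts 24 samples (6 s) apart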
