thecollabagepatch committed
Commit 3e4e311
1 parent: abfd774
Files changed (4):
  1. .gitmodules +3 -0
  2. app.py +186 -0
  3. packages.txt +2 -0
  4. requirements.txt +5 -0
.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "audiocraft"]
+ 	path = audiocraft
+ 	url = https://github.com/facebookresearch/audiocraft
app.py ADDED
@@ -0,0 +1,186 @@
+ import gradio as gr
+ from musiclang_predict import MusicLangPredictor
+ import random
+ import subprocess
+ import os
+ import torchaudio
+ import torch
+ import numpy as np
+ from audiocraft.models import MusicGen
+ from audiocraft.data.audio import audio_write
+ from pydub import AudioSegment
+
+ # Utility Functions
+ def peak_normalize(y, target_peak=0.97):
+     return target_peak * (y / np.max(np.abs(y)))
+
+ def rms_normalize(y, target_rms=0.05):
+     return y * (target_rms / np.sqrt(np.mean(y**2)))
+
+ def preprocess_audio(waveform):
+     waveform_np = waveform.cpu().squeeze().numpy()  # Move to CPU before converting to NumPy
+     processed_waveform_np = rms_normalize(peak_normalize(waveform_np))
+     return torch.from_numpy(processed_waveform_np).unsqueeze(0).to(device)
+
+ def create_slices(song, sr, slice_duration, bpm, num_slices=5):
+     song_length = song.shape[-1] / sr
+     slices = []
+
+     # Ensure the first slice is from the beginning of the song
+     first_slice_waveform = song[..., :int(slice_duration * sr)]
+     slices.append(first_slice_waveform)
+
+     for i in range(1, num_slices):
+         random_start = random.choice(range(int(slice_duration * sr), int(song_length * sr), int(4 * 60 / bpm * sr)))
+         slice_end = random_start + int(slice_duration * sr)
+
+         if slice_end > song_length * sr:
+             # Wrap around to the beginning of the song
+             remaining_samples = int(slice_end - song_length * sr)
+             slice_waveform = torch.cat([song[..., random_start:], song[..., :remaining_samples]], dim=-1)
+         else:
+             slice_waveform = song[..., random_start:slice_end]
+
+         if len(slice_waveform.squeeze()) < int(slice_duration * sr):
+             additional_samples_needed = int(slice_duration * sr) - len(slice_waveform.squeeze())
+             slice_waveform = torch.cat([slice_waveform, song[..., :additional_samples_needed]], dim=-1)
+
+         slices.append(slice_waveform)
+
+     return slices
+
+ def calculate_duration(bpm, min_duration=29, max_duration=30):
+     single_bar_duration = 4 * 60 / bpm
+     bars = max(min_duration // single_bar_duration, 1)
+
+     while single_bar_duration * bars < min_duration:
+         bars += 1
+
+     duration = single_bar_duration * bars
+
+     while duration > max_duration and bars > 1:
+         bars -= 1
+         duration = single_bar_duration * bars
+
+     return duration
+
+ def generate_music(seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm):
+     if seed == "":
+         seed = random.randint(1, 10000)
+
+     ml = MusicLangPredictor('musiclang/musiclang-v2')
+
+     try:
+         seed = int(seed)
+     except ValueError:
+         seed = random.randint(1, 10000)
+
+     nb_tokens = 4096
+     temperature = 0.9
+     top_p = 1.0
+
+     if use_chords and chord_progression.strip():
+         score = ml.predict_chords(
+             chord_progression,
+             time_signature=(4, 4),
+             temperature=temperature,
+             topp=top_p,
+             rng_seed=seed
+         )
+     else:
+         score = ml.predict(
+             nb_tokens=nb_tokens,
+             temperature=temperature,
+             topp=top_p,
+             rng_seed=seed
+         )
+
+     midi_filename = f"output_{seed}.mid"
+     wav_filename = midi_filename.replace(".mid", ".wav")
+
+     score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
+
+     subprocess.run(["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"])
+
+     # Load the generated audio
+     song, sr = torchaudio.load(wav_filename)
+     song = song.to(device)
+
+     # Use the user-provided BPM value for duration calculation
+     duration = calculate_duration(bpm)
+
+     # Create slices from the song using the user-provided BPM value
+     slices = create_slices(song, sr, 35, bpm, num_slices=5)
+
+     # Load the model
+     model_continue = MusicGen.get_pretrained(musicgen_model)
+
+     # Setting generation parameters
+     model_continue.set_generation_params(
+         use_sampling=True,
+         top_k=250,
+         top_p=0.0,
+         temperature=1.0,
+         duration=duration,
+         cfg_coef=3
+     )
+
+     all_audio_files = []
+
+     for i in range(num_iterations):
+         slice_idx = i % len(slices)
+
+         print(f"Running iteration {i + 1} using slice {slice_idx}...")
+
+         prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
+         prompt_waveform = preprocess_audio(prompt_waveform)
+
+         output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
+         output = output.cpu()  # Move the output tensor back to CPU
+
+         # Make sure the output tensor has at most 2 dimensions
+         if len(output.size()) > 2:
+             output = output.squeeze()
+
+         filename_without_extension = f'continue_{i}'
+         filename_with_extension = f'{filename_without_extension}.wav'
+
+         audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
+         all_audio_files.append(filename_with_extension)  # audio_write appends .wav to the stem name
+
+     # Combine all audio files
+     combined_audio = AudioSegment.empty()
+     for filename in all_audio_files:
+         combined_audio += AudioSegment.from_wav(filename)
+
+     combined_audio_filename = f"combined_audio_{seed}.mp3"
+     combined_audio.export(combined_audio_filename, format="mp3")
+
+     # Clean up temporary files
+     os.remove(midi_filename)
+     os.remove(wav_filename)
+     for filename in all_audio_files:
+         os.remove(filename)
+
+     return combined_audio_filename
+
+ # Check if CUDA is available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ iface = gr.Interface(
+     fn=generate_music,
+     inputs=[
+         gr.Textbox(label="Seed (leave blank for random)", value=""),
+         gr.Checkbox(label="Control Chord Progression", value=False),
+         gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True),
+         gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=7),
+         gr.Textbox(label="MusicGen Model", value="thepatch/vanya_ai_dnb_0.1"),
+         gr.Slider(label="Number of Iterations", minimum=1, maximum=10, step=1, value=3),
+         gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=140)
+     ],
+     outputs=gr.Audio(label="Generated Music"),
+     title="Music Generation Slot Machine",
+     description="Enter a seed to generate music or leave blank for a random tune! Optionally, control the chord progression, prompt duration, MusicGen model, number of iterations, and BPM."
+ )
+
+ iface.launch()
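
For reference, the duration passed to set_generation_params is snapped to whole 4/4 bars by calculate_duration. A standalone sketch of the same arithmetic (the BPM values below are illustrative, not from the commit):

def bars_for_window(bpm, min_duration=29, max_duration=30):
    # Same logic as calculate_duration: grow the bar count until the
    # duration reaches min_duration, then shrink while it exceeds max_duration.
    bar = 4 * 60 / bpm  # seconds per 4/4 bar at this tempo
    bars = max(int(min_duration // bar), 1)
    while bar * bars < min_duration:
        bars += 1
    while bar * bars > max_duration and bars > 1:
        bars -= 1
    return bars, bar * bars

for bpm in (60, 140, 200):
    bars, secs = bars_for_window(bpm)
    print(f"{bpm} BPM -> {bars} bars = {secs:.2f} s")

At 140 BPM this yields 17 bars (about 29.14 s); at 60 BPM no whole-bar count fits the 29-30 s window, so the back-off loop settles on 7 bars (28 s), slightly under min_duration.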
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ fluidsynth
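
Both lines are apt packages installed when the Space builds: app.py shells out to the fluidsynth binary to render the MIDI, and pydub's mp3 export relies on ffmpeg. A minimal runtime check that both binaries landed on PATH (a sketch, not part of the commit):

import shutil

# fluidsynth is invoked via subprocess in app.py; ffmpeg backs pydub's mp3 export
for binary in ("ffmpeg", "fluidsynth"):
    print(f"{binary}: {shutil.which(binary) or 'NOT FOUND'}")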
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch==2.1.0
+ musiclang_predict
+ gradio
+ pyFluidSynth
+ midi2audio
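
Note that app.py also imports torchaudio, numpy, and pydub, which are not pinned here and would have to arrive transitively, while audiocraft is supplied by the git submodule rather than pip. A quick pre-launch import check (a sketch; the module list is copied from app.py's imports):

import importlib

for mod in ("gradio", "musiclang_predict", "torch", "torchaudio", "numpy", "pydub", "audiocraft"):
    try:
        importlib.import_module(mod)
        print(f"ok: {mod}")
    except ImportError as exc:
        print(f"missing: {mod} ({exc})")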