moving the audiogen tab before musicgen
Browse files
app.py
CHANGED
@@ -188,6 +188,7 @@ def get_audio_info(audio_path):
|
|
188 |
return "No tags found. Either the file is not generated by MusicGen+ V1.2.7 and higher or the tags are corrupted. (Discord removes metadata from mp4 and wav files, so you can't use them)"
|
189 |
json_string = song.tags['COMMENT'][0]
|
190 |
data = json.loads(json_string)
|
|
|
191 |
global_prompt = str("\nGlobal Prompt: " + (data['global_prompt'] if data['global_prompt'] != "" else "none")) if 'global_prompt' in data else ""
|
192 |
bpm = str("\nBPM: " + data['bpm']) if 'bpm' in data else ""
|
193 |
key = str("\nKey: " + data['key']) if 'key' in data else ""
|
@@ -212,13 +213,15 @@ def get_audio_info(audio_path):
|
|
212 |
version = str("Version: " + data['version']) if 'version' in data else "Version: Unknown"
|
213 |
info = str(version + global_prompt + bpm + key + scale + prompts + duration + overlap + seed + audio_mode + input_length + channel + sr_select + model + custom_model + base_model + decoder + topk + topp + temperature + cfg_coef)
|
214 |
if info == "":
|
215 |
-
return "No tags found. Either the file is not generated by
|
|
|
216 |
return info
|
217 |
else:
|
218 |
with open(audio_path.name) as json_file:
|
219 |
data = json.load(json_file)
|
220 |
#if 'global_prompt' not in data:
|
221 |
-
#return "No tags found. Either the file is not generated by
|
|
|
222 |
global_prompt = str("\nGlobal Prompt: " + (data['global_prompt'] if data['global_prompt'] != "" else "none")) if 'global_prompt' in data else ""
|
223 |
bpm = str("\nBPM: " + data['bpm']) if 'bpm' in data else ""
|
224 |
key = str("\nKey: " + data['key']) if 'key' in data else ""
|
@@ -243,7 +246,8 @@ def get_audio_info(audio_path):
|
|
243 |
version = str("Version: " + data['version']) if 'version' in data else "Version: Unknown"
|
244 |
info = str(version + global_prompt + bpm + key + scale + prompts + duration + overlap + seed + audio_mode + input_length + channel + sr_select + model + custom_model + base_model + decoder + topk + topp + temperature + cfg_coef)
|
245 |
if info == "":
|
246 |
-
return "No tags found. Either the file is not generated by
|
|
|
247 |
return info
|
248 |
else:
|
249 |
return "Only .wav ,.mp4 and .json files are supported"
|
@@ -467,7 +471,8 @@ def load_diffusion():
|
|
467 |
global MBD
|
468 |
if MBD is None:
|
469 |
print("loading MBD")
|
470 |
-
MBD = MultiBandDiffusion.
|
|
|
471 |
|
472 |
|
473 |
def unload_diffusion():
|
@@ -852,12 +857,14 @@ def predict_full(gen_type, model, decoder, custom_model, base_model, prompt_amou
|
|
852 |
|
853 |
if gen_type == "music":
|
854 |
model_shrt = model
|
855 |
-
model = "GrandaddyShmax/
|
|
|
856 |
elif gen_type == "audio":
|
857 |
model_shrt = model
|
858 |
model = "GrandaddyShmax/audiogen-" + model
|
859 |
base_model_shrt = base_model
|
860 |
-
base_model = "GrandaddyShmax/
|
|
|
861 |
|
862 |
if MODEL is None or MODEL.name != (model):
|
863 |
load_model(model, custom_model, base_model, gen_type)
|
@@ -957,118 +964,106 @@ def ui_full(launch_kwargs):
|
|
957 |
Welcome to Soundscapes - TulipAI’s flagship Audio Storytelling Toolkit. Designed with modern content creators in mind, our AI-driven platform generates audio sound effects in just minutes tailored to your unique needs.
|
958 |
"""
|
959 |
)
|
960 |
-
with gr.Tab("
|
961 |
gr.Markdown(
|
962 |
"""
|
963 |
-
###
|
964 |
"""
|
965 |
)
|
966 |
with gr.Row():
|
967 |
with gr.Column():
|
968 |
with gr.Tab("Generation"):
|
969 |
with gr.Accordion("Structure Prompts", open=False):
|
970 |
-
with gr.
|
971 |
-
|
972 |
-
|
973 |
-
bpm = gr.Number(label="BPM", value=120, interactive=True, scale=1, precision=0)
|
974 |
-
key = gr.Dropdown(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "Bb", "B"], label="Key", value="C", interactive=True)
|
975 |
-
scale = gr.Dropdown(["Major", "Minor"], label="Scale", value="Major", interactive=True)
|
976 |
-
with gr.Row():
|
977 |
-
global_prompt = gr.Text(label="Global Prompt", interactive=True, scale=3)
|
978 |
with gr.Row():
|
979 |
-
|
980 |
-
#s_mode = gr.Radio(["segmentation", "batch"], value="segmentation", interactive=True, scale=1, label="Generation Mode")
|
981 |
with gr.Column():
|
982 |
-
|
983 |
-
|
984 |
-
|
985 |
-
|
986 |
with gr.Row():
|
987 |
-
|
988 |
-
|
989 |
-
|
990 |
-
|
991 |
-
|
992 |
-
|
993 |
for i in range(max_textboxes):
|
994 |
-
with gr.Row(visible=False) as
|
995 |
-
|
996 |
-
|
997 |
-
|
998 |
-
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
with gr.Row():
|
1004 |
-
|
1005 |
with gr.Row():
|
1006 |
-
|
1007 |
with gr.Row():
|
1008 |
-
|
1009 |
-
gr.Button('\U0001f3b2\ufe0f', scale=1).click(fn=lambda: -1, outputs=[
|
1010 |
-
|
1011 |
|
1012 |
with gr.Tab("Audio"):
|
1013 |
with gr.Row():
|
1014 |
with gr.Column():
|
1015 |
-
|
1016 |
-
|
1017 |
with gr.Row():
|
1018 |
-
|
1019 |
-
|
1020 |
-
|
1021 |
|
1022 |
with gr.Tab("Customization"):
|
1023 |
with gr.Row():
|
1024 |
with gr.Column():
|
1025 |
-
|
1026 |
-
|
1027 |
-
|
1028 |
with gr.Column():
|
1029 |
-
|
1030 |
with gr.Row():
|
1031 |
-
|
1032 |
-
|
1033 |
|
1034 |
with gr.Tab("Settings"):
|
1035 |
with gr.Row():
|
1036 |
-
|
1037 |
-
|
1038 |
-
with gr.Row():
|
1039 |
-
model = gr.Radio(["melody", "small", "medium", "large", "custom"], label="Model", value="large", interactive=True, scale=1)
|
1040 |
-
with gr.Column():
|
1041 |
-
dropdown = gr.Dropdown(choices=get_available_models(), value=("No models found" if len(get_available_models()) < 1 else get_available_models()[0]), label='Custom Model (models folder)', elem_classes='slim-dropdown', interactive=True)
|
1042 |
-
ui.create_refresh_button(dropdown, lambda: None, lambda: {'choices': get_available_models()}, 'refresh-button')
|
1043 |
-
basemodel = gr.Radio(["small", "medium", "melody", "large"], label="Base Model", value="medium", interactive=True, scale=1)
|
1044 |
with gr.Row():
|
1045 |
-
|
|
|
1046 |
with gr.Row():
|
1047 |
-
|
1048 |
-
|
1049 |
-
|
1050 |
-
|
1051 |
with gr.Row():
|
1052 |
-
|
1053 |
-
# Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
|
1054 |
_ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
|
1055 |
-
with gr.Column()
|
1056 |
with gr.Tab("Output"):
|
1057 |
-
|
1058 |
with gr.Row():
|
1059 |
-
|
1060 |
-
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
with gr.Tab("Wiki"):
|
1065 |
gr.Markdown(
|
1066 |
"""
|
1067 |
- **[Generate (button)]:**
|
1068 |
-
Generates the
|
1069 |
|
1070 |
- **[Interrupt (button)]:**
|
1071 |
-
Stops the
|
1072 |
|
1073 |
---
|
1074 |
|
@@ -1082,30 +1077,21 @@ def ui_full(launch_kwargs):
|
|
1082 |
- **[Structure Prompts (checkbox)]:**
|
1083 |
Enable/Disable the structure prompts feature.
|
1084 |
|
1085 |
-
- **[BPM (number)]:**
|
1086 |
-
Beats per minute of the generated music.
|
1087 |
-
|
1088 |
-
- **[Key (dropdown)]:**
|
1089 |
-
The key of the generated music.
|
1090 |
-
|
1091 |
-
- **[Scale (dropdown)]:**
|
1092 |
-
The scale of the generated music.
|
1093 |
-
|
1094 |
- **[Global Prompt (text)]:**
|
1095 |
Here write the prompt that you wish to be used for all prompt segments.
|
1096 |
|
1097 |
#### Multi-Prompt:
|
1098 |
|
1099 |
-
This feature allows you to control the
|
1100 |
-
You have up to 10 prompt segments. the first prompt will always be
|
1101 |
-
the other prompts will be [
|
1102 |
-
for example if the overlap is
|
1103 |
|
1104 |
- **[Prompt Segments (number)]:**
|
1105 |
-
Amount of unique prompt to generate throughout the
|
1106 |
|
1107 |
- **[Prompt/Input Text (prompt)]:**
|
1108 |
-
Here describe the
|
1109 |
|
1110 |
- **[Repeat (number)]:**
|
1111 |
Write how many times this prompt will repeat (instead of wasting another prompt segment on the same prompt).
|
@@ -1117,15 +1103,15 @@ def ui_full(launch_kwargs):
|
|
1117 |
Calculates the timings of the prompt segments.
|
1118 |
|
1119 |
- **[Duration (number)]:**
|
1120 |
-
How long you want the generated
|
1121 |
|
1122 |
- **[Overlap (number)]:**
|
1123 |
How much each new segment will reference the previous segment (in seconds).
|
1124 |
-
For example, if you choose
|
1125 |
-
and will generate only
|
1126 |
|
1127 |
- **[Seed (number)]:**
|
1128 |
-
Your generated
|
1129 |
place the exact seed with the exact prompts
|
1130 |
(This way you can also extend specific song that was generated short).
|
1131 |
|
@@ -1143,16 +1129,12 @@ def ui_full(launch_kwargs):
|
|
1143 |
`File` mode allows you to upload an audio file to use as input
|
1144 |
`Mic` mode allows you to use your microphone as input
|
1145 |
|
1146 |
-
- **[Input Audio Mode (selection)]:**
|
1147 |
-
`Melody` mode only works with the melody model: it conditions the music generation to reference the melody
|
1148 |
-
`Sample` mode works with any model: it gives a music sample to the model to generate its continuation.
|
1149 |
-
|
1150 |
- **[Trim Start and Trim End (numbers)]:**
|
1151 |
`Trim Start` set how much you'd like to trim the input audio from the start
|
1152 |
`Trim End` same as the above but from the end
|
1153 |
|
1154 |
- **[Input Audio (audio file)]:**
|
1155 |
-
Input here the audio you wish to use
|
1156 |
|
1157 |
---
|
1158 |
|
@@ -1187,29 +1169,6 @@ def ui_full(launch_kwargs):
|
|
1187 |
- **[Output Audio Sample Rate (dropdown)]:**
|
1188 |
The output audio sample rate, the model default is 32000.
|
1189 |
|
1190 |
-
- **[Model (selection)]:**
|
1191 |
-
Here you can choose which model you wish to use:
|
1192 |
-
`melody` model is based on the medium model with a unique feature that lets you use melody conditioning
|
1193 |
-
`small` model is trained on 300M parameters
|
1194 |
-
`medium` model is trained on 1.5B parameters
|
1195 |
-
`large` model is trained on 3.3B parameters
|
1196 |
-
`custom` model runs the custom model that you provided.
|
1197 |
-
|
1198 |
-
- **[Custom Model (selection)]:**
|
1199 |
-
This dropdown will show you models that are placed in the `models` folder
|
1200 |
-
you must select `custom` in the model options in order to use it.
|
1201 |
-
|
1202 |
-
- **[Refresh (button)]:**
|
1203 |
-
Refreshes the dropdown list for custom model.
|
1204 |
-
|
1205 |
-
- **[Base Model (selection)]:**
|
1206 |
-
Choose here the model that your custom model is based on.
|
1207 |
-
|
1208 |
-
- **[Decoder (selection)]:**
|
1209 |
-
Choose here the decoder that you wish to use:
|
1210 |
-
`Default` is the default decoder
|
1211 |
-
`MultiBand_Diffusion` is a decoder that uses diffusion to generate the audio.
|
1212 |
-
|
1213 |
- **[Top-k (number)]:**
|
1214 |
is a parameter used in text generation models, including music generation models. It determines the number of most likely next tokens to consider at each step of the generation process. The model ranks all possible tokens based on their predicted probabilities, and then selects the top-k tokens from the ranked list. The model then samples from this reduced set of tokens to determine the next token in the generated sequence. A smaller value of k results in a more focused and deterministic output, while a larger value of k allows for more diversity in the generated music.
|
1215 |
|
@@ -1223,106 +1182,118 @@ def ui_full(launch_kwargs):
|
|
1223 |
refers to a technique used in some music generation models where a separate classifier network is trained to provide guidance or control over the generated music. This classifier is trained on labeled data to recognize specific musical characteristics or styles. During the generation process, the output of the generator model is evaluated by the classifier, and the generator is encouraged to produce music that aligns with the desired characteristics or style. This approach allows for more fine-grained control over the generated music, enabling users to specify certain attributes they want the model to capture.
|
1224 |
"""
|
1225 |
)
|
1226 |
-
with gr.Tab("
|
1227 |
gr.Markdown(
|
1228 |
"""
|
1229 |
-
###
|
1230 |
"""
|
1231 |
)
|
1232 |
with gr.Row():
|
1233 |
with gr.Column():
|
1234 |
with gr.Tab("Generation"):
|
1235 |
with gr.Accordion("Structure Prompts", open=False):
|
1236 |
-
with gr.
|
1237 |
-
|
1238 |
-
|
|
|
|
|
|
|
|
|
|
|
1239 |
with gr.Row():
|
1240 |
-
|
|
|
1241 |
with gr.Column():
|
1242 |
-
|
1243 |
-
|
1244 |
-
|
1245 |
-
|
1246 |
with gr.Row():
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
-
|
1252 |
-
|
1253 |
for i in range(max_textboxes):
|
1254 |
-
with gr.Row(visible=False) as
|
1255 |
-
|
1256 |
-
|
1257 |
-
|
1258 |
-
|
1259 |
-
|
1260 |
-
|
1261 |
-
|
1262 |
-
|
1263 |
with gr.Row():
|
1264 |
-
|
1265 |
with gr.Row():
|
1266 |
-
|
1267 |
with gr.Row():
|
1268 |
-
|
1269 |
-
gr.Button('\U0001f3b2\ufe0f', scale=1).click(fn=lambda: -1, outputs=[
|
1270 |
-
|
1271 |
|
1272 |
with gr.Tab("Audio"):
|
1273 |
with gr.Row():
|
1274 |
with gr.Column():
|
1275 |
-
|
1276 |
-
|
1277 |
with gr.Row():
|
1278 |
-
|
1279 |
-
|
1280 |
-
|
1281 |
|
1282 |
with gr.Tab("Customization"):
|
1283 |
with gr.Row():
|
1284 |
with gr.Column():
|
1285 |
-
|
1286 |
-
|
1287 |
-
|
1288 |
with gr.Column():
|
1289 |
-
|
1290 |
with gr.Row():
|
1291 |
-
|
1292 |
-
|
1293 |
|
1294 |
with gr.Tab("Settings"):
|
1295 |
with gr.Row():
|
1296 |
-
|
1297 |
-
|
1298 |
with gr.Row():
|
1299 |
-
|
1300 |
-
|
|
|
|
|
|
|
1301 |
with gr.Row():
|
1302 |
-
|
1303 |
-
|
1304 |
-
|
1305 |
-
|
|
|
|
|
1306 |
with gr.Row():
|
1307 |
-
|
|
|
1308 |
_ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
|
1309 |
-
with gr.Column():
|
1310 |
with gr.Tab("Output"):
|
1311 |
-
|
1312 |
with gr.Row():
|
1313 |
-
|
1314 |
-
|
1315 |
-
|
1316 |
-
|
1317 |
-
|
1318 |
with gr.Tab("Wiki"):
|
1319 |
gr.Markdown(
|
1320 |
"""
|
1321 |
- **[Generate (button)]:**
|
1322 |
-
Generates the
|
1323 |
|
1324 |
- **[Interrupt (button)]:**
|
1325 |
-
Stops the
|
1326 |
|
1327 |
---
|
1328 |
|
@@ -1336,21 +1307,30 @@ def ui_full(launch_kwargs):
|
|
1336 |
- **[Structure Prompts (checkbox)]:**
|
1337 |
Enable/Disable the structure prompts feature.
|
1338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1339 |
- **[Global Prompt (text)]:**
|
1340 |
Here write the prompt that you wish to be used for all prompt segments.
|
1341 |
|
1342 |
#### Multi-Prompt:
|
1343 |
|
1344 |
-
This feature allows you to control the
|
1345 |
-
You have up to 10 prompt segments. the first prompt will always be
|
1346 |
-
the other prompts will be [
|
1347 |
-
for example if the overlap is
|
1348 |
|
1349 |
- **[Prompt Segments (number)]:**
|
1350 |
-
Amount of unique prompt to generate throughout the
|
1351 |
|
1352 |
- **[Prompt/Input Text (prompt)]:**
|
1353 |
-
Here describe the
|
1354 |
|
1355 |
- **[Repeat (number)]:**
|
1356 |
Write how many times this prompt will repeat (instead of wasting another prompt segment on the same prompt).
|
@@ -1362,15 +1342,15 @@ def ui_full(launch_kwargs):
|
|
1362 |
Calculates the timings of the prompt segments.
|
1363 |
|
1364 |
- **[Duration (number)]:**
|
1365 |
-
How long you want the generated
|
1366 |
|
1367 |
- **[Overlap (number)]:**
|
1368 |
How much each new segment will reference the previous segment (in seconds).
|
1369 |
-
For example, if you choose
|
1370 |
-
and will generate only
|
1371 |
|
1372 |
- **[Seed (number)]:**
|
1373 |
-
Your generated
|
1374 |
place the exact seed with the exact prompts
|
1375 |
(This way you can also extend specific song that was generated short).
|
1376 |
|
@@ -1388,12 +1368,16 @@ def ui_full(launch_kwargs):
|
|
1388 |
`File` mode allows you to upload an audio file to use as input
|
1389 |
`Mic` mode allows you to use your microphone as input
|
1390 |
|
|
|
|
|
|
|
|
|
1391 |
- **[Trim Start and Trim End (numbers)]:**
|
1392 |
`Trim Start` set how much you'd like to trim the input audio from the start
|
1393 |
`Trim End` same as the above but from the end
|
1394 |
|
1395 |
- **[Input Audio (audio file)]:**
|
1396 |
-
Input here the audio you wish to use.
|
1397 |
|
1398 |
---
|
1399 |
|
@@ -1428,6 +1412,29 @@ def ui_full(launch_kwargs):
|
|
1428 |
- **[Output Audio Sample Rate (dropdown)]:**
|
1429 |
The output audio sample rate, the model default is 32000.
|
1430 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1431 |
- **[Top-k (number)]:**
|
1432 |
is a parameter used in text generation models, including music generation models. It determines the number of most likely next tokens to consider at each step of the generation process. The model ranks all possible tokens based on their predicted probabilities, and then selects the top-k tokens from the ranked list. The model then samples from this reduced set of tokens to determine the next token in the generated sequence. A smaller value of k results in a more focused and deterministic output, while a larger value of k allows for more diversity in the generated music.
|
1433 |
|
|
|
188 |
return "No tags found. Either the file is not generated by MusicGen+ V1.2.7 and higher or the tags are corrupted. (Discord removes metadata from mp4 and wav files, so you can't use them)"
|
189 |
json_string = song.tags['COMMENT'][0]
|
190 |
data = json.loads(json_string)
|
191 |
+
|
192 |
global_prompt = str("\nGlobal Prompt: " + (data['global_prompt'] if data['global_prompt'] != "" else "none")) if 'global_prompt' in data else ""
|
193 |
bpm = str("\nBPM: " + data['bpm']) if 'bpm' in data else ""
|
194 |
key = str("\nKey: " + data['key']) if 'key' in data else ""
|
|
|
213 |
version = str("Version: " + data['version']) if 'version' in data else "Version: Unknown"
|
214 |
info = str(version + global_prompt + bpm + key + scale + prompts + duration + overlap + seed + audio_mode + input_length + channel + sr_select + model + custom_model + base_model + decoder + topk + topp + temperature + cfg_coef)
|
215 |
if info == "":
|
216 |
+
return "No tags found. Either the file is not generated by
|
217 |
+
+ V1.2.7 and higher or the tags are corrupted. (Discord removes metadata from mp4 and wav files, so you can't use them)"
|
218 |
return info
|
219 |
else:
|
220 |
with open(audio_path.name) as json_file:
|
221 |
data = json.load(json_file)
|
222 |
#if 'global_prompt' not in data:
|
223 |
+
#return "No tags found. Either the file is not generated by
|
224 |
+
+ V1.2.8a and higher or the tags are corrupted."
|
225 |
global_prompt = str("\nGlobal Prompt: " + (data['global_prompt'] if data['global_prompt'] != "" else "none")) if 'global_prompt' in data else ""
|
226 |
bpm = str("\nBPM: " + data['bpm']) if 'bpm' in data else ""
|
227 |
key = str("\nKey: " + data['key']) if 'key' in data else ""
|
|
|
246 |
version = str("Version: " + data['version']) if 'version' in data else "Version: Unknown"
|
247 |
info = str(version + global_prompt + bpm + key + scale + prompts + duration + overlap + seed + audio_mode + input_length + channel + sr_select + model + custom_model + base_model + decoder + topk + topp + temperature + cfg_coef)
|
248 |
if info == "":
|
249 |
+
return "No tags found. Either the file is not generated by
|
250 |
+
+ V1.2.7 and higher or the tags are corrupted."
|
251 |
return info
|
252 |
else:
|
253 |
return "Only .wav ,.mp4 and .json files are supported"
|
|
|
471 |
global MBD
|
472 |
if MBD is None:
|
473 |
print("loading MBD")
|
474 |
+
MBD = MultiBandDiffusion.get_mbd_
|
475 |
+
()
|
476 |
|
477 |
|
478 |
def unload_diffusion():
|
|
|
857 |
|
858 |
if gen_type == "music":
|
859 |
model_shrt = model
|
860 |
+
model = "GrandaddyShmax/
|
861 |
+
-" + model
|
862 |
elif gen_type == "audio":
|
863 |
model_shrt = model
|
864 |
model = "GrandaddyShmax/audiogen-" + model
|
865 |
base_model_shrt = base_model
|
866 |
+
base_model = "GrandaddyShmax/
|
867 |
+
-" + base_model
|
868 |
|
869 |
if MODEL is None or MODEL.name != (model):
|
870 |
load_model(model, custom_model, base_model, gen_type)
|
|
|
964 |
Welcome to Soundscapes - TulipAI’s flagship Audio Storytelling Toolkit. Designed with modern content creators in mind, our AI-driven platform generates audio sound effects in just minutes tailored to your unique needs.
|
965 |
"""
|
966 |
)
|
967 |
+
with gr.Tab("AudioGen"):
|
968 |
gr.Markdown(
|
969 |
"""
|
970 |
+
### AudioGen
|
971 |
"""
|
972 |
)
|
973 |
with gr.Row():
|
974 |
with gr.Column():
|
975 |
with gr.Tab("Generation"):
|
976 |
with gr.Accordion("Structure Prompts", open=False):
|
977 |
+
with gr.Row():
|
978 |
+
struc_prompts_a = gr.Checkbox(label="Enable", value=False, interactive=True, container=False)
|
979 |
+
global_prompt_a = gr.Text(label="Global Prompt", interactive=True, scale=3)
|
|
|
|
|
|
|
|
|
|
|
980 |
with gr.Row():
|
981 |
+
s_a = gr.Slider(1, max_textboxes, value=1, step=1, label="Prompts:", interactive=True, scale=2)
|
|
|
982 |
with gr.Column():
|
983 |
+
textboxes_a = []
|
984 |
+
prompts_a = []
|
985 |
+
repeats_a = []
|
986 |
+
calcs_a = []
|
987 |
with gr.Row():
|
988 |
+
text0_a = gr.Text(label="Input Text", interactive=True, scale=4)
|
989 |
+
prompts_a.append(text0_a)
|
990 |
+
drag0_a = gr.Number(label="Repeat", value=1, interactive=True, scale=1)
|
991 |
+
repeats_a.append(drag0_a)
|
992 |
+
calc0_a = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
|
993 |
+
calcs_a.append(calc0_a)
|
994 |
for i in range(max_textboxes):
|
995 |
+
with gr.Row(visible=False) as t_a:
|
996 |
+
text_a = gr.Text(label="Input Text", interactive=True, scale=3)
|
997 |
+
repeat_a = gr.Number(label="Repeat", minimum=1, value=1, interactive=True, scale=1)
|
998 |
+
calc_a = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
|
999 |
+
textboxes_a.append(t_a)
|
1000 |
+
prompts_a.append(text_a)
|
1001 |
+
repeats_a.append(repeat_a)
|
1002 |
+
calcs_a.append(calc_a)
|
1003 |
+
to_calc_a = gr.Button("Calculate Timings", variant="secondary")
|
1004 |
with gr.Row():
|
1005 |
+
duration_a = gr.Slider(minimum=1, maximum=300, value=10, step=1, label="Duration", interactive=True)
|
1006 |
with gr.Row():
|
1007 |
+
overlap_a = gr.Slider(minimum=1, maximum=9, value=2, step=1, label="Overlap", interactive=True)
|
1008 |
with gr.Row():
|
1009 |
+
seed_a = gr.Number(label="Seed", value=-1, scale=4, precision=0, interactive=True)
|
1010 |
+
gr.Button('\U0001f3b2\ufe0f', scale=1).click(fn=lambda: -1, outputs=[seed_a], queue=False)
|
1011 |
+
reuse_seed_a = gr.Button('\u267b\ufe0f', scale=1)
|
1012 |
|
1013 |
with gr.Tab("Audio"):
|
1014 |
with gr.Row():
|
1015 |
with gr.Column():
|
1016 |
+
input_type_a = gr.Radio(["file", "mic"], value="file", label="Input Type (optional)", interactive=True)
|
1017 |
+
mode_a = gr.Radio(["sample"], label="Input Audio Mode (optional)", value="sample", interactive=False, visible=False)
|
1018 |
with gr.Row():
|
1019 |
+
trim_start_a = gr.Number(label="Trim Start", value=0, interactive=True)
|
1020 |
+
trim_end_a = gr.Number(label="Trim End", value=0, interactive=True)
|
1021 |
+
audio_a = gr.Audio(source="upload", type="numpy", label="Input Audio (optional)", interactive=True)
|
1022 |
|
1023 |
with gr.Tab("Customization"):
|
1024 |
with gr.Row():
|
1025 |
with gr.Column():
|
1026 |
+
background_a = gr.ColorPicker(value="#0f0f0f", label="background color", interactive=True, scale=0)
|
1027 |
+
bar1_a = gr.ColorPicker(value="#84cc16", label="bar color start", interactive=True, scale=0)
|
1028 |
+
bar2_a = gr.ColorPicker(value="#10b981", label="bar color end", interactive=True, scale=0)
|
1029 |
with gr.Column():
|
1030 |
+
image_a = gr.Image(label="Background Image", type="filepath", interactive=True, scale=4)
|
1031 |
with gr.Row():
|
1032 |
+
height_a = gr.Number(label="Height", value=512, interactive=True)
|
1033 |
+
width_a = gr.Number(label="Width", value=768, interactive=True)
|
1034 |
|
1035 |
with gr.Tab("Settings"):
|
1036 |
with gr.Row():
|
1037 |
+
channel_a = gr.Radio(["mono", "stereo", "stereo effect"], label="Output Audio Channels", value="stereo", interactive=True, scale=1)
|
1038 |
+
sr_select_a = gr.Dropdown(["11025", "16000", "22050", "24000", "32000", "44100", "48000"], label="Output Audio Sample Rate", value="48000", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
1039 |
with gr.Row():
|
1040 |
+
model_a = gr.Radio(["medium"], label="Model", value="medium", interactive=False, visible=False)
|
1041 |
+
decoder_a = gr.Radio(["Default"], label="Decoder", value="Default", interactive=False, visible=False)
|
1042 |
with gr.Row():
|
1043 |
+
topk_a = gr.Number(label="Top-k", value=250, interactive=True)
|
1044 |
+
topp_a = gr.Number(label="Top-p", value=0, interactive=True)
|
1045 |
+
temperature_a = gr.Number(label="Temperature", value=1.0, interactive=True)
|
1046 |
+
cfg_coef_a = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
|
1047 |
with gr.Row():
|
1048 |
+
submit_a = gr.Button("Generate", variant="primary")
|
|
|
1049 |
_ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
|
1050 |
+
with gr.Column():
|
1051 |
with gr.Tab("Output"):
|
1052 |
+
output_a = gr.Video(label="Generated Audio", scale=0)
|
1053 |
with gr.Row():
|
1054 |
+
audio_only_a = gr.Audio(type="numpy", label="Audio Only", interactive=False)
|
1055 |
+
backup_only_a = gr.Audio(type="numpy", label="Backup Audio", interactive=False, visible=False)
|
1056 |
+
send_audio_a = gr.Button("Send to Input Audio")
|
1057 |
+
seed_used_a = gr.Number(label='Seed used', value=-1, interactive=False)
|
1058 |
+
download_a = gr.File(label="Generated Files", interactive=False)
|
1059 |
with gr.Tab("Wiki"):
|
1060 |
gr.Markdown(
|
1061 |
"""
|
1062 |
- **[Generate (button)]:**
|
1063 |
+
Generates the audio with the given settings and prompts.
|
1064 |
|
1065 |
- **[Interrupt (button)]:**
|
1066 |
+
Stops the audio generation as soon as it can, providing an incomplete output.
|
1067 |
|
1068 |
---
|
1069 |
|
|
|
1077 |
- **[Structure Prompts (checkbox)]:**
|
1078 |
Enable/Disable the structure prompts feature.
|
1079 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1080 |
- **[Global Prompt (text)]:**
|
1081 |
Here write the prompt that you wish to be used for all prompt segments.
|
1082 |
|
1083 |
#### Multi-Prompt:
|
1084 |
|
1085 |
+
This feature allows you to control the audio, adding variation to different time segments.
|
1086 |
+
You have up to 10 prompt segments. the first prompt will always be 10s long
|
1087 |
+
the other prompts will be [10s - overlap].
|
1088 |
+
for example if the overlap is 2s, each prompt segment will be 8s.
|
1089 |
|
1090 |
- **[Prompt Segments (number)]:**
|
1091 |
+
Amount of unique prompt to generate throughout the audio generation.
|
1092 |
|
1093 |
- **[Prompt/Input Text (prompt)]:**
|
1094 |
+
Here describe the audio you wish the model to generate.
|
1095 |
|
1096 |
- **[Repeat (number)]:**
|
1097 |
Write how many times this prompt will repeat (instead of wasting another prompt segment on the same prompt).
|
|
|
1103 |
Calculates the timings of the prompt segments.
|
1104 |
|
1105 |
- **[Duration (number)]:**
|
1106 |
+
How long you want the generated audio to be (in seconds).
|
1107 |
|
1108 |
- **[Overlap (number)]:**
|
1109 |
How much each new segment will reference the previous segment (in seconds).
|
1110 |
+
For example, if you choose 2s: Each new segment after the first one will reference the previous segment 2s
|
1111 |
+
and will generate only 8s of new audio. The model can only process 10s of music.
|
1112 |
|
1113 |
- **[Seed (number)]:**
|
1114 |
+
Your generated audio id. If you wish to generate the exact same audio,
|
1115 |
place the exact seed with the exact prompts
|
1116 |
(This way you can also extend specific song that was generated short).
|
1117 |
|
|
|
1129 |
`File` mode allows you to upload an audio file to use as input
|
1130 |
`Mic` mode allows you to use your microphone as input
|
1131 |
|
|
|
|
|
|
|
|
|
1132 |
- **[Trim Start and Trim End (numbers)]:**
|
1133 |
`Trim Start` set how much you'd like to trim the input audio from the start
|
1134 |
`Trim End` same as the above but from the end
|
1135 |
|
1136 |
- **[Input Audio (audio file)]:**
|
1137 |
+
Input here the audio you wish to use.
|
1138 |
|
1139 |
---
|
1140 |
|
|
|
1169 |
- **[Output Audio Sample Rate (dropdown)]:**
|
1170 |
The output audio sample rate, the model default is 32000.
|
1171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1172 |
- **[Top-k (number)]:**
|
1173 |
is a parameter used in text generation models, including music generation models. It determines the number of most likely next tokens to consider at each step of the generation process. The model ranks all possible tokens based on their predicted probabilities, and then selects the top-k tokens from the ranked list. The model then samples from this reduced set of tokens to determine the next token in the generated sequence. A smaller value of k results in a more focused and deterministic output, while a larger value of k allows for more diversity in the generated music.
|
1174 |
|
|
|
1182 |
refers to a technique used in some music generation models where a separate classifier network is trained to provide guidance or control over the generated music. This classifier is trained on labeled data to recognize specific musical characteristics or styles. During the generation process, the output of the generator model is evaluated by the classifier, and the generator is encouraged to produce music that aligns with the desired characteristics or style. This approach allows for more fine-grained control over the generated music, enabling users to specify certain attributes they want the model to capture.
|
1183 |
"""
|
1184 |
)
|
1185 |
+
with gr.Tab("MusicGen"):
|
1186 |
gr.Markdown(
|
1187 |
"""
|
1188 |
+
### MusicGen
|
1189 |
"""
|
1190 |
)
|
1191 |
with gr.Row():
|
1192 |
with gr.Column():
|
1193 |
with gr.Tab("Generation"):
|
1194 |
with gr.Accordion("Structure Prompts", open=False):
|
1195 |
+
with gr.Column():
|
1196 |
+
with gr.Row():
|
1197 |
+
struc_prompts = gr.Checkbox(label="Enable", value=False, interactive=True, container=False)
|
1198 |
+
bpm = gr.Number(label="BPM", value=120, interactive=True, scale=1, precision=0)
|
1199 |
+
key = gr.Dropdown(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "Bb", "B"], label="Key", value="C", interactive=True)
|
1200 |
+
scale = gr.Dropdown(["Major", "Minor"], label="Scale", value="Major", interactive=True)
|
1201 |
+
with gr.Row():
|
1202 |
+
global_prompt = gr.Text(label="Global Prompt", interactive=True, scale=3)
|
1203 |
with gr.Row():
|
1204 |
+
s = gr.Slider(1, max_textboxes, value=1, step=1, label="Prompts:", interactive=True, scale=2)
|
1205 |
+
#s_mode = gr.Radio(["segmentation", "batch"], value="segmentation", interactive=True, scale=1, label="Generation Mode")
|
1206 |
with gr.Column():
|
1207 |
+
textboxes = []
|
1208 |
+
prompts = []
|
1209 |
+
repeats = []
|
1210 |
+
calcs = []
|
1211 |
with gr.Row():
|
1212 |
+
text0 = gr.Text(label="Input Text", interactive=True, scale=4)
|
1213 |
+
prompts.append(text0)
|
1214 |
+
drag0 = gr.Number(label="Repeat", value=1, interactive=True, scale=1)
|
1215 |
+
repeats.append(drag0)
|
1216 |
+
calc0 = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
|
1217 |
+
calcs.append(calc0)
|
1218 |
for i in range(max_textboxes):
|
1219 |
+
with gr.Row(visible=False) as t:
|
1220 |
+
text = gr.Text(label="Input Text", interactive=True, scale=3)
|
1221 |
+
repeat = gr.Number(label="Repeat", minimum=1, value=1, interactive=True, scale=1)
|
1222 |
+
calc = gr.Text(interactive=False, value="00:00 - 00:00", scale=1, label="Time")
|
1223 |
+
textboxes.append(t)
|
1224 |
+
prompts.append(text)
|
1225 |
+
repeats.append(repeat)
|
1226 |
+
calcs.append(calc)
|
1227 |
+
to_calc = gr.Button("Calculate Timings", variant="secondary")
|
1228 |
with gr.Row():
|
1229 |
+
duration = gr.Slider(minimum=1, maximum=300, value=10, step=1, label="Duration", interactive=True)
|
1230 |
with gr.Row():
|
1231 |
+
overlap = gr.Slider(minimum=1, maximum=29, value=12, step=1, label="Overlap", interactive=True)
|
1232 |
with gr.Row():
|
1233 |
+
seed = gr.Number(label="Seed", value=-1, scale=4, precision=0, interactive=True)
|
1234 |
+
gr.Button('\U0001f3b2\ufe0f', scale=1).click(fn=lambda: -1, outputs=[seed], queue=False)
|
1235 |
+
reuse_seed = gr.Button('\u267b\ufe0f', scale=1)
|
1236 |
|
1237 |
with gr.Tab("Audio"):
|
1238 |
with gr.Row():
|
1239 |
with gr.Column():
|
1240 |
+
input_type = gr.Radio(["file", "mic"], value="file", label="Input Type (optional)", interactive=True)
|
1241 |
+
mode = gr.Radio(["melody", "sample"], label="Input Audio Mode (optional)", value="sample", interactive=True)
|
1242 |
with gr.Row():
|
1243 |
+
trim_start = gr.Number(label="Trim Start", value=0, interactive=True)
|
1244 |
+
trim_end = gr.Number(label="Trim End", value=0, interactive=True)
|
1245 |
+
audio = gr.Audio(source="upload", type="numpy", label="Input Audio (optional)", interactive=True)
|
1246 |
|
1247 |
with gr.Tab("Customization"):
|
1248 |
with gr.Row():
|
1249 |
with gr.Column():
|
1250 |
+
background = gr.ColorPicker(value="#0f0f0f", label="background color", interactive=True, scale=0)
|
1251 |
+
bar1 = gr.ColorPicker(value="#84cc16", label="bar color start", interactive=True, scale=0)
|
1252 |
+
bar2 = gr.ColorPicker(value="#10b981", label="bar color end", interactive=True, scale=0)
|
1253 |
with gr.Column():
|
1254 |
+
image = gr.Image(label="Background Image", type="filepath", interactive=True, scale=4)
|
1255 |
with gr.Row():
|
1256 |
+
height = gr.Number(label="Height", value=512, interactive=True)
|
1257 |
+
width = gr.Number(label="Width", value=768, interactive=True)
|
1258 |
|
1259 |
with gr.Tab("Settings"):
|
1260 |
with gr.Row():
|
1261 |
+
channel = gr.Radio(["mono", "stereo", "stereo effect"], label="Output Audio Channels", value="stereo", interactive=True, scale=1)
|
1262 |
+
sr_select = gr.Dropdown(["11025", "16000", "22050", "24000", "32000", "44100", "48000"], label="Output Audio Sample Rate", value="48000", interactive=True)
|
1263 |
with gr.Row():
|
1264 |
+
model = gr.Radio(["melody", "small", "medium", "large", "custom"], label="Model", value="large", interactive=True, scale=1)
|
1265 |
+
with gr.Column():
|
1266 |
+
dropdown = gr.Dropdown(choices=get_available_models(), value=("No models found" if len(get_available_models()) < 1 else get_available_models()[0]), label='Custom Model (models folder)', elem_classes='slim-dropdown', interactive=True)
|
1267 |
+
ui.create_refresh_button(dropdown, lambda: None, lambda: {'choices': get_available_models()}, 'refresh-button')
|
1268 |
+
basemodel = gr.Radio(["small", "medium", "melody", "large"], label="Base Model", value="medium", interactive=True, scale=1)
|
1269 |
with gr.Row():
|
1270 |
+
decoder = gr.Radio(["Default", "MultiBand_Diffusion"], label="Decoder", value="Default", interactive=True)
|
1271 |
+
with gr.Row():
|
1272 |
+
topk = gr.Number(label="Top-k", value=250, interactive=True)
|
1273 |
+
topp = gr.Number(label="Top-p", value=0, interactive=True)
|
1274 |
+
temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
|
1275 |
+
cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
|
1276 |
with gr.Row():
|
1277 |
+
submit = gr.Button("Generate", variant="primary")
|
1278 |
+
# Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
|
1279 |
_ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
|
1280 |
+
with gr.Column() as c:
|
1281 |
with gr.Tab("Output"):
|
1282 |
+
output = gr.Video(label="Generated Music", scale=0)
|
1283 |
with gr.Row():
|
1284 |
+
audio_only = gr.Audio(type="numpy", label="Audio Only", interactive=False)
|
1285 |
+
backup_only = gr.Audio(type="numpy", label="Backup Audio", interactive=False, visible=False)
|
1286 |
+
send_audio = gr.Button("Send to Input Audio")
|
1287 |
+
seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
|
1288 |
+
download = gr.File(label="Generated Files", interactive=False)
|
1289 |
with gr.Tab("Wiki"):
|
1290 |
gr.Markdown(
|
1291 |
"""
|
1292 |
- **[Generate (button)]:**
|
1293 |
+
Generates the music with the given settings and prompts.
|
1294 |
|
1295 |
- **[Interrupt (button)]:**
|
1296 |
+
Stops the music generation as soon as it can, providing an incomplete output.
|
1297 |
|
1298 |
---
|
1299 |
|
|
|
1307 |
- **[Structure Prompts (checkbox)]:**
|
1308 |
Enable/Disable the structure prompts feature.
|
1309 |
|
1310 |
+
- **[BPM (number)]:**
|
1311 |
+
Beats per minute of the generated music.
|
1312 |
+
|
1313 |
+
- **[Key (dropdown)]:**
|
1314 |
+
The key of the generated music.
|
1315 |
+
|
1316 |
+
- **[Scale (dropdown)]:**
|
1317 |
+
The scale of the generated music.
|
1318 |
+
|
1319 |
- **[Global Prompt (text)]:**
|
1320 |
Here write the prompt that you wish to be used for all prompt segments.
|
1321 |
|
1322 |
#### Multi-Prompt:
|
1323 |
|
1324 |
+
This feature allows you to control the music, adding variation to different time segments.
|
1325 |
+
You have up to 10 prompt segments. The first prompt will always be 30s long;
|
1326 |
+
the other prompts will be [30s - overlap].
|
1327 |
+
For example, if the overlap is 10s, each subsequent prompt segment will be 20s.
|
1328 |
|
1329 |
- **[Prompt Segments (number)]:**
|
1330 |
+
Number of unique prompts to use throughout the music generation.
|
1331 |
|
1332 |
- **[Prompt/Input Text (prompt)]:**
|
1333 |
+
Here describe the music you wish the model to generate.
|
1334 |
|
1335 |
- **[Repeat (number)]:**
|
1336 |
Write how many times this prompt will repeat (instead of wasting another prompt segment on the same prompt).
|
|
|
1342 |
Calculates the timings of the prompt segments.
|
1343 |
|
1344 |
- **[Duration (number)]:**
|
1345 |
+
How long you want the generated music to be (in seconds).
|
1346 |
|
1347 |
- **[Overlap (number)]:**
|
1348 |
How much each new segment will reference the previous segment (in seconds).
|
1349 |
+
For example, if you choose 20s: each new segment after the first one will reference the last 20s of the previous segment
|
1350 |
+
and will generate only 10s of new music. The model can only process 30s of music.
|
1351 |
|
1352 |
- **[Seed (number)]:**
|
1353 |
+
The ID of your generated music. If you wish to generate the exact same music,
|
1354 |
place the exact seed with the exact prompts
|
1355 |
(This way you can also extend a specific song that was generated too short).
|
1356 |
|
|
|
1368 |
`File` mode allows you to upload an audio file to use as input
|
1369 |
`Mic` mode allows you to use your microphone as input
|
1370 |
|
1371 |
+
- **[Input Audio Mode (selection)]:**
|
1372 |
+
`Melody` mode only works with the melody model: it conditions the music generation to reference the melody
|
1373 |
+
`Sample` mode works with any model: it gives a music sample to the model to generate its continuation.
|
1374 |
+
|
1375 |
- **[Trim Start and Trim End (numbers)]:**
|
1376 |
`Trim Start` sets how much you'd like to trim the input audio from the start
|
1377 |
`Trim End` same as the above but from the end
|
1378 |
|
1379 |
- **[Input Audio (audio file)]:**
|
1380 |
+
Input here the audio you wish to use with "melody" or "sample" mode.
|
1381 |
|
1382 |
---
|
1383 |
|
|
|
1412 |
- **[Output Audio Sample Rate (dropdown)]:**
|
1413 |
The output audio sample rate, the model default is 32000.
|
1414 |
|
1415 |
+
- **[Model (selection)]:**
|
1416 |
+
Here you can choose which model you wish to use:
|
1417 |
+
`melody` model is based on the medium model with a unique feature that lets you use melody conditioning
|
1418 |
+
`small` model has 300M parameters
|
1419 |
+
`medium` model has 1.5B parameters
|
1420 |
+
`large` model has 3.3B parameters
|
1421 |
+
`custom` model runs the custom model that you provided.
|
1422 |
+
|
1423 |
+
- **[Custom Model (selection)]:**
|
1424 |
+
This dropdown shows the models that are placed in the `models` folder;
|
1425 |
+
you must select `custom` in the model options in order to use it.
|
1426 |
+
|
1427 |
+
- **[Refresh (button)]:**
|
1428 |
+
Refreshes the dropdown list for custom model.
|
1429 |
+
|
1430 |
+
- **[Base Model (selection)]:**
|
1431 |
+
Choose here the model that your custom model is based on.
|
1432 |
+
|
1433 |
+
- **[Decoder (selection)]:**
|
1434 |
+
Choose here the decoder that you wish to use:
|
1435 |
+
`Default` is the default decoder
|
1436 |
+
`MultiBand_Diffusion` is a decoder that uses diffusion to generate the audio.
|
1437 |
+
|
1438 |
- **[Top-k (number)]:**
|
1439 |
is a parameter used in text generation models, including music generation models. It determines the number of most likely next tokens to consider at each step of the generation process. The model ranks all possible tokens based on their predicted probabilities, and then selects the top-k tokens from the ranked list. The model then samples from this reduced set of tokens to determine the next token in the generated sequence. A smaller value of k results in a more focused and deterministic output, while a larger value of k allows for more diversity in the generated music.
|
1440 |
|