Merge the contributions from Tango 2 full and the changes on Tango 2

#8
Files changed (1) hide show
  1. app.py +184 -54
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  import json
3
  import torch
4
  import wavio
@@ -23,7 +24,6 @@ from tqdm import tqdm
23
 
24
 
25
 
26
-
27
  class Tango2Pipeline(DiffusionPipeline):
28
 
29
 
@@ -169,6 +169,7 @@ class Tango2Pipeline(DiffusionPipeline):
169
 
170
  return AudioPipelineOutput(audios=wave)
171
 
 
172
 
173
  # Automatic device detection
174
  if torch.cuda.is_available():
@@ -249,21 +250,73 @@ pipe = Tango2Pipeline(vae=tango.vae,
249
  scheduler=tango.scheduler
250
  )
251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  @spaces.GPU(duration=60)
254
- def gradio_generate(prompt, output_format, steps, guidance):
255
- output_wave = pipe(prompt,steps,guidance) ## Using pipeliine automatically uses flash attention for torch2.0 above
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  #output_wave = tango.generate(prompt, steps, guidance)
257
  # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
258
- output_wave = output_wave.audios[0]
259
- output_filename = "temp.wav"
260
- wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
261
-
262
- if (output_format == "mp3"):
263
- AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
264
- output_filename = "temp.mp3"
265
 
266
- return output_filename
 
 
 
 
267
 
268
  # description_text = """
269
  # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -285,53 +338,130 @@ def gradio_generate(prompt, output_format, steps, guidance):
285
  # <p/>
286
  # """
287
  description_text = """
 
288
  <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
289
  Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
290
  <br/><br/> This is the demo for Tango2 for text to audio generation: <a href="https://arxiv.org/abs/2404.09956">Read our paper.</a>
291
  <p/>
292
  """
293
- # Gradio input and output components
294
- input_text = gr.Textbox(lines=2, label="Prompt")
295
- output_format = gr.Radio(label = "Output format", info = "The file you can dowload", choices = ["mp3", "wav"], value = "wav")
296
- output_audio = gr.Audio(label="Generated Audio", type="filepath")
297
- denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
298
- guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
299
 
300
  # Gradio interface
301
- gr_interface = gr.Interface(
302
- fn=gradio_generate,
303
- inputs=[input_text, output_format, denoising_steps, guidance_scale],
304
- outputs=[output_audio],
305
- title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
306
- description=description_text,
307
- allow_flagging=False,
308
- examples=[
309
- ["Quiet speech and then and airplane flying away"],
310
- ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
311
- ["Ducks quack and water splashes with some animal screeching in the background"],
312
- ["Describe the sound of the ocean"],
313
- ["A woman and a baby are having a conversation"],
314
- ["A man speaks followed by a popping noise and laughter"],
315
- ["A cup is filled from a faucet"],
316
- ["An audience cheering and clapping"],
317
- ["Rolling thunder with lightning strikes"],
318
- ["A dog barking and a cat mewing and a racing car passes by"],
319
- ["Gentle water stream, birds chirping and sudden gun shot"],
320
- ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
321
- ["A dog barking"],
322
- ["A cat meowing"],
323
- ["Wooden table tapping sound while water pouring"],
324
- ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
325
- ["two gunshots followed by birds flying away while chirping"],
326
- ["Whistling with birds chirping"],
327
- ["A person snoring"],
328
- ["Motor vehicles are driving with loud engines and a person whistles"],
329
- ["People cheering in a stadium while thunder and lightning strikes"],
330
- ["A helicopter is in flight"],
331
- ["A dog barking and a man talking and a racing car passes by"],
332
- ],
333
- cache_examples="lazy", # Turn on to cache.
334
- )
335
-
336
- # Launch Gradio app
337
- gr_interface.queue(10).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import random
3
  import json
4
  import torch
5
  import wavio
 
24
 
25
 
26
 
 
27
  class Tango2Pipeline(DiffusionPipeline):
28
 
29
 
 
169
 
170
  return AudioPipelineOutput(audios=wave)
171
 
172
+ max_64_bit_int = 2**63 - 1
173
 
174
  # Automatic device detection
175
  if torch.cuda.is_available():
 
250
  scheduler=tango.scheduler
251
  )
252
 
253
+
254
+ def update_seed(is_randomize_seed, seed):
255
+ if is_randomize_seed:
256
+ return random.randint(0, max_64_bit_int)
257
+ return seed
258
+
259
+ def check(
260
+ prompt,
261
+ output_format,
262
+ output_number,
263
+ steps,
264
+ guidance,
265
+ is_randomize_seed,
266
+ seed
267
+ ):
268
+ if prompt is None or prompt == "":
269
+ raise gr.Error("Please provide a prompt input.")
270
+ if not output_number in [1, 2, 3]:
271
+ raise gr.Error("Please ask for 1, 2 or 3 output files.")
272
+
273
+ def update_output(output_format, output_number):
274
+ return [
275
+ gr.update(format = output_format),
276
+ gr.update(format = output_format, visible = (2 <= output_number)),
277
+ gr.update(format = output_format, visible = (output_number == 3))
278
+ ]
279
+
280
+ def generate_output(output_wave, output_format, output_number, output_index):
281
+ if (output_number < output_index):
282
+ return gr.update(format = output_format, visible = False)
283
+
284
+ output_wave = output_wave.audios[output_index - 1]
285
+ output_filename = "tmp" + str(output_index) + ".wav"
286
+ wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
287
+
288
+ if (output_format == "mp3"):
289
+ AudioSegment.from_wav("tmp" + str(output_index) + ".wav").export("tmp" + str(output_index) + ".mp3", format = "mp3")
290
+ output_filename = "tmp" + str(output_index) + ".mp3"
291
+
292
+ return gr.update(value = output_filename, format = output_format, visible = True)
293
 
294
  @spaces.GPU(duration=60)
295
+ def gradio_generate(
296
+ prompt,
297
+ output_format,
298
+ output_number,
299
+ steps,
300
+ guidance,
301
+ is_randomize_seed,
302
+ seed
303
+ ):
304
+ if seed is None:
305
+ seed = random.randint(0, max_64_bit_int)
306
+
307
+ random.seed(seed)
308
+ torch.manual_seed(seed)
309
+
310
+ output_wave = pipe(prompt, steps, guidance, samples = output_number) ## Using pipeline automatically uses flash attention for torch2.0 above
311
+
312
  #output_wave = tango.generate(prompt, steps, guidance)
313
  # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
 
 
 
 
 
 
 
314
 
315
+ return [
316
+ generate_output(output_wave, output_format, output_number, 1),
317
+ generate_output(output_wave, output_format, output_number, 2),
318
+ generate_output(output_wave, output_format, output_number, 3)
319
+ ]
320
 
321
  # description_text = """
322
  # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 
338
  # <p/>
339
  # """
340
  description_text = """
341
+ <h1><center>Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization</center></h1>
342
  <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
343
  Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
344
  <br/><br/> This is the demo for Tango2 for text to audio generation: <a href="https://arxiv.org/abs/2404.09956">Read our paper.</a>
345
  <p/>
346
  """
 
 
 
 
 
 
347
 
348
  # Gradio interface
349
+ with gr.Blocks() as interface:
350
+ gr.HTML(description_text)
351
+ with gr.Row():
352
+ with gr.Column():
353
+ input_text = gr.Textbox(lines=2, label="Prompt")
354
+ output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
355
+ output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 1, step = 1, interactive = True)
356
+ denoising_steps = gr.Slider(minimum=10, maximum=200, value=100, step=1, label="Steps", interactive=True)
357
+ guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
358
+ randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
359
+ seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
360
+
361
+ submit = gr.Button("Generate", variant = "primary")
362
+
363
+ with gr.Column():
364
+ output_audio_1 = gr.Audio(label = "Generated Audio #1/3", format = "wav", type="numpy")
365
+ output_audio_2 = gr.Audio(label = "Generated Audio #2/3", format = "wav", type="numpy")
366
+ output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="numpy")
367
+
368
+ submit.click(fn = update_seed, inputs = [
369
+ randomize_seed,
370
+ seed
371
+ ], outputs = [
372
+ seed
373
+ ], queue = False, show_progress = False).then(fn = check, inputs = [
374
+ input_text,
375
+ output_format,
376
+ output_number,
377
+ denoising_steps,
378
+ guidance_scale,
379
+ randomize_seed,
380
+ seed
381
+ ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
382
+ output_format,
383
+ output_number
384
+ ], outputs = [
385
+ output_audio_1,
386
+ output_audio_2,
387
+ output_audio_3
388
+ ], queue = False, show_progress = False).success(fn = gradio_generate, inputs = [
389
+ input_text,
390
+ output_format,
391
+ output_number,
392
+ denoising_steps,
393
+ guidance_scale,
394
+ randomize_seed,
395
+ seed
396
+ ], outputs = [
397
+ output_audio_1,
398
+ output_audio_2,
399
+ output_audio_3
400
+ ], scroll_to_output = True)
401
+
402
+ gr.Examples(
403
+ fn = gradio_generate,
404
+ inputs = [
405
+ input_text,
406
+ output_format,
407
+ output_number,
408
+ denoising_steps,
409
+ guidance_scale,
410
+ randomize_seed,
411
+ seed
412
+ ],
413
+ outputs = [
414
+ output_audio_1,
415
+ output_audio_2,
416
+ output_audio_3
417
+ ],
418
+ examples = [
419
+ ["Quiet speech and then airplane flying away", "wav", 3, 200, 3, False, 123],
420
+ ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing", "wav", 3, 200, 3, False, 123],
421
+ ["Ducks quack and water splashes with some animal screeching in the background", "wav", 3, 200, 3, False, 123],
422
+ ["Describe the sound of the ocean", "wav", 3, 200, 3, False, 123],
423
+ ["A woman and a baby are having a conversation", "wav", 3, 200, 3, False, 123],
424
+ ["A man speaks followed by a popping noise and laughter", "wav", 3, 200, 3, False, 123],
425
+ ["A cup is filled from a faucet", "wav", 3, 200, 3, False, 123],
426
+ ["An audience cheering and clapping", "wav", 3, 200, 3, False, 123],
427
+ ["Rolling thunder with lightning strikes", "wav", 3, 200, 3, False, 123],
428
+ ["A dog barking and a cat mewing and a racing car passes by", "wav", 3, 200, 3, False, 123],
429
+ ["Gentle water stream, birds chirping and sudden gun shot", "wav", 3, 200, 3, False, 123],
430
+ ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone.", "wav", 3, 200, 3, False, 123],
431
+ ["A dog barking", "wav", 3, 200, 3, False, 123],
432
+ ["A cat meowing", "wav", 3, 200, 3, False, 123],
433
+ ["Wooden table tapping sound while water pouring", "wav", 3, 200, 3, False, 123],
434
+ ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker", "wav", 3, 200, 3, False, 123],
435
+ ["two gunshots followed by birds flying away while chirping", "wav", 3, 200, 3, False, 123],
436
+ ["Whistling with birds chirping", "wav", 3, 200, 3, False, 123],
437
+ ["A person snoring", "wav", 3, 200, 3, False, 123],
438
+ ["Motor vehicles are driving with loud engines and a person whistles", "wav", 3, 200, 3, False, 123],
439
+ ["People cheering in a stadium while thunder and lightning strikes", "wav", 3, 200, 3, False, 123],
440
+ ["A helicopter is in flight", "wav", 3, 200, 3, False, 123],
441
+ ["A dog barking and a man talking and a racing car passes by", "wav", 3, 200, 3, False, 123],
442
+ ],
443
+ cache_examples = "lazy",
444
+ )
445
+
446
+ gr.Markdown(
447
+ """
448
+ ## How to prompt your sound
449
+ You can use round brackets to increase the importance of a part:
450
+ ```
451
+ Peaceful and (calming) ambient music with singing bowl and other instruments
452
+ ```
453
+ You can use several levels of round brackets to increase the importance of a part even more:
454
+ ```
455
+ (Peaceful) and ((calming)) ambient music with singing bowl and other instruments
456
+ ```
457
+ You can use a number instead of several round brackets:
458
+ ```
459
+ (Peaceful:1.5) and ((calming)) ambient music with singing bowl and other instruments
460
+ ```
461
+ You can do the same thing with square brackets to decrease the importance of a part:
462
+ ```
463
+ (Peaceful:1.5) and ((calming)) ambient music with [singing:2] bowl and other instruments
464
+ """
465
+ )
466
+
467
+ interface.queue(10).launch()