Use diffusers backend

#39 by sanchit-gandhi
Files changed (3):
  1. app.py +148 -243
  2. packages.txt +1 -0
  3. requirements.txt +4 -8
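
This PR replaces the Space's custom `audioldm2` inference stack with the `AudioLDM2Pipeline` from 🧨 Diffusers, adds negative-prompt and duration controls to the UI, and slims down the pinned requirements.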
app.py CHANGED
@@ -1,184 +1,119 @@
-from huggingface_hub import hf_hub_download
-import torch
-import os
-
 import gradio as gr
-from audioldm2 import text_to_audio, build_model
+import torch
+from diffusers import AudioLDM2Pipeline
 from share_btn import community_icon_html, loading_icon_html, share_js
 
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
 
-default_checkpoint="audioldm2-full"
-audioldm = None
-current_model_name = None
+# make Space compatible with CPU duplicates
+if torch.cuda.is_available():
+    device = "cuda"
+    torch_dtype = torch.float16
+else:
+    device = "cpu"
+    torch_dtype = torch.float32
+
+# load the diffusers pipeline
+repo_id = "cvssp/audioldm2"
+pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
+# pipe.unet = torch.compile(pipe.unet)
+
+# set the generator for reproducibility
+generator = torch.Generator(device)
 
-def text2audio(
-    text,
-    guidance_scale,
-    random_seed,
-    n_candidates,
-    model_name=default_checkpoint,
-):
-    global audioldm, current_model_name
-    torch.set_float32_matmul_precision("high")
-
-    if audioldm is None or model_name != current_model_name:
-        audioldm = build_model(model_name=model_name)
-        current_model_name = model_name
-        audioldm = torch.compile(audioldm)
-
-    # print(text, length, guidance_scale)
-    waveform = text_to_audio(
-        latent_diffusion=audioldm,
-        text=text,
-        seed=random_seed,
-        duration=10,
+def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
+    if text is None:
+        raise gr.Error("Please provide a text input.")
+
+    waveforms = pipe(
+        text,
+        audio_length_in_s=duration,
         guidance_scale=guidance_scale,
-        n_candidate_gen_per_text=int(n_candidates),
-    ) # [bs, 1, samples]
-    waveform = [
-        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
-    ]
-    # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
-    if len(waveform) == 1:
-        waveform = waveform[0]
-    return waveform
+        num_inference_steps=200,
+        negative_prompt=negative_prompt,
+        num_waveforms_per_prompt=n_candidates if n_candidates else 1,
+        generator=generator.manual_seed(int(random_seed)),
+    )["audios"]
+
+    return gr.make_waveform((16000, waveforms[0]), bg_image="bg.png")
+
 
 css = """
         a {
-            color: inherit;
-            text-decoration: underline;
-        }
-        .gradio-container {
+            color: inherit; text-decoration: underline;
+        } .gradio-container {
             font-family: 'IBM Plex Sans', sans-serif;
-            max-width: 730px !important;
-        }
-        .gr-button {
-            color: white;
-            border-color: #000000;
-            background: #000000;
-        }
-        input[type='range'] {
+        } .gr-button {
+            color: white; border-color: #000000; background: #000000;
+        } input[type='range'] {
             accent-color: #000000;
-        }
-        .dark input[type='range'] {
+        } .dark input[type='range'] {
             accent-color: #dfdfdf;
-        }
-        .container {
-            margin: auto;
-            padding-top: 1.5rem;
-        }
-        #gallery {
-            min-height: 22rem;
-            margin-bottom: 15px;
-            margin-left: auto;
-            margin-right: auto;
-            border-bottom-right-radius: .5rem !important;
-            border-bottom-left-radius: .5rem !important;
-        }
-        #gallery>div>.h-full {
+        } .container {
+            max-width: 730px; margin: auto; padding-top: 1.5rem;
+        } #gallery {
+            min-height: 22rem; margin-bottom: 15px; margin-left: auto; margin-right: auto; border-bottom-right-radius:
+            .5rem !important; border-bottom-left-radius: .5rem !important;
+        } #gallery>div>.h-full {
             min-height: 20rem;
-        }
-        .details:hover {
+        } .details:hover {
            text-decoration: underline;
-        }
-        .gr-button {
+        } .gr-button {
            white-space: nowrap;
-        }
-        .gr-button:focus {
-            border-color: rgb(147 197 253 / var(--tw-border-opacity));
-            outline: none;
-            box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
-            --tw-border-opacity: 1;
-            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
-            --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
-            --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
-            --tw-ring-opacity: .5;
-        }
-        #advanced-btn {
-            font-size: .7rem !important;
-            line-height: 19px;
-            margin-top: 12px;
-            margin-bottom: 12px;
-            padding: 2px 8px;
+        } .gr-button:focus {
+            border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none; box-shadow:
+            var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); --tw-border-opacity: 1;
+            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width)
+            var(--tw-ring-offset-color); --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px
+            var(--tw-ring-offset-width)) var(--tw-ring-color); --tw-ring-color: rgb(191 219 254 /
+            var(--tw-ring-opacity)); --tw-ring-opacity: .5;
+        } #advanced-btn {
+            font-size: .7rem !important; line-height: 19px; margin-top: 12px; margin-bottom: 12px; padding: 2px 8px;
            border-radius: 14px !important;
-        }
-        #advanced-options {
+        } #advanced-options {
            margin-bottom: 20px;
-        }
-        .footer {
-            margin-bottom: 45px;
-            margin-top: 35px;
-            text-align: center;
-            border-bottom: 1px solid #e5e5e5;
-        }
-        .footer>p {
-            font-size: .8rem;
-            display: inline-block;
-            padding: 0 10px;
-            transform: translateY(10px);
-            background: white;
-        }
-        .dark .footer {
+        } .footer {
+            margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5;
+        } .footer>p {
+            font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white;
+        } .dark .footer {
            border-color: #303030;
-        }
-        .dark .footer>p {
+        } .dark .footer>p {
            background: #0b0f19;
-        }
-        .acknowledgments h4{
-            margin: 1.25em 0 .25em 0;
-            font-weight: bold;
-            font-size: 115%;
-        }
-        #container-advanced-btns{
-            display: flex;
-            flex-wrap: wrap;
-            justify-content: space-between;
-            align-items: center;
-        }
-        .animate-spin {
+        } .acknowledgments h4{
+            margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%;
+        } #container-advanced-btns{
+            display: flex; flex-wrap: wrap; justify-content: space-between; align-items: center;
+        } .animate-spin {
            animation: spin 1s linear infinite;
-        }
-        @keyframes spin {
+        } @keyframes spin {
            from {
                transform: rotate(0deg);
-            }
-            to {
+            } to {
                transform: rotate(360deg);
            }
-        }
-        #share-btn-container {
-            display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
-            margin-top: 10px;
-            margin-left: auto;
-        }
-        #share-btn {
-            all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
-        }
-        #share-btn * {
+        } #share-btn-container {
+            display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color:
+            #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
+            margin-top: 10px; margin-left: auto;
+        } #share-btn {
+            all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif;
+            margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem
+            !important;right:0;
+        } #share-btn * {
            all: unset;
-        }
-        #share-btn-container div:nth-child(-n+2){
-            width: auto !important;
-            min-height: 0px !important;
-        }
-        #share-btn-container .wrap {
+        } #share-btn-container div:nth-child(-n+2){
+            width: auto !important; min-height: 0px !important;
+        } #share-btn-container .wrap {
            display: none !important;
-        }
-        .gr-form{
+        } .gr-form{
            flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
-        }
-        #prompt-container{
+        } #prompt-container{
            gap: 0;
-        }
-        #generated_id{
+        } #generated_id{
            min-height: 700px
-        }
-        #setting_id{
-            margin-bottom: 12px;
-            text-align: center;
-            font-weight: 900;
+        } #setting_id{
+            margin-bottom: 12px; text-align: center; font-weight: 900;
        }
 """
 iface = gr.Blocks(css=css)
@@ -189,77 +124,72 @@ with iface:
         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
           <div
             style="
-              display: inline-flex;
-              align-items: center;
-              gap: 0.8rem;
-              font-size: 1.75rem;
+              display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
             "
           >
             <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
               AudioLDM 2: A General Framework for Audio, Music, and Speech Generation
             </h1>
-          </div>
-          <p style="margin-bottom: 10px; font-size: 94%">
-            <a href="https://arxiv.org/abs/2308.05734">[Paper]</a> <a href="https://audioldm.github.io/audioldm2">[Project page]</a> <a href="https://discord.com/invite/b64SEmdf">[Join Discord]</a>
+          </div> <p style="margin-bottom: 10px; font-size: 94%">
+          <a href="https://arxiv.org/abs/2308.05734">[Paper]</a> <a href="https://audioldm.github.io/audioldm2">[Project
+          page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2">[🧨
+          Diffusers]</a>
          </p>
        </div>
        """
    )
    gr.HTML(
        """
-        <p style="display:flex">For faster inference without a queue
-          <a style="margin-left: .5em" href="https://huggingface.co/spaces/haoheliu/audioldm2-text2audio-text2music?duplicate=true">
-            <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        <p/>
+        <p>This is the demo for AudioLDM 2, powered by 🧨 Diffusers. Demo uses the checkpoint <a
+        href="https://huggingface.co/cvssp/audioldm2"> AudioLDM 2 base</a>. For faster inference without waiting in
+        queue, you may duplicate the space and upgrade to a GPU in the settings. <br/> <a
+        href="https://huggingface.co/spaces/haoheliu/audioldm2-text2audio-text2music?duplicate=true"> <img
+        style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> <p/>
        """
    )
+
    with gr.Group():
        with gr.Box():
-            ############# Input
            textbox = gr.Textbox(
-                value="A forest of wind chimes singing a soothing melody in the breeze.",
+                value="The vibrant beat of Brazilian samba drums.",
                max_lines=1,
-                label="Input your prompt here",
+                label="Input text",
                info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
                elem_id="prompt-in",
            )
+            negative_textbox = gr.Textbox(
+                value="Low quality.",
+                max_lines=1,
+                label="Negative prompt",
+                info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
+                elem_id="prompt-in",
+            )
 
            with gr.Accordion("Click to modify detailed configurations", open=False):
                seed = gr.Number(
-                    value=0,
-                    label="Change this value (any integer number) will lead to a different generation result.",
+                    value=45,
+                    label="Seed",
+                    info="Change this value (any integer number) will lead to a different generation result.",
                )
-                # duration = gr.Slider(
-                #     10, 10, value=10, step=2.5, label="Duration (seconds)"
-                # )
+                duration = gr.Slider(5, 15, value=10, step=2.5, label="Duration (seconds)")
                guidance_scale = gr.Slider(
                    0,
-                    6,
+                    7,
                    value=3.5,
                    step=0.5,
-                    label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)",
+                    label="Guidance scale",
+                    info="Larger => better quality and relevancy to text; Smaller => better diversity",
                )
                n_candidates = gr.Slider(
                    1,
-                    3,
+                    5,
                    value=3,
                    step=1,
-                    label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
+                    label="Number waveforms to generate",
+                    info="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually lead to better quality with heavier computation",
                )
-            # model_name = gr.Dropdown(
-            #     ["audioldm-m-text-ft", "audioldm-s-text-ft", "audioldm-m-full","audioldm-s-full-v2", "audioldm-s-full", "audioldm-l-full"], value="audioldm-m-full", label="Choose the model to use. audioldm-m-text-ft and audioldm-s-text-ft are recommanded. -s- means small, -m- means medium and -l- means large",
-            # )
-            ############# Output
-            # outputs=gr.Audio(label="Output", type="numpy")
-            outputs = gr.Video(label="Output", elem_id="output-video")
 
-            # with gr.Group(elem_id="container-advanced-btns"):
-            #     # advanced_button = gr.Button("Advanced options", elem_id="advanced-btn")
-            #     with gr.Group(elem_id="share-btn-container"):
-            #         community_icon = gr.HTML(community_icon_html, visible=False)
-            #         loading_icon = gr.HTML(loading_icon_html, visible=False)
-            #         share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)
-            # outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
+            outputs = gr.Video(label="Output", elem_id="output-video")
            btn = gr.Button("Submit").style(full_width=True)
 
        with gr.Group(elem_id="share-btn-container", visible=False):
@@ -267,11 +197,9 @@ with iface:
            loading_icon = gr.HTML(loading_icon_html)
            share_button = gr.Button("Share to community", elem_id="share-btn")
 
-    # btn.click(text2audio, inputs=[
-    #     textbox, duration, guidance_scale, seed, n_candidates, model_name], outputs=[outputs])
    btn.click(
        text2audio,
-        inputs=[textbox, guidance_scale, seed, n_candidates],
+        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
        outputs=[outputs],
    )
 
@@ -279,79 +207,56 @@ with iface:
    gr.HTML(
        """
        <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
-            <p>Follow the latest update of AudioLDM 2 on our<a href="https://github.com/haoheliu/AudioLDM2" style="text-decoration: underline;" target="_blank"> Github repo</a>
-            </p>
-            <br>
-            <p>Model by <a href="https://twitter.com/LiuHaohe" style="text-decoration: underline;" target="_blank">Haohe Liu</a></p>
-            <br>
+            <p>Follow the latest update of AudioLDM 2 on our<a href="https://audioldm.github.io/audioldm2"
+            style="text-decoration: underline;" target="_blank"> Github repo</a> </p> <br> <p>Model by <a
+            href="https://twitter.com/LiuHaohe" style="text-decoration: underline;" target="_blank">Haohe
+            Liu</a>. Code and demo by 🤗 Hugging Face.</p> <br>
        </div>
        """
    )
    gr.Examples(
        [
-            [
-                "An excited crowd cheering at a sports game.",
-                3.5,
-                1234,
-                3,
-                default_checkpoint,
-            ],
-            [
-                "A cat is meowing for attention.",
-                3.5,
-                1234,
-                3,
-                default_checkpoint,
-            ],
-            [
-                "Birds singing sweetly in a blooming garden.",
-                3.5,
-                1234,
-                3,
-                default_checkpoint,
-            ],
-            [
-                "A modern synthesizer creating futuristic soundscapes.",
-                3.5,
-                1234,
-                3,
-                default_checkpoint,
-            ],
-            [
-                "The vibrant beat of Brazilian samba drums.",
-                3.5,
-                1234,
-                3,
-                default_checkpoint,
-            ],
+            ["A hammer is hitting a wooden surface.", "Low quality.", 10, 3.5, 45, 3],
+            ["A cat is meowing for attention.", "Low quality.", 10, 3.5, 45, 3],
+            ["An excited crowd cheering at a sports game.", "Low quality.", 10, 3.5, 45, 3],
+            ["Birds singing sweetly in a blooming garden.", "Low quality.", 10, 3.5, 45, 3],
+            ["A modern synthesizer creating futuristic soundscapes.", "Low quality.", 10, 3.5, 45, 3],
+            ["The vibrant beat of Brazilian samba drums.", "Low quality.", 10, 3.5, 45, 3],
        ],
        fn=text2audio,
-        # inputs=[textbox, duration, guidance_scale, seed, n_candidates, model_name],
-        inputs=[textbox, guidance_scale, seed, n_candidates],
+        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
        outputs=[outputs],
        cache_examples=True,
    )
    gr.HTML(
        """
-        <div class="acknowledgements">
-            <p>Essential Tricks for Enhancing the Quality of Your Generated Audio</p>
-            <p>1. Try to use more adjectives to describe your sound. For example: "A man is speaking clearly and slowly in a large room" is better than "A man is speaking". This can make sure AudioLDM 2 understands what you want.</p>
-            <p>2. Try to use different random seeds, which can affect the generation quality significantly sometimes.</p>
-            <p>3. It's better to use general terms like 'man' or 'woman' instead of specific names for individuals or abstract objects that humans may not be familiar with, such as 'mummy'.</p>
+        <div class="acknowledgements"> <p>Essential Tricks for Enhancing the Quality of Your Generated
+        Audio</p>
+        <p>1. Try using more adjectives to describe your sound. For example: "A man is speaking
+        clearly and slowly in a large room" is better than "A man is speaking".</p>
+        <p>2. Try using different random seeds, which can significantly affect the quality of the generated
+        output.</p>
+        <p>3. It's better to use general terms like 'man' or 'woman' instead of specific names for individuals or
+        abstract objects that humans may not be familiar with.</p>
+        <p>4. Using a negative prompt to not guide the diffusion process can improve the
+        audio quality significantly. Try using negative prompts like 'low quality'.</p>
        </div>
        """
    )
-
    with gr.Accordion("Additional information", open=False):
        gr.HTML(
            """
            <div class="acknowledgments">
-                <p> We build the model with data from <a href="http://research.google.com/audioset/">AudioSet</a>, <a href="https://freesound.org/">Freesound</a> and <a href="https://sound-effects.bbcrewind.co.uk/">BBC Sound Effect library</a>. We share this demo based on the <a href="https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/375954/Research.pdf">UK copyright exception</a> of data for academic research. </p>
-            </div>
-            """
+                <p> We build the model with data from <a href="http://research.google.com/audioset/">AudioSet</a>,
+                <a href="https://freesound.org/">Freesound</a> and <a
+                href="https://sound-effects.bbcrewind.co.uk/">BBC Sound Effect library</a>. We share this demo
+                based on the <a
+                href="https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/375954/Research.pdf">UK
+                copyright exception</a> of data for academic research.
+                </p>
+            </div>
+            """
        )
-        # <p>This demo is strictly for research demo purpose only. For commercial use please <a href="[email protected]">contact us</a>.</p>
 
-iface.queue(max_size=20)
-iface.launch(debug=True)
-# iface.launch(debug=True, share=True)
+iface.queue(max_size=20).launch()
+
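
For reference, the new backend can be exercised outside of Gradio. The snippet below is a minimal standalone sketch of what the updated `app.py` does, using the same checkpoint, seed, and sampling parameters as the demo; the output filename and the use of `scipy.io.wavfile` for saving are illustrative choices, not part of this PR:

```python
import torch
import scipy.io.wavfile
from diffusers import AudioLDM2Pipeline

# fp16 on GPU, fp32 on CPU, mirroring the Space's device handling
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# load the AudioLDM 2 base checkpoint used by the demo
pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch_dtype).to(device)

# seed a generator for reproducible results (45 is the demo's default seed)
generator = torch.Generator(device).manual_seed(45)

# generate three candidate waveforms; the pipeline ranks them automatically,
# so audios[0] is the top-scoring candidate
audios = pipe(
    "The vibrant beat of Brazilian samba drums.",
    negative_prompt="Low quality.",
    audio_length_in_s=10.0,
    num_inference_steps=200,
    guidance_scale=3.5,
    num_waveforms_per_prompt=3,
    generator=generator,
).audios

# AudioLDM 2 generates 16 kHz audio; write the top candidate to disk
scipy.io.wavfile.write("output.wav", rate=16000, data=audios[0])
```

In the Space itself, the top-ranked waveform is instead passed to `gr.make_waveform`, which renders it into the video shown by the `gr.Video` output component.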
 
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
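
`packages.txt` lists system packages installed on the Space; ffmpeg is presumably needed here because `gr.make_waveform` relies on it to render the generated audio into the output video.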
requirements.txt CHANGED
@@ -1,9 +1,5 @@
-git+https://github.com/huggingface/diffusers.git
-transformers==4.30.2
 --extra-index-url https://download.pytorch.org/whl/cu113
-torch >= 2.0
-gradio_client==0.2.7
-huggingface_hub
-pydantic<2
-timm
-audioldm2==0.0.3
+torch>=2.0
+librosa
+transformers
+git+https://github.com/huggingface/diffusers.git