File size: 4,546 Bytes
372395e
caed802
b19e4a9
f694503
04d2706
 
caed802
372395e
caed802
372395e
41b5a1b
cd970d9
 
9983da0
 
 
cd970d9
be460ce
eb1af87
b6e8417
cb934a1
6cc068b
7396afd
cb934a1
 
36b4db6
41b5a1b
b5357a4
be460ce
6cc068b
 
372395e
79b4496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c215d
79b4496
 
 
 
 
 
 
 
 
 
 
e7c2915
 
 
 
 
 
 
 
 
 
 
 
12bd467
 
 
 
 
 
e7c2915
 
 
79b4496
 
372395e
 
 
79b4496
372395e
 
7acb3e3
be460ce
 
 
 
 
 
8e6038a
f694503
372395e
570b690
79b4496
53f5458
 
 
 
 
79b4496
7acb3e3
9983da0
be460ce
bc6b39c
372395e
a63d987
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
import os
import time
from moviepy.editor import *
from share_btn import community_icon_html, loading_icon_html, share_js

token = os.environ.get('HF_TOKEN')
caption = gr.Blocks.load(name="spaces/SRDdev/Image-Caption")
audio_gen = gr.Blocks.load(name="spaces/fffiloni/audioldm-text-to-audio-generation-clone", api_key=token)

ph_message="If you're not happy with sound result, you can manually describe the scene depicted in your image :)"

def clean():
    manual_cap.update(value="",placeholder=ph_message)
    caption_output.update(value=None)
    sound_output.update(value=None)

def infer(image_input, manual_caption, duration_in, seed):
    print(duration_in)
    if manual_caption == "":
        cap = caption(image_input, fn_index=0)
        print("gpt2 caption: '" + cap + "' β€’ ")
        ph_update = "gpt2 caption: '" + cap + "' β€’ "
    else:
        cap = manual_caption
        print("manual captiony: " + cap)
        ph_update=""
    
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, fn_index=0)
    
    return cap, sound[1], gr.Textbox.update(placeholder=f"{ph_update}{ph_message}"), gr.Group.update(visible=True)

title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Image to Sound Effect
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Convert an image to a corresponding sound effect generated through GPT2 Image Captioning & AudioLDM
        </p>
    </div>
"""

article = """
    
    <div class="footer">
        <p>
         
        Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates πŸ€—
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                 </a>
            </svg>

            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue" src="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue.png" height="20"/>
                 </a>
            </svg>
            
        </div>
    </div>
"""

with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.HTML(title)
    
        input_img = gr.Image(type="filepath", elem_id="input-img")
        
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Image description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        sound_output = gr.Audio(label="Result", elem_id="sound-output")
        
        generate = gr.Button("Generate SFX from Image")

        with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
            community_icon = gr.HTML(community_icon_html)
            loading_icon = gr.HTML(loading_icon_html)
            share_button = gr.Button("Share to community", elem_id="share-btn")

        gr.HTML(article)

    input_img.change(clean)
    generate.click(infer, inputs=[input_img, manual_cap, duration_in, seed_in], outputs=[caption_output, sound_output, manual_cap, share_group], api_name="i2fx")
    share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)