Spaces:
Runtime error
Runtime error
leeway.zlw
committed on
Commit
·
69c71b8
1
Parent(s):
ca33a23
update
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] els
|
|
| 12 |
if(not is_shared_ui):
|
| 13 |
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
|
| 14 |
|
| 15 |
-
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
|
| 16 |
if is_shared_ui:
|
| 17 |
raise gr.Error("This Space only works in duplicated instances")
|
| 18 |
|
|
@@ -23,10 +23,10 @@ def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=T
|
|
| 23 |
source_image=source_image,
|
| 24 |
driving_audio=driving_audio,
|
| 25 |
output=f'output-{unique_id}.mp4',
|
| 26 |
-
pose_weight=
|
| 27 |
-
face_weight=
|
| 28 |
-
lip_weight=
|
| 29 |
-
face_expand_ratio=
|
| 30 |
checkpoint=None
|
| 31 |
)
|
| 32 |
|
|
@@ -91,17 +91,38 @@ with gr.Blocks(css=css) as demo:
|
|
| 91 |
''', elem_id="warning-duplicate")
|
| 92 |
gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
|
| 93 |
gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
with gr.Row():
|
| 95 |
with gr.Column():
|
| 96 |
avatar_face = gr.Image(type="filepath", label="Face")
|
| 97 |
driving_audio = gr.Audio(type="filepath", label="Driving audio")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
generate = gr.Button("Generate")
|
| 99 |
with gr.Column():
|
| 100 |
output_video = gr.Video(label="Your talking head")
|
| 101 |
|
| 102 |
generate.click(
|
| 103 |
fn=run_inference,
|
| 104 |
-
inputs=[avatar_face, driving_audio],
|
| 105 |
outputs=output_video
|
| 106 |
)
|
| 107 |
|
|
|
|
| 12 |
if(not is_shared_ui):
|
| 13 |
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
|
| 14 |
|
| 15 |
+
def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
|
| 16 |
if is_shared_ui:
|
| 17 |
raise gr.Error("This Space only works in duplicated instances")
|
| 18 |
|
|
|
|
| 23 |
source_image=source_image,
|
| 24 |
driving_audio=driving_audio,
|
| 25 |
output=f'output-{unique_id}.mp4',
|
| 26 |
+
pose_weight=pose_weight,
|
| 27 |
+
face_weight=face_weight,
|
| 28 |
+
lip_weight=lip_weight,
|
| 29 |
+
face_expand_ratio=face_expand_ratio,
|
| 30 |
checkpoint=None
|
| 31 |
)
|
| 32 |
|
|
|
|
| 91 |
''', elem_id="warning-duplicate")
|
| 92 |
gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
|
| 93 |
gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
|
| 94 |
+
gr.Markdown("""
|
| 95 |
+
Hallo has a few simple requirements for input data:
|
| 96 |
+
|
| 97 |
+
For the source image:
|
| 98 |
+
|
| 99 |
+
1. It should be cropped into squares.
|
| 100 |
+
2. The face should be the main focus, making up 50%-70% of the image.
|
| 101 |
+
3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
|
| 102 |
+
|
| 103 |
+
For the driving audio:
|
| 104 |
+
|
| 105 |
+
1. It must be in WAV format.
|
| 106 |
+
2. It must be in English since our training datasets are only in this language.
|
| 107 |
+
3. Ensure the vocals are clear; background music is acceptable.
|
| 108 |
+
|
| 109 |
+
We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
|
| 110 |
+
""")
|
| 111 |
with gr.Row():
|
| 112 |
with gr.Column():
|
| 113 |
avatar_face = gr.Image(type="filepath", label="Face")
|
| 114 |
driving_audio = gr.Audio(type="filepath", label="Driving audio")
|
| 115 |
+
pose_weight = gr.Number(label="pose weight", value=1.0),
|
| 116 |
+
face_weight = gr.Number(label="face weight", value=1.0),
|
| 117 |
+
lip_weight = gr.Number(label="lip weight", value=1.0),
|
| 118 |
+
face_expand_ratio = gr.Number(label="face expand ratio", value=1.2),
|
| 119 |
generate = gr.Button("Generate")
|
| 120 |
with gr.Column():
|
| 121 |
output_video = gr.Video(label="Your talking head")
|
| 122 |
|
| 123 |
generate.click(
|
| 124 |
fn=run_inference,
|
| 125 |
+
inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
|
| 126 |
outputs=output_video
|
| 127 |
)
|
| 128 |
|