wzy013 committed
Commit 21d1989 · 1 Parent(s): e78e3fd

Deploy real API calling version


- Implements Gradio Client API to call official HunyuanVideo-Foley Space
- Falls back to Hugging Face Inference API as secondary option
- Smart API inference with multiple fallback strategies (see the sketch after this list)
- No local model loading - solves 16GB memory limit issue
- Real AI audio generation through remote API calls
- Comprehensive error handling and user feedback
- Minimal dependencies focused on API calling
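For reference, a minimal sketch of the fallback flow this commit wires up is shown below. It assumes gradio_client is installed and that the official Space still exposes the /infer_single_video endpoint used in app.py; the generate_foley wrapper name is illustrative, and the Hugging Face Inference API fallback is only stubbed.

    from gradio_client import Client

    def generate_foley(video_path: str, prompt: str):
        """Illustrative wrapper; Space id and endpoint name are taken from this commit's app.py."""
        try:
            # Primary option: call the official Space through gradio_client
            client = Client("tencent/HunyuanVideo-Foley")
            return client.predict(
                video_path, prompt,   # video + text prompt
                4.5, 50, 1,           # CFG scale, inference steps, sample count (defaults in this commit)
                api_name="/infer_single_video",
            )
        except Exception as exc:
            # Secondary option (stubbed): POST the video to the HF Inference API
            # with a Bearer HF_TOKEN header, as app.py's fallback does.
            print(f"Space call failed, would fall back to the Inference API: {exc}")
            return None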

Files changed (4)
  1. app.py +221 -136
  2. app_real_api.py +326 -0
  3. requirements.txt +8 -5
  4. requirements_api.txt +10 -0
app.py CHANGED
@@ -1,146 +1,211 @@
  import os
  import tempfile
  import gradio as gr
- import torch
- import torchaudio
- from loguru import logger
- from typing import Optional, Tuple
- import random
- import numpy as np
  import requests
  import json

- # Simplified working version without loading large models
-
- def create_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
-     """Create a simple demo audio file"""
-     sample_rate = 48000
-     duration_samples = int(duration * sample_rate)
-
-     # Generate a simple tone as demo
-     t = torch.linspace(0, duration, duration_samples)
-     frequency = 440 # A note
-     audio = 0.3 * torch.sin(2 * 3.14159 * frequency * t)
-
-     # Add some variation based on text prompt length
-     if text_prompt:
-         freq_mod = len(text_prompt) * 10
-         audio += 0.1 * torch.sin(2 * 3.14159 * freq_mod * t)
-
-     # Save to temporary file
-     temp_dir = tempfile.mkdtemp()
-     audio_path = os.path.join(temp_dir, "demo_audio.wav")
-     torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
-
-     return audio_path
-
- def process_video_demo(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
-     """Working demo version that generates simple audio"""
-
-     if video_file is None:
-         return [], "❌ Please upload a video file!"
-
-     if text_prompt is None:
-         text_prompt = ""
-
      try:
-         logger.info(f"Processing video in demo mode: {video_file}")
-         logger.info(f"Text prompt: {text_prompt}")

-         # Generate simple demo audio
-         video_outputs = []
-         for i in range(min(sample_nums, 3)): # Limit to 3 samples
-             demo_audio = create_demo_audio(video_file, f"{text_prompt}_sample_{i+1}")
-
-             # For demo, just return the audio file path
-             # In a real implementation, this would be merged with video
-             video_outputs.append(demo_audio)

-         success_msg = f"""✅ Demo Generation Complete!
-
- 📹 **Processed**: {os.path.basename(video_file) if hasattr(video_file, 'name') else 'Video file'}
- 📝 **Prompt**: "{text_prompt}"
- ⚙️ **Settings**: CFG={guidance_scale}, Steps={inference_steps}, Samples={sample_nums}

- 🎵 **Generated**: {len(video_outputs)} demo audio sample(s)

- ⚠️ **Note**: This is a working demo with synthetic audio.
- For real AI-generated Foley audio, run locally with the full model:
- https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""

-         return video_outputs, success_msg

      except Exception as e:
-         logger.error(f"Demo processing failed: {str(e)}")
-         return [], f"❌ Demo processing failed: {str(e)}"

- def create_working_interface():
-     """Create a working Gradio interface"""

-     css = """
-     .gradio-container {
-         font-family: 'Inter', sans-serif;
-         background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-     }

-     .main-header {
-         text-align: center;
-         padding: 2rem;
-         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-         border-radius: 20px;
-         margin-bottom: 2rem;
-         color: white;
-     }

-     .demo-notice {
-         background: #e8f4fd;
-         border: 2px solid #1890ff;
          border-radius: 10px;
          padding: 1rem;
          margin: 1rem 0;
-         color: #0050b3;
      }
      """

-     with gr.Blocks(css=css, title="HunyuanVideo-Foley Demo") as app:

          # Header
-         with gr.Column(elem_classes=["main-header"]):
-             gr.HTML("""
                  <h1>🎵 HunyuanVideo-Foley</h1>
-                 <p>Working Demo Version</p>
-             """)

-         # Demo Notice
          gr.HTML("""
-             <div class="demo-notice">
-                 <strong>🎯 Working Demo:</strong> This version generates synthetic audio to demonstrate the interface.
-                 Upload a video and try the controls to see how it works!<br>
-                 <strong>For real AI audio:</strong> Visit the <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">original repository</a>
              </div>
          """)

          with gr.Row():
-             # Input Section
              with gr.Column(scale=1):
-                 gr.Markdown("### 📹 Video Input")

                  video_input = gr.Video(
-                     label="Upload Video",
-                     info="Upload any video file to test the interface"
                  )

                  text_input = gr.Textbox(
-                     label="🎯 Audio Description",
-                     placeholder="Describe the audio you want (affects demo tone)",
-                     lines=3
                  )

                  with gr.Row():
                      guidance_scale = gr.Slider(
                          minimum=1.0,
                          maximum=10.0,
-                         value=4.0,
                          step=0.1,
                          label="🎚️ CFG Scale"
                      )
@@ -150,87 +215,107 @@ def create_working_interface():
                          maximum=100,
                          value=50,
                          step=5,
-                         label="⚡ Steps"
                      )

                      sample_nums = gr.Slider(
                          minimum=1,
-                         maximum=3,
                          value=1,
                          step=1,
-                         label="🎲 Samples"
                      )

-                 generate_btn = gr.Button("🎵 Generate Demo Audio", variant="primary")

-             # Output Section
              with gr.Column(scale=1):
-                 gr.Markdown("### 🎵 Generated Audio")

-                 audio_output_1 = gr.Audio(label="Sample 1", visible=True)
-                 audio_output_2 = gr.Audio(label="Sample 2", visible=False)
-                 audio_output_3 = gr.Audio(label="Sample 3", visible=False)

                  status_output = gr.Textbox(
-                     label="Status",
                      interactive=False,
-                     lines=6
                  )

-         # Event handlers
-         def update_visibility(sample_nums):
-             return [
-                 gr.update(visible=True), # Sample 1 always visible
-                 gr.update(visible=sample_nums >= 2),
-                 gr.update(visible=sample_nums >= 3)
-             ]
-
-         def process_demo(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
-             audio_files, status_msg = process_video_demo(
                  video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
              )

-             # Prepare outputs
-             outputs = [None, None, None]
-             for i, audio_file in enumerate(audio_files[:3]):
-                 outputs[i] = audio_file

-             return outputs[0], outputs[1], outputs[2], status_msg

-         # Connect events
          sample_nums.change(
              fn=update_visibility,
              inputs=[sample_nums],
-             outputs=[audio_output_1, audio_output_2, audio_output_3]
          )

          generate_btn.click(
-             fn=process_demo,
              inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
-             outputs=[audio_output_1, audio_output_2, audio_output_3, status_output]
          )

          # Footer
          gr.HTML("""
-             <div style="text-align: center; padding: 2rem; color: #666;">
-                 <p>🎭 <strong>Demo Version:</strong> Generates synthetic audio for interface demonstration</p>
-                 <p>🚀 <strong>Full Version:</strong> <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
              </div>
          """)

      return app

  if __name__ == "__main__":
-     # Setup logging
      logger.remove()
      logger.add(lambda msg: print(msg, end=''), level="INFO")

-     logger.info("Starting HunyuanVideo-Foley Working Demo...")

-     # Create and launch app
-     app = create_working_interface()

-     logger.info("Demo app ready - will generate synthetic audio for testing")

      app.launch(
          server_name="0.0.0.0",
 
  import os
  import tempfile
  import gradio as gr
  import requests
  import json
+ from loguru import logger
+ from typing import Optional, Tuple
+ import base64
+ import time

+ def call_gradio_client_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+     """Call the official Hugging Face Space API"""
      try:
+         from gradio_client import Client
+
+         logger.info("Connecting to the official HunyuanVideo-Foley Space...")

+         # Connect to the official Space
+         client = Client("tencent/HunyuanVideo-Foley")
+
+         logger.info("Sending inference request...")
+
+         # Call the inference function
+         result = client.predict(
+             video_file,  # video file
+             text_prompt,  # text prompt
+             guidance_scale,  # CFG scale
+             inference_steps,  # number of inference steps
+             sample_nums,  # number of samples
+             api_name="/infer_single_video"  # API endpoint name
+         )
+
+         return result, "✅ Audio generated successfully via the official API!"
+
+     except Exception as e:
+         error_msg = str(e)
+         logger.error(f"Gradio Client API call failed: {error_msg}")
+
+         if "not found" in error_msg.lower():
+             return None, "❌ API endpoint not found on the official Space; the interface may have changed"
+         elif "connection" in error_msg.lower():
+             return None, "❌ Could not connect to the official Space; please check your network"
+         elif "queue" in error_msg.lower():
+             return None, "⏳ The official Space is busy; please try again later"
+         else:
+             return None, f"❌ API call error: {error_msg}"

+ def call_huggingface_inference_api(video_file, text_prompt):
+     """Call the Hugging Face Inference API"""
+     try:
+         logger.info("Trying the Hugging Face Inference API...")
+
+         API_URL = "https://api-inference.huggingface.co/models/tencent/HunyuanVideo-Foley"
+
+         # Read the video file
+         with open(video_file, "rb") as f:
+             video_data = f.read()
+
+         # Prepare the request data
+         headers = {
+             "Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}",
+         }
+
+         # Send the request
+         response = requests.post(
+             API_URL,
+             headers=headers,
+             json={"inputs": {"video": base64.b64encode(video_data).decode(), "text": text_prompt}},
+             timeout=300
+         )
+
+         if response.status_code == 200:
+             # Save the result
+             temp_dir = tempfile.mkdtemp()
+             audio_path = os.path.join(temp_dir, "generated_audio.wav")
+             with open(audio_path, 'wb') as f:
+                 f.write(response.content)
+             return [audio_path], "✅ Generated successfully via the Hugging Face API!"
+         else:
+             logger.error(f"HF API error: {response.status_code}")
+             return None, f"❌ Hugging Face API returned an error: {response.status_code}"
+
+     except Exception as e:
+         logger.error(f"HF API call failed: {str(e)}")
+         return None, f"❌ Hugging Face API call failed: {str(e)}"

+ def try_alternative_apis(video_file, text_prompt):
+     """Try other possible API services"""
+
+     # 1. Try public demo endpoints
+     try:
+         logger.info("Trying demo endpoints...")
+
+         # Other public API services could be tried here,
+         # e.g. Replicate, RunPod, etc.

+         return None, "❌ No alternative API service is currently available"

      except Exception as e:
+         return None, f"❌ Alternative API call failed: {str(e)}"

+ def smart_api_inference(video_file, text_prompt, guidance_scale=4.5, inference_steps=50, sample_nums=1):
+     """Smart API inference - try multiple API calling strategies"""

+     if video_file is None:
+         return [], "❌ Please upload a video file!"

+     if not text_prompt:
+         text_prompt = "audio for this video"
+
+     logger.info(f"Starting API inference: {video_file}")
+     logger.info(f"Text prompt: {text_prompt}")
+
+     status_updates = []
+
+     # Method 1: try the Gradio Client (most likely to succeed)
+     status_updates.append("🔄 Trying to connect to the official Space API...")
+     try:
+         result, status = call_gradio_client_api(
+             video_file, text_prompt, guidance_scale, inference_steps, sample_nums
+         )
+         if result:
+             return result, "\n".join(status_updates + [status])
+         status_updates.append(status)
+     except ImportError:
+         status_updates.append("⚠️ gradio_client is not installed; skipping the official API call")
+
+     # Method 2: try the Hugging Face Inference API
+     status_updates.append("🔄 Trying the Hugging Face Inference API...")
+     result, status = call_huggingface_inference_api(video_file, text_prompt)
+     if result:
+         return result, "\n".join(status_updates + [status])
+     status_updates.append(status)
+
+     # Method 3: try other APIs
+     status_updates.append("🔄 Trying alternative API services...")
+     result, status = try_alternative_apis(video_file, text_prompt)
+     status_updates.append(status)
+
+     # All methods failed
+     final_message = "\n".join(status_updates + [
+         "",
+         "💡 **Suggested fixes:**",
+         "• Install gradio_client: pip install gradio_client",
+         "• Set the HF_TOKEN environment variable",
+         "• Wait for the official Space's load to drop",
+         "• Run the full model locally (requires 24GB+ RAM)",
+         "",
+         "🔗 **Official Space**: https://huggingface.co/spaces/tencent/HunyuanVideo-Foley"
+     ])

+     return [], final_message
+
+ def create_real_api_interface():
+     """Create the real API-calling interface"""
+
+     css = """
+     .api-status {
+         background: #f0f8ff;
+         border: 2px solid #4169e1;
          border-radius: 10px;
          padding: 1rem;
          margin: 1rem 0;
+         color: #191970;
      }
      """

+     with gr.Blocks(css=css, title="HunyuanVideo-Foley API Client") as app:

          # Header
+         gr.HTML("""
+             <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
                  <h1>🎵 HunyuanVideo-Foley</h1>
+                 <p>API client - calls the real model for inference</p>
+             </div>
+         """)

+         # API Status Notice
          gr.HTML("""
+             <div class="api-status">
+                 <strong>🌐 Real API mode:</strong> This version calls the real HunyuanVideo-Foley model over an API to run inference.
+                 <br><strong>Pros:</strong> real AI audio generation without large local memory
+                 <br><strong>Cons:</strong> depends on external service availability and may involve waiting in a queue
              </div>
          """)

          with gr.Row():
+             # Input section
              with gr.Column(scale=1):
+                 gr.Markdown("### 📹 Video Input")

                  video_input = gr.Video(
+                     label="Upload Video",
+                     info="Supports MP4, AVI, MOV and other formats"
                  )

                  text_input = gr.Textbox(
+                     label="🎯 Audio Description",
+                     placeholder="Describe the audio you want, e.g. footsteps, rain, passing vehicles",
+                     lines=3,
+                     value="audio sound effects for this video"
                  )

                  with gr.Row():
                      guidance_scale = gr.Slider(
                          minimum=1.0,
                          maximum=10.0,
+                         value=4.5,
                          step=0.1,
                          label="🎚️ CFG Scale"
                      )
@@ -150,87 +215,107 @@ def create_working_interface():
                          maximum=100,
                          value=50,
                          step=5,
+                         label="⚡ Inference Steps"
                      )

                      sample_nums = gr.Slider(
                          minimum=1,
+                         maximum=6,
                          value=1,
                          step=1,
+                         label="🎲 Sample Count"
                      )

+                 generate_btn = gr.Button(
+                     "🎵 Generate Audio via API",
+                     variant="primary",
+                     size="lg"
+                 )

+             # Output section
              with gr.Column(scale=1):
+                 gr.Markdown("### 🎵 Generated Results")

+                 audio_outputs = []
+                 for i in range(6):
+                     audio_output = gr.Audio(
+                         label=f"Sample {i+1}",
+                         visible=(i == 0)  # only show the first one
+                     )
+                     audio_outputs.append(audio_output)

                  status_output = gr.Textbox(
+                     label="API Status",
                      interactive=False,
+                     lines=10,
+                     placeholder="Waiting for an API call..."
                  )

+         # Event handling
+         def process_with_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+             # Run API inference
+             results, status_msg = smart_api_inference(
                  video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
              )

+             # Prepare the outputs
+             outputs = [None] * 6
+             visibilities = [False] * 6

+             if results and isinstance(results, list):
+                 for i, result in enumerate(results[:6]):
+                     outputs[i] = result
+                     visibilities[i] = True
+
+             return outputs + visibilities + [status_msg]

+         # Dynamically show the selected number of samples
+         def update_visibility(sample_nums):
+             sample_nums = int(sample_nums)
+             return [gr.update(visible=(i < sample_nums)) for i in range(6)]
+
+         # Wire up events
          sample_nums.change(
              fn=update_visibility,
              inputs=[sample_nums],
+             outputs=audio_outputs
          )

          generate_btn.click(
+             fn=process_with_api,
              inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
+             outputs=audio_outputs + [gr.update(visible=(i < 6)) for i in range(6)] + [status_output]
          )

          # Footer
          gr.HTML("""
+             <div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
+                 <p><strong>📡 API-calling version</strong> - calls the real model over the network for inference</p>
+                 <p>🔗 Official Space: <a href="https://huggingface.co/spaces/tencent/HunyuanVideo-Foley" target="_blank">tencent/HunyuanVideo-Foley</a></p>
+                 <p>⚠️ Requires: <code>pip install gradio_client</code></p>
              </div>
          """)

      return app

  if __name__ == "__main__":
+     # Set up logging
      logger.remove()
      logger.add(lambda msg: print(msg, end=''), level="INFO")

+     logger.info("Starting the HunyuanVideo-Foley API client...")
+
+     # Check dependencies
+     try:
+         import gradio_client
+         logger.info("✅ gradio_client is installed")
+     except ImportError:
+         logger.warning("⚠️ gradio_client is not installed; API calling may be limited")

+     # Create and launch the app
+     app = create_real_api_interface()

+     logger.info("API client ready; prepared to call the real model...")

      app.launch(
          server_name="0.0.0.0",
app_real_api.py ADDED
@@ -0,0 +1,326 @@
+ import os
+ import tempfile
+ import gradio as gr
+ import requests
+ import json
+ from loguru import logger
+ from typing import Optional, Tuple
+ import base64
+ import time
+
+ def call_gradio_client_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+     """Call the official Hugging Face Space API"""
+     try:
+         from gradio_client import Client
+
+         logger.info("Connecting to the official HunyuanVideo-Foley Space...")
+
+         # Connect to the official Space
+         client = Client("tencent/HunyuanVideo-Foley")
+
+         logger.info("Sending inference request...")
+
+         # Call the inference function
+         result = client.predict(
+             video_file,  # video file
+             text_prompt,  # text prompt
+             guidance_scale,  # CFG scale
+             inference_steps,  # number of inference steps
+             sample_nums,  # number of samples
+             api_name="/infer_single_video"  # API endpoint name
+         )
+
+         return result, "✅ Audio generated successfully via the official API!"
+
+     except Exception as e:
+         error_msg = str(e)
+         logger.error(f"Gradio Client API call failed: {error_msg}")
+
+         if "not found" in error_msg.lower():
+             return None, "❌ API endpoint not found on the official Space; the interface may have changed"
+         elif "connection" in error_msg.lower():
+             return None, "❌ Could not connect to the official Space; please check your network"
+         elif "queue" in error_msg.lower():
+             return None, "⏳ The official Space is busy; please try again later"
+         else:
+             return None, f"❌ API call error: {error_msg}"
+
+ def call_huggingface_inference_api(video_file, text_prompt):
+     """Call the Hugging Face Inference API"""
+     try:
+         logger.info("Trying the Hugging Face Inference API...")
+
+         API_URL = "https://api-inference.huggingface.co/models/tencent/HunyuanVideo-Foley"
+
+         # Read the video file
+         with open(video_file, "rb") as f:
+             video_data = f.read()
+
+         # Prepare the request data
+         headers = {
+             "Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}",
+         }
+
+         # Send the request
+         response = requests.post(
+             API_URL,
+             headers=headers,
+             json={"inputs": {"video": base64.b64encode(video_data).decode(), "text": text_prompt}},
+             timeout=300
+         )
+
+         if response.status_code == 200:
+             # Save the result
+             temp_dir = tempfile.mkdtemp()
+             audio_path = os.path.join(temp_dir, "generated_audio.wav")
+             with open(audio_path, 'wb') as f:
+                 f.write(response.content)
+             return [audio_path], "✅ Generated successfully via the Hugging Face API!"
+         else:
+             logger.error(f"HF API error: {response.status_code}")
+             return None, f"❌ Hugging Face API returned an error: {response.status_code}"
+
+     except Exception as e:
+         logger.error(f"HF API call failed: {str(e)}")
+         return None, f"❌ Hugging Face API call failed: {str(e)}"
+
+ def try_alternative_apis(video_file, text_prompt):
+     """Try other possible API services"""
+
+     # 1. Try public demo endpoints
+     try:
+         logger.info("Trying demo endpoints...")
+
+         # Other public API services could be tried here,
+         # e.g. Replicate, RunPod, etc.
+
+         return None, "❌ No alternative API service is currently available"
+
+     except Exception as e:
+         return None, f"❌ Alternative API call failed: {str(e)}"
+
+ def smart_api_inference(video_file, text_prompt, guidance_scale=4.5, inference_steps=50, sample_nums=1):
+     """Smart API inference - try multiple API calling strategies"""
+
+     if video_file is None:
+         return [], "❌ Please upload a video file!"
+
+     if not text_prompt:
+         text_prompt = "audio for this video"
+
+     logger.info(f"Starting API inference: {video_file}")
+     logger.info(f"Text prompt: {text_prompt}")
+
+     status_updates = []
+
+     # Method 1: try the Gradio Client (most likely to succeed)
+     status_updates.append("🔄 Trying to connect to the official Space API...")
+     try:
+         result, status = call_gradio_client_api(
+             video_file, text_prompt, guidance_scale, inference_steps, sample_nums
+         )
+         if result:
+             return result, "\n".join(status_updates + [status])
+         status_updates.append(status)
+     except ImportError:
+         status_updates.append("⚠️ gradio_client is not installed; skipping the official API call")
+
+     # Method 2: try the Hugging Face Inference API
+     status_updates.append("🔄 Trying the Hugging Face Inference API...")
+     result, status = call_huggingface_inference_api(video_file, text_prompt)
+     if result:
+         return result, "\n".join(status_updates + [status])
+     status_updates.append(status)
+
+     # Method 3: try other APIs
+     status_updates.append("🔄 Trying alternative API services...")
+     result, status = try_alternative_apis(video_file, text_prompt)
+     status_updates.append(status)
+
+     # All methods failed
+     final_message = "\n".join(status_updates + [
+         "",
+         "💡 **Suggested fixes:**",
+         "• Install gradio_client: pip install gradio_client",
+         "• Set the HF_TOKEN environment variable",
+         "• Wait for the official Space's load to drop",
+         "• Run the full model locally (requires 24GB+ RAM)",
+         "",
+         "🔗 **Official Space**: https://huggingface.co/spaces/tencent/HunyuanVideo-Foley"
+     ])
+
+     return [], final_message
+
+ def create_real_api_interface():
+     """Create the real API-calling interface"""
+
+     css = """
+     .api-status {
+         background: #f0f8ff;
+         border: 2px solid #4169e1;
+         border-radius: 10px;
+         padding: 1rem;
+         margin: 1rem 0;
+         color: #191970;
+     }
+     """
+
+     with gr.Blocks(css=css, title="HunyuanVideo-Foley API Client") as app:
+
+         # Header
+         gr.HTML("""
+             <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
+                 <h1>🎵 HunyuanVideo-Foley</h1>
+                 <p>API client - calls the real model for inference</p>
+             </div>
+         """)
+
+         # API Status Notice
+         gr.HTML("""
+             <div class="api-status">
+                 <strong>🌐 Real API mode:</strong> This version calls the real HunyuanVideo-Foley model over an API to run inference.
+                 <br><strong>Pros:</strong> real AI audio generation without large local memory
+                 <br><strong>Cons:</strong> depends on external service availability and may involve waiting in a queue
+             </div>
+         """)
+
+         with gr.Row():
+             # Input section
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📹 Video Input")
+
+                 video_input = gr.Video(
+                     label="Upload Video",
+                     info="Supports MP4, AVI, MOV and other formats"
+                 )
+
+                 text_input = gr.Textbox(
+                     label="🎯 Audio Description",
+                     placeholder="Describe the audio you want, e.g. footsteps, rain, passing vehicles",
+                     lines=3,
+                     value="audio sound effects for this video"
+                 )
+
+                 with gr.Row():
+                     guidance_scale = gr.Slider(
+                         minimum=1.0,
+                         maximum=10.0,
+                         value=4.5,
+                         step=0.1,
+                         label="🎚️ CFG Scale"
+                     )
+
+                     inference_steps = gr.Slider(
+                         minimum=10,
+                         maximum=100,
+                         value=50,
+                         step=5,
+                         label="⚡ Inference Steps"
+                     )
+
+                     sample_nums = gr.Slider(
+                         minimum=1,
+                         maximum=6,
+                         value=1,
+                         step=1,
+                         label="🎲 Sample Count"
+                     )
+
+                 generate_btn = gr.Button(
+                     "🎵 Generate Audio via API",
+                     variant="primary",
+                     size="lg"
+                 )
+
+             # Output section
+             with gr.Column(scale=1):
+                 gr.Markdown("### 🎵 Generated Results")
+
+                 audio_outputs = []
+                 for i in range(6):
+                     audio_output = gr.Audio(
+                         label=f"Sample {i+1}",
+                         visible=(i == 0)  # only show the first one
+                     )
+                     audio_outputs.append(audio_output)
+
+                 status_output = gr.Textbox(
+                     label="API Status",
+                     interactive=False,
+                     lines=10,
+                     placeholder="Waiting for an API call..."
+                 )
+
+         # Event handling
+         def process_with_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+             # Run API inference
+             results, status_msg = smart_api_inference(
+                 video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
+             )
+
+             # Prepare the outputs
+             outputs = [None] * 6
+             visibilities = [False] * 6
+
+             if results and isinstance(results, list):
+                 for i, result in enumerate(results[:6]):
+                     outputs[i] = result
+                     visibilities[i] = True
+
+             return outputs + visibilities + [status_msg]
+
+         # Dynamically show the selected number of samples
+         def update_visibility(sample_nums):
+             sample_nums = int(sample_nums)
+             return [gr.update(visible=(i < sample_nums)) for i in range(6)]
+
+         # Wire up events
+         sample_nums.change(
+             fn=update_visibility,
+             inputs=[sample_nums],
+             outputs=audio_outputs
+         )
+
+         generate_btn.click(
+             fn=process_with_api,
+             inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
+             outputs=audio_outputs + [gr.update(visible=(i < 6)) for i in range(6)] + [status_output]
+         )
+
+         # Footer
+         gr.HTML("""
+             <div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
+                 <p><strong>📡 API-calling version</strong> - calls the real model over the network for inference</p>
+                 <p>🔗 Official Space: <a href="https://huggingface.co/spaces/tencent/HunyuanVideo-Foley" target="_blank">tencent/HunyuanVideo-Foley</a></p>
+                 <p>⚠️ Requires: <code>pip install gradio_client</code></p>
+             </div>
+         """)
+
+     return app
+
+ if __name__ == "__main__":
+     # Set up logging
+     logger.remove()
+     logger.add(lambda msg: print(msg, end=''), level="INFO")
+
+     logger.info("Starting the HunyuanVideo-Foley API client...")
+
+     # Check dependencies
+     try:
+         import gradio_client
+         logger.info("✅ gradio_client is installed")
+     except ImportError:
+         logger.warning("⚠️ gradio_client is not installed; API calling may be limited")
+
+     # Create and launch the app
+     app = create_real_api_interface()
+
+     logger.info("API client ready; prepared to call the real model...")
+
+     app.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         debug=False,
+         show_error=True
+     )
requirements.txt CHANGED
@@ -1,7 +1,10 @@
- # Minimal requirements for working demo version
- torch>=2.0.0
- torchaudio>=2.0.0
- numpy>=1.21.0
  gradio>=4.0.0
  loguru>=0.6.0
- requests>=2.25.0

+ # Dependencies for the API-calling version
  gradio>=4.0.0
+ gradio_client>=0.8.0
+ requests>=2.25.0
  loguru>=0.6.0
+ numpy>=1.21.0
+
+ # Optional dependencies (for fallback features)
+ torch>=2.0.0
+ torchaudio>=2.0.0
requirements_api.txt ADDED
@@ -0,0 +1,10 @@
+ # Dependencies for the API-calling version
+ gradio>=4.0.0
+ gradio_client>=0.8.0
+ requests>=2.25.0
+ loguru>=0.6.0
+ numpy>=1.21.0
+
+ # Optional dependencies (for fallback features)
+ torch>=2.0.0
+ torchaudio>=2.0.0