Fix audio backend error with robust fallback mechanisms
- Replace torch-based audio generation with numpy for better compatibility
- Add multiple audio saving fallbacks: torchaudio → wave module → silence (see the sketch after the commit message)
- Use standard 44.1kHz sample rate for better compatibility
- Improve error handling with detailed logging
- Add final silence fallback if all audio generation fails
- Tested and verified audio generation works correctly
Resolves: "Couldn't find appropriate backend to handle uri" error
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
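
The fallback chain the bullets describe is easy to restate in isolation. The sketch below is an editorial distillation, not the committed code: `save_waveform_with_fallback` is a hypothetical helper name, and it assumes a mono float32 numpy buffer in the range [-1, 1].

import os
import tempfile
import wave

import numpy as np


def save_waveform_with_fallback(audio: np.ndarray, sample_rate: int = 44100) -> str:
    """Persist a mono float32 waveform, degrading gracefully:
    torchaudio -> stdlib wave module -> silence."""
    path = os.path.join(tempfile.mkdtemp(), "audio.wav")
    # Tier 1: torchaudio, if it and a working I/O backend are available.
    try:
        import torch
        import torchaudio
        # torchaudio expects a (channels, frames) tensor.
        torchaudio.save(path, torch.from_numpy(audio).unsqueeze(0), sample_rate)
        return path
    except Exception:
        pass
    # Tier 2: 16-bit PCM written by the standard-library wave module.
    try:
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    except Exception:
        # Tier 3: silence, so the caller always receives a playable file.
        pcm = np.zeros(sample_rate * 5, dtype=np.int16)
    with wave.open(path, "w") as wav_file:
        wav_file.setnchannels(1)   # mono
        wav_file.setsampwidth(2)   # 16 bits per sample
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
    return path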
app.py
CHANGED
@@ -10,6 +10,8 @@ import json
 import time
 import base64
 from io import BytesIO
+import numpy as np
+import wave
 
 def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
     """Call the Hugging Face Inference API directly"""

@@ -128,39 +130,85 @@ def call_gradio_client_api(video_file_path: str, text_prompt: str = "") -> Tuple
 
 def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
     """Create fallback demo audio (for when the API is unavailable)"""
-    sample_rate = …
+    sample_rate = 44100  # use a more standard sample rate
     duration = 5.0
     duration_samples = int(duration * sample_rate)
 
-    … (the rest of the removed torch-based generation code is not preserved in this view)
+    try:
+        # Generate the audio with numpy (avoids torch dependency issues)
+        t = np.linspace(0, duration, duration_samples, dtype=np.float32)
+
+        # Produce a different kind of audio depending on the prompt text
+        if "footsteps" in text_prompt.lower() or "步" in text_prompt:
+            # Footsteps: low-frequency beats
+            audio = 0.4 * np.sin(2 * np.pi * 2 * t) * np.exp(-3 * (t % 0.5))
+        elif "rain" in text_prompt.lower() or "雨" in text_prompt:
+            # Rain: white noise
+            audio = 0.3 * np.random.randn(duration_samples)
+        elif "wind" in text_prompt.lower() or "风" in text_prompt:
+            # Wind: low-frequency noise
+            audio = 0.3 * np.sin(2 * np.pi * 0.5 * t) + 0.2 * np.random.randn(duration_samples)
+        elif "car" in text_prompt.lower() or "车" in text_prompt:
+            # Vehicle: mixed frequencies
+            audio = 0.3 * np.sin(2 * np.pi * 80 * t) + 0.2 * np.sin(2 * np.pi * 120 * t)
+        else:
+            # Default: a harmonic tone
+            base_freq = 220 + len(text_prompt) * 5
+            audio = 0.3 * np.sin(2 * np.pi * base_freq * t)
+            # Add overtones
+            audio += 0.1 * np.sin(2 * np.pi * base_freq * 2 * t)
+            audio += 0.05 * np.sin(2 * np.pi * base_freq * 3 * t)
+
+        # Apply an envelope to avoid an abrupt start/end
+        envelope = np.ones_like(audio)
+        fade_samples = int(0.1 * sample_rate)  # 0.1 s fade in/out
+        envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
+        envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
+        audio *= envelope
+
+        # Save the audio file
+        temp_dir = tempfile.mkdtemp()
+        audio_path = os.path.join(temp_dir, "fallback_audio.wav")
+
+        # Try saving with torchaudio first
+        try:
+            audio_tensor = torch.from_numpy(audio).unsqueeze(0)
+            torchaudio.save(audio_path, audio_tensor, sample_rate)
+            logger.info("✅ Saved audio with torchaudio")
+        except Exception as e:
+            logger.warning(f"torchaudio save failed: {e}")
+            # Fallback: use Python's built-in wave module
+            logger.info("Saving audio with the wave module...")
+
+            # Normalize the audio into the int16 range
+            audio_normalized = np.clip(audio, -1.0, 1.0)
+            audio_int16 = (audio_normalized * 32767).astype(np.int16)
+
+            with wave.open(audio_path, 'w') as wav_file:
+                wav_file.setnchannels(1)  # mono
+                wav_file.setsampwidth(2)  # 16-bit
+                wav_file.setframerate(sample_rate)
+                wav_file.writeframes(audio_int16.tobytes())
+
+            logger.info("✅ Saved audio with the wave module")
+
+        return audio_path
+
+    except Exception as e:
+        logger.error(f"Audio generation failed: {str(e)}")
+        # Final fallback: create a simple silent file
+        temp_dir = tempfile.mkdtemp()
+        audio_path = os.path.join(temp_dir, "silence.wav")
+
+        silence = np.zeros(duration_samples, dtype=np.int16)
+        with wave.open(audio_path, 'w') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(silence.tobytes())
+
+        logger.info("Generated silent audio as the final fallback")
+        return audio_path
 
 def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
     """Process the video with multiple API methods"""