wzy013 Claude committed on
Commit
d72626f
·
1 Parent(s): d353b6f

Fix audio backend error with robust fallback mechanisms

Browse files

- Replace torch-based audio generation with numpy for better compatibility
- Add multiple audio saving fallbacks: torchaudio → wave module → silence
- Use standard 44.1kHz sample rate for better compatibility
- Improve error handling with detailed logging
- Add final silence fallback if all audio generation fails
- Tested and verified audio generation works correctly

Resolves: "Couldn't find appropriate backend to handle uri" error

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show
  1. app.py +78 -30
app.py CHANGED
@@ -10,6 +10,8 @@ import json
10
  import time
11
  import base64
12
  from io import BytesIO
 
 
13
 
14
  def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
15
  """直接调用 Hugging Face 推理 API"""
@@ -128,39 +130,85 @@ def call_gradio_client_api(video_file_path: str, text_prompt: str = "") -> Tuple
128
 
129
  def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
130
  """创建备用演示音频(当 API 不可用时)"""
131
- sample_rate = 48000
132
  duration = 5.0
133
  duration_samples = int(duration * sample_rate)
134
 
135
- t = torch.linspace(0, duration, duration_samples)
136
-
137
- # 根据文本内容生成不同类型的音频
138
- if "footsteps" in text_prompt.lower() or "步" in text_prompt:
139
- audio = 0.4 * torch.sin(2 * 3.14159 * 2 * t) * torch.exp(-3 * (t % 0.5))
140
- elif "rain" in text_prompt.lower() or "雨" in text_prompt:
141
- audio = 0.3 * torch.randn(duration_samples)
142
- elif "wind" in text_prompt.lower() or "风" in text_prompt:
143
- audio = 0.3 * torch.sin(2 * 3.14159 * 0.5 * t) + 0.2 * torch.randn(duration_samples)
144
- elif "car" in text_prompt.lower() or "车" in text_prompt:
145
- audio = 0.3 * torch.sin(2 * 3.14159 * 80 * t) + 0.2 * torch.sin(2 * 3.14159 * 120 * t)
146
- else:
147
- base_freq = 220 + len(text_prompt) * 5
148
- audio = 0.3 * torch.sin(2 * 3.14159 * base_freq * t)
149
- audio += 0.1 * torch.sin(2 * 3.14159 * base_freq * 2 * t)
150
-
151
- # 应用包络
152
- envelope = torch.ones_like(audio)
153
- fade_samples = int(0.1 * sample_rate)
154
- envelope[:fade_samples] = torch.linspace(0, 1, fade_samples)
155
- envelope[-fade_samples:] = torch.linspace(1, 0, fade_samples)
156
- audio *= envelope
157
-
158
- # 保存音频
159
- temp_dir = tempfile.mkdtemp()
160
- audio_path = os.path.join(temp_dir, "fallback_audio.wav")
161
- torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
162
-
163
- return audio_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
166
  """使用多种 API 方法处理视频"""
 
10
  import time
11
  import base64
12
  from io import BytesIO
13
+ import numpy as np
14
+ import wave
15
 
16
  def call_huggingface_inference_api(video_file_path: str, text_prompt: str = "") -> Tuple[Optional[str], str]:
17
  """直接调用 Hugging Face 推理 API"""
 
130
 
131
  def create_fallback_audio(video_file_path: str, text_prompt: str) -> str:
132
  """创建备用演示音频(当 API 不可用时)"""
133
+ sample_rate = 44100 # 使用更标准的采样率
134
  duration = 5.0
135
  duration_samples = int(duration * sample_rate)
136
 
137
+ try:
138
+ # 使用 numpy 生成音频(避免 torch 依赖问题)
139
+ t = np.linspace(0, duration, duration_samples, dtype=np.float32)
140
+
141
+ # 根据文本内容生成不同类型的音频
142
+ if "footsteps" in text_prompt.lower() or "步" in text_prompt:
143
+ # 脚步声:低频节拍
144
+ audio = 0.4 * np.sin(2 * np.pi * 2 * t) * np.exp(-3 * (t % 0.5))
145
+ elif "rain" in text_prompt.lower() or "雨" in text_prompt:
146
+ # 雨声:白噪声
147
+ audio = 0.3 * np.random.randn(duration_samples)
148
+ elif "wind" in text_prompt.lower() or "风" in text_prompt:
149
+ # 风声:低频噪声
150
+ audio = 0.3 * np.sin(2 * np.pi * 0.5 * t) + 0.2 * np.random.randn(duration_samples)
151
+ elif "car" in text_prompt.lower() or "车" in text_prompt:
152
+ # 车辆声:混合频率
153
+ audio = 0.3 * np.sin(2 * np.pi * 80 * t) + 0.2 * np.sin(2 * np.pi * 120 * t)
154
+ else:
155
+ # 默认:和谐音调
156
+ base_freq = 220 + len(text_prompt) * 5
157
+ audio = 0.3 * np.sin(2 * np.pi * base_freq * t)
158
+ # 添加泛音
159
+ audio += 0.1 * np.sin(2 * np.pi * base_freq * 2 * t)
160
+ audio += 0.05 * np.sin(2 * np.pi * base_freq * 3 * t)
161
+
162
+ # 应用包络以避免突然开始/结束
163
+ envelope = np.ones_like(audio)
164
+ fade_samples = int(0.1 * sample_rate) # 0.1秒淡入淡出
165
+ envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
166
+ envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
167
+ audio *= envelope
168
+
169
+ # 保存音频文件
170
+ temp_dir = tempfile.mkdtemp()
171
+ audio_path = os.path.join(temp_dir, "fallback_audio.wav")
172
+
173
+ # 尝试 torchaudio 保存
174
+ try:
175
+ audio_tensor = torch.from_numpy(audio).unsqueeze(0)
176
+ torchaudio.save(audio_path, audio_tensor, sample_rate)
177
+ logger.info("✅ 使用 torchaudio 保存音频成功")
178
+ except Exception as e:
179
+ logger.warning(f"torchaudio 保存失败: {e}")
180
+ # 备用方法:使用 Python 内置的 wave 模块
181
+ logger.info("使用 wave 模块保存音频...")
182
+
183
+ # 规范化音频到 int16 范围
184
+ audio_normalized = np.clip(audio, -1.0, 1.0)
185
+ audio_int16 = (audio_normalized * 32767).astype(np.int16)
186
+
187
+ with wave.open(audio_path, 'w') as wav_file:
188
+ wav_file.setnchannels(1) # 单声道
189
+ wav_file.setsampwidth(2) # 16-bit
190
+ wav_file.setframerate(sample_rate)
191
+ wav_file.writeframes(audio_int16.tobytes())
192
+
193
+ logger.info("✅ 使用 wave 模块保存音频成功")
194
+
195
+ return audio_path
196
+
197
+ except Exception as e:
198
+ logger.error(f"音频生成失败: {str(e)}")
199
+ # 最终备用方案:创建一个简单的静音文件
200
+ temp_dir = tempfile.mkdtemp()
201
+ audio_path = os.path.join(temp_dir, "silence.wav")
202
+
203
+ silence = np.zeros(duration_samples, dtype=np.int16)
204
+ with wave.open(audio_path, 'w') as wav_file:
205
+ wav_file.setnchannels(1)
206
+ wav_file.setsampwidth(2)
207
+ wav_file.setframerate(sample_rate)
208
+ wav_file.writeframes(silence.tobytes())
209
+
210
+ logger.info("生成静音音频作为最终备用方案")
211
+ return audio_path
212
 
213
  def process_video_with_apis(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[List[str], str]:
214
  """使用多种 API 方法处理视频"""