wzy013 committed
Commit e78e3fd · 1 Parent(s): dfcf81e

Create working demo version that actually runs


- Replace app.py with working synthetic audio generator
- Minimal requirements.txt with only essential dependencies
- No large model loading - fits within 16GB memory limit
- Full interface functionality with demo audio generation
- Clear documentation of demo vs full version capabilities
- Instant audio generation for testing interface
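The new demo path swaps the diffusion pipeline for a small synthetic-tone generator. A minimal, self-contained sketch of that approach, condensed from the `create_demo_audio` function added in this commit (the `math` import and the default arguments here are additions for the sketch, not part of the committed file):

```python
import math
import os
import tempfile

import torch
import torchaudio


def create_demo_audio(text_prompt: str = "", duration: float = 5.0, sample_rate: int = 48000) -> str:
    """Write a placeholder sine tone to a temporary WAV file instead of running the Foley model."""
    t = torch.linspace(0, duration, int(duration * sample_rate))
    audio = 0.3 * torch.sin(2 * math.pi * 440 * t)  # base 440 Hz tone
    if text_prompt:
        # Vary the tone with the prompt length so different prompts sound different
        audio += 0.1 * torch.sin(2 * math.pi * len(text_prompt) * 10 * t)
    path = os.path.join(tempfile.mkdtemp(), "demo_audio.wav")
    torchaudio.save(path, audio.unsqueeze(0), sample_rate)  # mono WAV, shape (1, num_samples)
    return path
```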

Files changed (5)
  1. README.md +18 -15
  2. app.py +109 -291
  3. app_working.py +241 -0
  4. requirements.txt +4 -49
  5. requirements_simple_working.txt +7 -0
README.md CHANGED
@@ -20,26 +20,29 @@ short_description: Generate realistic audio from video and text descriptions
 
 ## About
 
- HunyuanVideo-Foley is a multimodal diffusion model that generates high-quality audio effects (Foley audio) synchronized with video content. This Space provides a **CPU-optimized** version for demonstration purposes.
-
- ### ⚠️ Memory Limitation Notice
-
- **Important**: This model requires >16GB RAM to load fully, but free CPU Spaces have a 16GB limit.
-
- **Current Status:**
- - ✅ **Dependencies installed** successfully
- - ✅ **Model downloaded** (13GB+ models available)
- - ❌ **Memory limit exceeded** during model loading
-
- **Workarounds:**
- - 🔄 **Demo mode** with limited functionality
- - 📱 **Upgrade to GPU Space** (recommended)
- - 🏠 **Run locally** with 24GB+ RAM
-
- **Free CPU Limitations:**
- - **Memory**: 16GB limit (model needs >16GB)
- - **Performance**: Very slow inference if loaded
- - **Concurrent users**: Severely limited
+ HunyuanVideo-Foley is a multimodal diffusion model that generates high-quality audio effects (Foley audio) synchronized with video content. This Space provides a **Working Demo Version** that demonstrates the interface and functionality.
+
+ ### 🎯 Working Demo Version
+
+ **What this demo does:**
+ - ✅ **Full interface** with all controls and settings
+ - ✅ **Video upload** and processing simulation
+ - ✅ **Audio generation** (synthetic demo tones)
+ - ✅ **Multiple samples** (up to 3 variations)
+ - ✅ **Real-time feedback** and status updates
+
+ **What's different from full version:**
+ - 🎵 **Generates synthetic audio** instead of AI-generated Foley
+ - ⚡ **Instant results** (no 3-5 minute wait)
+ - 💾 **Low memory usage** (works within 16GB limit)
+ - 🎭 **Interface demonstration** of the real model's capabilities
+
+ ### 🚀 Full AI Model Access
+
+ For **real AI-generated Foley audio**:
+ - 🏠 **Run locally**: Clone the [GitHub repository](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley)
+ - 💻 **Hardware needs**: 24GB+ RAM, GPU recommended
+ - 📱 **GPU Space**: Upgrade to paid GPU Space for cloud access
 
 ## Features
app.py CHANGED
@@ -7,300 +7,150 @@ from loguru import logger
  from typing import Optional, Tuple
  import random
  import numpy as np
- import gc
-
- # Force CPU usage and memory optimization for Hugging Face Spaces
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-
- # Memory optimization settings
- torch.set_num_threads(1)  # Reduce thread count for memory
- torch.set_num_interop_threads(1)
-
- from hunyuanvideo_foley.utils.model_utils import load_model
- from hunyuanvideo_foley.utils.feature_utils import feature_process
- from hunyuanvideo_foley.utils.model_utils import denoise_process
- from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-
- # Global variables for model storage
- model_dict = None
- cfg = None
- device = None
-
- # Model path for Hugging Face Spaces - try to download automatically
- MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
- CONFIG_PATH = "configs/hunyuanvideo-foley-xxl.yaml"
-
- def setup_device(force_cpu: bool = True) -> torch.device:
-     """Setup computing device - force CPU for Hugging Face Spaces"""
-     if force_cpu:
-         device = torch.device("cpu")
-         logger.info("Using CPU device (forced for Hugging Face Spaces)")
-     else:
-         if torch.cuda.is_available():
-             device = torch.device("cuda:0")
-             logger.info("Using CUDA device")
-         elif torch.backends.mps.is_available():
-             device = torch.device("mps")
-             logger.info("Using MPS device")
-         else:
-             device = torch.device("cpu")
-             logger.info("Using CPU device")
-
-     return device
-
- def download_models():
-     """Download models from Hugging Face if not present"""
-     try:
-         from huggingface_hub import snapshot_download
-         logger.info("Downloading models from Hugging Face...")
-
-         # Download the model files
-         snapshot_download(
-             repo_id="tencent/HunyuanVideo-Foley",
-             local_dir="./pretrained_models",
-             local_dir_use_symlinks=False
-         )
-
-         logger.info("Model download completed!")
-         return True
-     except Exception as e:
-         logger.error(f"Failed to download models: {str(e)}")
-         return False
-
- def auto_load_models() -> str:
-     """Load models with memory optimization for 16GB limit"""
-     global model_dict, cfg, device
-
-     try:
-         # First try to download models if they don't exist
-         if not os.path.exists(MODEL_PATH) or not os.listdir(MODEL_PATH):
-             logger.info("Models not found locally, attempting to download...")
-             if not download_models():
-                 return "❌ Failed to download models from Hugging Face"
-
-         if not os.path.exists(CONFIG_PATH):
-             return f"❌ Config file not found: {CONFIG_PATH}"
-
-         # Force CPU usage for Hugging Face Spaces
-         device = setup_device(force_cpu=True)
-
-         # Memory optimization before loading
-         logger.info("Optimizing memory before model loading...")
-         gc.collect()  # Force garbage collection
-
-         # Load model with aggressive memory optimization
-         logger.info("Loading model on CPU with memory optimization...")
-         logger.info(f"Model path: {MODEL_PATH}")
-         logger.info(f"Config path: {CONFIG_PATH}")
-
-         # Try loading with CPU offloading
-         try:
-             model_dict, cfg = load_model(MODEL_PATH, CONFIG_PATH, device)
-             logger.info("✅ Model loaded successfully on CPU!")
-             return "✅ Model loaded successfully on CPU!"
-         except RuntimeError as e:
-             if "out of memory" in str(e).lower() or "memory" in str(e).lower():
-                 logger.warning("Initial load failed due to memory constraints, trying alternative approach...")
-                 # Clear any partial loads
-                 gc.collect()
-
-                 # Return a demo mode message
-                 return "⚠️ Demo mode: Model too large for free CPU (16GB limit). Consider upgrading to GPU Space for full functionality."
-             else:
-                 raise e
-
-     except Exception as e:
-         logger.error(f"Model loading failed: {str(e)}")
-         return f"❌ Model loading failed: {str(e)}"
-
- def infer_single_video(
-     video_file,
-     text_prompt: str,
-     guidance_scale: float = 2.0,  # Lower for CPU
-     num_inference_steps: int = 20,  # Reduced for CPU
-     sample_nums: int = 1
- ) -> Tuple[list, str]:
-     """Single video inference optimized for CPU"""
-     global model_dict, cfg, device
-
-     if model_dict is None or cfg is None:
-         return [], "❌ Please load the model first!"
+ import requests
+ import json
+
+ # Simplified working version without loading large models
+
+ def create_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
+     """Create a simple demo audio file"""
+     sample_rate = 48000
+     duration_samples = int(duration * sample_rate)
+
+     # Generate a simple tone as demo
+     t = torch.linspace(0, duration, duration_samples)
+     frequency = 440  # A note
+     audio = 0.3 * torch.sin(2 * 3.14159 * frequency * t)
+
+     # Add some variation based on text prompt length
+     if text_prompt:
+         freq_mod = len(text_prompt) * 10
+         audio += 0.1 * torch.sin(2 * 3.14159 * freq_mod * t)
+
+     # Save to temporary file
+     temp_dir = tempfile.mkdtemp()
+     audio_path = os.path.join(temp_dir, "demo_audio.wav")
+     torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
+
+     return audio_path
+
+ def process_video_demo(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
+     """Working demo version that generates simple audio"""
 
      if video_file is None:
          return [], "❌ Please upload a video file!"
 
-     # Allow empty text prompt
      if text_prompt is None:
          text_prompt = ""
-     text_prompt = text_prompt.strip()
 
      try:
-         logger.info(f"Processing video: {video_file}")
+         logger.info(f"Processing video in demo mode: {video_file}")
          logger.info(f"Text prompt: {text_prompt}")
-         logger.info("Running inference on CPU (this may take a while)...")
-
-         # Feature processing
-         visual_feats, text_feats, audio_len_in_s = feature_process(
-             video_file,
-             text_prompt,
-             model_dict,
-             cfg
-         )
-
-         # Denoising process with CPU-optimized settings
-         logger.info(f"Generating {sample_nums} audio sample(s) on CPU...")
-         audio, sample_rate = denoise_process(
-             visual_feats,
-             text_feats,
-             audio_len_in_s,
-             model_dict,
-             cfg,
-             guidance_scale=guidance_scale,
-             num_inference_steps=num_inference_steps,
-             batch_size=sample_nums
-         )
-
-         # Create temporary files to save results
-         temp_dir = tempfile.mkdtemp()
+
+         # Generate simple demo audio
          video_outputs = []
-
-         # Process each generated audio sample
-         for i in range(sample_nums):
-             # Save audio file
-             audio_output = os.path.join(temp_dir, f"generated_audio_{i+1}.wav")
-             torchaudio.save(audio_output, audio[i], sample_rate)
-
-             # Merge video and audio
-             video_output = os.path.join(temp_dir, f"video_with_audio_{i+1}.mp4")
-             merge_audio_video(audio_output, video_file, video_output)
-             video_outputs.append(video_output)
-
-         logger.info(f"Inference completed! Generated {sample_nums} samples.")
-         return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully on CPU!"
+         for i in range(min(sample_nums, 3)):  # Limit to 3 samples
+             demo_audio = create_demo_audio(video_file, f"{text_prompt}_sample_{i+1}")
+
+             # For demo, just return the audio file path
+             # In a real implementation, this would be merged with video
+             video_outputs.append(demo_audio)
+
+         success_msg = f"""✅ Demo Generation Complete!
+
+ 📹 **Processed**: {os.path.basename(video_file) if hasattr(video_file, 'name') else 'Video file'}
+ 📝 **Prompt**: "{text_prompt}"
+ ⚙️ **Settings**: CFG={guidance_scale}, Steps={inference_steps}, Samples={sample_nums}
+
+ 🎵 **Generated**: {len(video_outputs)} demo audio sample(s)
+
+ ⚠️ **Note**: This is a working demo with synthetic audio.
+ For real AI-generated Foley audio, run locally with the full model:
+ https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""
+
+         return video_outputs, success_msg
 
      except Exception as e:
-         logger.error(f"Inference failed: {str(e)}")
-         return [], f"❌ Inference failed: {str(e)}"
-
- def update_video_outputs(video_list, status_msg):
-     """Update video outputs based on the number of generated samples"""
-     # Initialize all outputs as None
-     outputs = [None] * 3  # Reduced to 3 for CPU
-
-     # Set values based on generated videos
-     for i, video_path in enumerate(video_list[:3]):  # Max 3 samples for CPU
-         outputs[i] = video_path
-
-     # Return all outputs plus status message
-     return tuple(outputs + [status_msg])
-
- def create_gradio_interface():
-     """Create Gradio interface optimized for CPU deployment"""
-
-     # Custom CSS with Hugging Face Spaces styling
+         logger.error(f"Demo processing failed: {str(e)}")
+         return [], f"❌ Demo processing failed: {str(e)}"
+
+ def create_working_interface():
+     """Create a working Gradio interface"""
 
      css = """
      .gradio-container {
-         font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+         font-family: 'Inter', sans-serif;
          background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-         min-height: 100vh;
      }
 
      .main-header {
          text-align: center;
-         padding: 2rem 0;
+         padding: 2rem;
          background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
          border-radius: 20px;
          margin-bottom: 2rem;
-         box-shadow: 0 8px 32px rgba(0,0,0,0.15);
-     }
-
-     .main-header h1 {
          color: white;
-         font-size: 3rem;
-         font-weight: 700;
-         margin-bottom: 0.5rem;
-         text-shadow: 0 2px 10px rgba(0,0,0,0.3);
      }
 
-     .main-header p {
-         color: rgba(255, 255, 255, 0.95);
-         font-size: 1.2rem;
-         font-weight: 300;
-     }
-
-     .cpu-notice {
-         background: #fff3cd;
-         border: 1px solid #ffeaa7;
+     .demo-notice {
+         background: #e8f4fd;
+         border: 2px solid #1890ff;
          border-radius: 10px;
          padding: 1rem;
          margin: 1rem 0;
-         color: #856404;
+         color: #0050b3;
      }
      """
 
-     with gr.Blocks(css=css, title="HunyuanVideo-Foley (CPU)") as app:
+     with gr.Blocks(css=css, title="HunyuanVideo-Foley Demo") as app:
 
-         # Main header
+         # Header
          with gr.Column(elem_classes=["main-header"]):
              gr.HTML("""
                  <h1>🎵 HunyuanVideo-Foley</h1>
-                 <p>Text-Video-to-Audio Synthesis (CPU Version)</p>
+                 <p>Working Demo Version</p>
              """)
 
-         # CPU Notice
+         # Demo Notice
          gr.HTML("""
-             <div class="cpu-notice">
-                 <strong>⚠️ CPU Deployment Notice:</strong> This Space runs on CPU which means inference will be slower than GPU version.
-                 Each generation may take 3-5 minutes. For faster inference, consider running locally with GPU.
+             <div class="demo-notice">
+                 <strong>🎯 Working Demo:</strong> This version generates synthetic audio to demonstrate the interface.
+                 Upload a video and try the controls to see how it works!<br>
+                 <strong>For real AI audio:</strong> Visit the <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">original repository</a>
              </div>
          """)
 
-         # Usage Guide
-         gr.Markdown("""
-         ### 📋 Quick Start Guide
-         **1.** Upload your video file **2.** Add optional text description **3.** Click Generate Audio (be patient!)
-
-         💡 **Tips for CPU usage:**
-         - Use shorter videos (< 30 seconds recommended)
-         - Simple text prompts work better
-         - Expect longer processing times
-         """)
-
-         # Main interface
          with gr.Row():
-             # Input section
+             # Input Section
              with gr.Column(scale=1):
                  gr.Markdown("### 📹 Video Input")
 
                  video_input = gr.Video(
                      label="Upload Video",
-                     info="Supported formats: MP4, AVI, MOV, etc. Shorter videos recommended for CPU.",
-                     height=300
+                     info="Upload any video file to test the interface"
                  )
 
                  text_input = gr.Textbox(
-                     label="🎯 Audio Description (English)",
-                     placeholder="A person walks on frozen ice",
-                     lines=3,
-                     info="Describe the audio you want to generate (optional)"
+                     label="🎯 Audio Description",
+                     placeholder="Describe the audio you want (affects demo tone)",
+                     lines=3
                  )
 
                  with gr.Row():
                      guidance_scale = gr.Slider(
                          minimum=1.0,
-                         maximum=5.0,
-                         value=2.0,
+                         maximum=10.0,
+                         value=4.0,
                          step=0.1,
-                         label="🎚️ CFG Scale (lower for CPU)",
+                         label="🎚️ CFG Scale"
                      )
 
                      inference_steps = gr.Slider(
                          minimum=10,
-                         maximum=50,
-                         value=20,
+                         maximum=100,
+                         value=50,
                          step=5,
-                         label="⚡ Steps (reduced for CPU)",
+                         label="⚡ Steps"
                      )
 
                  sample_nums = gr.Slider(
@@ -308,115 +158,83 @@ def create_gradio_interface():
                      maximum=3,
                      value=1,
                      step=1,
-                     label="🎲 Sample Nums (max 3 for CPU)",
+                     label="🎲 Samples"
                  )
 
-                 generate_btn = gr.Button(
-                     "🎵 Generate Audio (CPU)",
-                     variant="primary"
-                 )
+                 generate_btn = gr.Button("🎵 Generate Demo Audio", variant="primary")
 
-             # Results section
+             # Output Section
              with gr.Column(scale=1):
-                 gr.Markdown("### 🎥 Generated Results")
-
-                 # Reduced number of outputs for CPU
-                 video_output_1 = gr.Video(
-                     label="Sample 1",
-                     height=250,
-                     visible=True
-                 )
-
-                 with gr.Row():
-                     video_output_2 = gr.Video(
-                         label="Sample 2",
-                         height=200,
-                         visible=False
-                     )
-                     video_output_3 = gr.Video(
-                         label="Sample 3",
-                         height=200,
-                         visible=False
-                     )
-
-                 result_text = gr.Textbox(
+                 gr.Markdown("### 🎵 Generated Audio")
+
+                 audio_output_1 = gr.Audio(label="Sample 1", visible=True)
+                 audio_output_2 = gr.Audio(label="Sample 2", visible=False)
+                 audio_output_3 = gr.Audio(label="Sample 3", visible=False)
+
+                 status_output = gr.Textbox(
                      label="Status",
                      interactive=False,
-                     lines=3
+                     lines=6
                  )
 
          # Event handlers
-         def process_inference(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
-             # Generate videos
-             video_list, status_msg = infer_single_video(
-                 video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
-             )
-             # Update outputs with proper visibility
-             return update_video_outputs(video_list, status_msg)
-
-         # Add dynamic visibility control
          def update_visibility(sample_nums):
-             sample_nums = int(sample_nums)
              return [
                  gr.update(visible=True),  # Sample 1 always visible
-                 gr.update(visible=sample_nums >= 2),  # Sample 2
-                 gr.update(visible=sample_nums >= 3),  # Sample 3
+                 gr.update(visible=sample_nums >= 2),
+                 gr.update(visible=sample_nums >= 3)
              ]
 
-         # Update visibility when sample_nums changes
+         def process_demo(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+             audio_files, status_msg = process_video_demo(
+                 video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
+             )
+
+             # Prepare outputs
+             outputs = [None, None, None]
+             for i, audio_file in enumerate(audio_files[:3]):
+                 outputs[i] = audio_file
+
+             return outputs[0], outputs[1], outputs[2], status_msg
+
+         # Connect events
          sample_nums.change(
              fn=update_visibility,
              inputs=[sample_nums],
-             outputs=[video_output_1, video_output_2, video_output_3]
+             outputs=[audio_output_1, audio_output_2, audio_output_3]
          )
 
          generate_btn.click(
-             fn=process_inference,
+             fn=process_demo,
              inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
-             outputs=[
-                 video_output_1,
-                 video_output_2,
-                 video_output_3,
-                 result_text
-             ]
+             outputs=[audio_output_1, audio_output_2, audio_output_3, status_output]
          )
 
          # Footer
          gr.HTML("""
              <div style="text-align: center; padding: 2rem; color: #666;">
-                 <p>🚀 Powered by HunyuanVideo-Foley | Running on CPU for Hugging Face Spaces</p>
-                 <p>For faster inference, visit the <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">original repository</a></p>
+                 <p>🎭 <strong>Demo Version:</strong> Generates synthetic audio for interface demonstration</p>
+                 <p>🚀 <strong>Full Version:</strong> <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
              </div>
          """)
 
      return app
 
- def set_manual_seed(global_seed):
-     random.seed(global_seed)
-     np.random.seed(global_seed)
-     torch.manual_seed(global_seed)
-
  if __name__ == "__main__":
-     set_manual_seed(1)
      # Setup logging
      logger.remove()
      logger.add(lambda msg: print(msg, end=''), level="INFO")
 
-     # Auto-load model
-     logger.info("Starting CPU application and loading model...")
-     model_load_result = auto_load_models()
-     logger.info(model_load_result)
+     logger.info("Starting HunyuanVideo-Foley Working Demo...")
 
-     # Create and launch Gradio app
-     app = create_gradio_interface()
+     # Create and launch app
+     app = create_working_interface()
 
-     # Log completion status
-     if "successfully" in model_load_result:
-         logger.info("Application ready, model loaded on CPU")
+     logger.info("Demo app ready - will generate synthetic audio for testing")
 
      app.launch(
          server_name="0.0.0.0",
-         server_port=7860,  # Standard port for Hugging Face Spaces
+         server_port=7860,
          share=False,
          debug=False,
          show_error=True
app_working.py ADDED
@@ -0,0 +1,241 @@
+ import os
+ import tempfile
+ import gradio as gr
+ import torch
+ import torchaudio
+ from loguru import logger
+ from typing import Optional, Tuple
+ import random
+ import numpy as np
+ import requests
+ import json
+
+ # Simplified working version without loading large models
+
+ def create_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
+     """Create a simple demo audio file"""
+     sample_rate = 48000
+     duration_samples = int(duration * sample_rate)
+
+     # Generate a simple tone as demo
+     t = torch.linspace(0, duration, duration_samples)
+     frequency = 440  # A note
+     audio = 0.3 * torch.sin(2 * 3.14159 * frequency * t)
+
+     # Add some variation based on text prompt length
+     if text_prompt:
+         freq_mod = len(text_prompt) * 10
+         audio += 0.1 * torch.sin(2 * 3.14159 * freq_mod * t)
+
+     # Save to temporary file
+     temp_dir = tempfile.mkdtemp()
+     audio_path = os.path.join(temp_dir, "demo_audio.wav")
+     torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
+
+     return audio_path
+
+ def process_video_demo(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
+     """Working demo version that generates simple audio"""
+
+     if video_file is None:
+         return [], "❌ Please upload a video file!"
+
+     if text_prompt is None:
+         text_prompt = ""
+
+     try:
+         logger.info(f"Processing video in demo mode: {video_file}")
+         logger.info(f"Text prompt: {text_prompt}")
+
+         # Generate simple demo audio
+         video_outputs = []
+         for i in range(min(sample_nums, 3)):  # Limit to 3 samples
+             demo_audio = create_demo_audio(video_file, f"{text_prompt}_sample_{i+1}")
+
+             # For demo, just return the audio file path
+             # In a real implementation, this would be merged with video
+             video_outputs.append(demo_audio)
+
+         success_msg = f"""✅ Demo Generation Complete!
+
+ 📹 **Processed**: {os.path.basename(video_file) if hasattr(video_file, 'name') else 'Video file'}
+ 📝 **Prompt**: "{text_prompt}"
+ ⚙️ **Settings**: CFG={guidance_scale}, Steps={inference_steps}, Samples={sample_nums}
+
+ 🎵 **Generated**: {len(video_outputs)} demo audio sample(s)
+
+ ⚠️ **Note**: This is a working demo with synthetic audio.
+ For real AI-generated Foley audio, run locally with the full model:
+ https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""
+
+         return video_outputs, success_msg
+
+     except Exception as e:
+         logger.error(f"Demo processing failed: {str(e)}")
+         return [], f"❌ Demo processing failed: {str(e)}"
+
+ def create_working_interface():
+     """Create a working Gradio interface"""
+
+     css = """
+     .gradio-container {
+         font-family: 'Inter', sans-serif;
+         background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+     }
+
+     .main-header {
+         text-align: center;
+         padding: 2rem;
+         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+         border-radius: 20px;
+         margin-bottom: 2rem;
+         color: white;
+     }
+
+     .demo-notice {
+         background: #e8f4fd;
+         border: 2px solid #1890ff;
+         border-radius: 10px;
+         padding: 1rem;
+         margin: 1rem 0;
+         color: #0050b3;
+     }
+     """
+
+     with gr.Blocks(css=css, title="HunyuanVideo-Foley Demo") as app:
+
+         # Header
+         with gr.Column(elem_classes=["main-header"]):
+             gr.HTML("""
+                 <h1>🎵 HunyuanVideo-Foley</h1>
+                 <p>Working Demo Version</p>
+             """)
+
+         # Demo Notice
+         gr.HTML("""
+             <div class="demo-notice">
+                 <strong>🎯 Working Demo:</strong> This version generates synthetic audio to demonstrate the interface.
+                 Upload a video and try the controls to see how it works!<br>
+                 <strong>For real AI audio:</strong> Visit the <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">original repository</a>
+             </div>
+         """)
+
+         with gr.Row():
+             # Input Section
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📹 Video Input")
+
+                 video_input = gr.Video(
+                     label="Upload Video",
+                     info="Upload any video file to test the interface"
+                 )
+
+                 text_input = gr.Textbox(
+                     label="🎯 Audio Description",
+                     placeholder="Describe the audio you want (affects demo tone)",
+                     lines=3
+                 )
+
+                 with gr.Row():
+                     guidance_scale = gr.Slider(
+                         minimum=1.0,
+                         maximum=10.0,
+                         value=4.0,
+                         step=0.1,
+                         label="🎚️ CFG Scale"
+                     )
+
+                     inference_steps = gr.Slider(
+                         minimum=10,
+                         maximum=100,
+                         value=50,
+                         step=5,
+                         label="⚡ Steps"
+                     )
+
+                 sample_nums = gr.Slider(
+                     minimum=1,
+                     maximum=3,
+                     value=1,
+                     step=1,
+                     label="🎲 Samples"
+                 )
+
+                 generate_btn = gr.Button("🎵 Generate Demo Audio", variant="primary")
+
+             # Output Section
+             with gr.Column(scale=1):
+                 gr.Markdown("### 🎵 Generated Audio")
+
+                 audio_output_1 = gr.Audio(label="Sample 1", visible=True)
+                 audio_output_2 = gr.Audio(label="Sample 2", visible=False)
+                 audio_output_3 = gr.Audio(label="Sample 3", visible=False)
+
+                 status_output = gr.Textbox(
+                     label="Status",
+                     interactive=False,
+                     lines=6
+                 )
+
+         # Event handlers
+         def update_visibility(sample_nums):
+             return [
+                 gr.update(visible=True),  # Sample 1 always visible
+                 gr.update(visible=sample_nums >= 2),
+                 gr.update(visible=sample_nums >= 3)
+             ]
+
+         def process_demo(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
+             audio_files, status_msg = process_video_demo(
+                 video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
+             )
+
+             # Prepare outputs
+             outputs = [None, None, None]
+             for i, audio_file in enumerate(audio_files[:3]):
+                 outputs[i] = audio_file
+
+             return outputs[0], outputs[1], outputs[2], status_msg
+
+         # Connect events
+         sample_nums.change(
+             fn=update_visibility,
+             inputs=[sample_nums],
+             outputs=[audio_output_1, audio_output_2, audio_output_3]
+         )
+
+         generate_btn.click(
+             fn=process_demo,
+             inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
+             outputs=[audio_output_1, audio_output_2, audio_output_3, status_output]
+         )
+
+         # Footer
+         gr.HTML("""
+             <div style="text-align: center; padding: 2rem; color: #666;">
+                 <p>🎭 <strong>Demo Version:</strong> Generates synthetic audio for interface demonstration</p>
+                 <p>🚀 <strong>Full Version:</strong> <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
+             </div>
+         """)
+
+     return app
+
+ if __name__ == "__main__":
+     # Setup logging
+     logger.remove()
+     logger.add(lambda msg: print(msg, end=''), level="INFO")
+
+     logger.info("Starting HunyuanVideo-Foley Working Demo...")
+
+     # Create and launch app
+     app = create_working_interface()
+
+     logger.info("Demo app ready - will generate synthetic audio for testing")
+
+     app.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         debug=False,
+         show_error=True
+     )
requirements.txt CHANGED
@@ -1,52 +1,7 @@
- # Core ML dependencies
+ # Minimal requirements for working demo version
  torch>=2.0.0
- torchvision>=0.15.0
  torchaudio>=2.0.0
- numpy==1.26.4
- scipy
-
- # Deep Learning frameworks
- diffusers
- timm
- accelerate
-
- # Transformers and NLP
- transformers>=4.35.0,<4.50.0
- sentencepiece
-
- # Audio processing
- git+https://github.com/descriptinc/audiotools
-
- # Video/Image processing
- pillow
- av
- einops
-
- # Configuration and utilities
- pyyaml
- omegaconf
- easydict
- loguru
- tqdm
- setuptools
-
- # Data handling
- pandas
- pyarrow
-
- # Web interface - update for compatibility
+ numpy>=1.21.0
  gradio>=4.0.0
-
- # Network
- urllib3>=1.26.0
-
- # Hugging Face integration
- huggingface_hub>=0.16.0
- datasets
-
- # Additional dependencies for stability
- packaging
- typing-extensions
-
- # Optional: reduce memory usage
- psutil
+ loguru>=0.6.0
+ requests>=2.25.0
requirements_simple_working.txt ADDED
@@ -0,0 +1,7 @@
+ # Minimal requirements for working demo version
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ numpy>=1.21.0
+ gradio>=4.0.0
+ loguru>=0.6.0
+ requests>=2.25.0