Spaces:
Running
on
Zero
Running
on
Zero
config: configs/inference.yaml | |
input_file: examples/infer_samples.txt | |
debug: null | |
infer: false | |
hparams: '' | |
dtype: bf16 | |
exp_path: pretrained_models/OmniAvatar-14B | |
text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth | |
image_encoder_path: None | |
dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors | |
vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth | |
# exp_path: pretrained_models/OmniAvatar-1.3B | |
# text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth | |
# image_encoder_path: None | |
# dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors | |
# vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth | |
wav2vec_path: pretrained_models/wav2vec2-base-960h | |
num_persistent_param_in_dit: | |
reload_cfg: true | |
sp_size: 1 | |
seed: 42 | |
image_sizes_720: | |
# - - 400 | |
# - 720 | |
# - - 720 commented out due to the duration needed on HF | |
# - 720 | |
- - 720 | |
- 400 | |
image_sizes_1280: | |
- - 720 | |
- 720 | |
- - 528 | |
- 960 | |
- - 960 | |
- 528 | |
- - 720 | |
- 1280 | |
- - 1280 | |
- 720 | |
max_hw: 720 | |
max_tokens: 40000 | |
seq_len: 200 | |
overlap_frame: 13 | |
guidance_scale: 4.5 | |
audio_scale: null | |
num_steps: 8 | |
fps: 24 | |
sample_rate: 16000 | |
negative_prompt: Vivid color tones, background/camera moving quickly, screen switching, | |
subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, | |
style, work, painting, image, still, overall grayish, worst quality, low quality, | |
JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly | |
drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, | |
chaotic background, three legs, crowded background with many people, walking backward | |
silence_duration_s: 0.0 | |
use_fsdp: false | |
tea_cache_l1_thresh: 0 | |
rank: 0 | |
world_size: 1 | |
local_rank: 0 | |
device: cuda | |
num_nodes: 1 | |
i2v: true | |
use_audio: true | |
random_prefix_frames: true | |
model_config: | |
in_dim: 33 | |
audio_hidden_size: 32 | |
train_architecture: lora | |
lora_target_modules: q,k,v,o,ffn.0,ffn.2 | |
init_lora_weights: kaiming | |
lora_rank: 128 | |
lora_alpha: 64.0 |