# Inference run settings.
# NOTE(review): key meanings in this file are inferred from their names only;
# confirm against the code that loads this config.
# NOTE(review): `config` names configs/inference.yaml, which may be this very
# file - confirm whether the self-reference is intended.
config: configs/inference.yaml

input_file: examples/infer_samples.txt
# Explicit YAML null (unset).
debug: null
infer: false
# Empty string, not null.
hparams: ''
dtype: bf16

# Checkpoint paths for the 14B setup (active).
exp_path: pretrained_models/OmniAvatar-14B
text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
# NOTE(review): plain `None` is not a YAML null; it loads as the string "None"
# (unlike the `null` used for `debug`). Confirm the consumer expects that
# string before changing it.
image_encoder_path: None
# One comma-separated string (no spaces) listing six safetensors shards.
# Presumably split on "," by the loader - TODO confirm. Do not wrap this line:
# folding would insert whitespace into the value.
dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth

# Alternative checkpoint paths for the 1.3B setup (commented out).
# exp_path: pretrained_models/OmniAvatar-1.3B
# text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
# image_encoder_path: None
# dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
# vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth

# Path to the wav2vec2 checkpoint directory.
# NOTE(review): presumably the audio feature extractor - confirm with the loader.
wav2vec_path: pretrained_models/wav2vec2-base-960h
# Explicit null (unset), written out rather than left as a bare `key:` so the
# intent is unambiguous. NOTE(review): how the consumer treats null here is
# not visible in this file.
num_persistent_param_in_dit: null
reload_cfg: true
sp_size: 1
seed: 42
# Lists of two-integer size pairs, one list per key.
# NOTE(review): presumably candidate output resolutions; the axis order
# (height/width vs. width/height) is not visible here - TODO confirm.
image_sizes_720:
# Disabled entries, kept for reference:
# - [400, 720]
# - [720, 720]  # commented out due to the duration needed on HF
- [720, 400]
image_sizes_1280:
- [720, 720]
- [528, 960]
- [960, 528]
- [720, 1280]
- [1280, 720]
# Size/length limits and sampling settings.
# NOTE(review): units and exact meanings are not visible in this file;
# confirm with the consumer.
max_hw: 720
max_tokens: 40000
seq_len: 200
overlap_frame: 13
guidance_scale: 4.5
# Explicit YAML null (unset).
audio_scale: null
num_steps: 8
fps: 24
# presumably the audio sample rate in Hz - TODO confirm
sample_rate: 16000
# Negative prompt text. Written as a folded block scalar: line breaks fold to
# single spaces and `>-` strips the trailing newline, so the loaded value is
# one line of comma-separated phrases.
negative_prompt: >-
  Vivid color tones, background/camera moving quickly,
  screen switching, subtitles and special effects, mutation,
  overexposed, static, blurred details, subtitles, style, work,
  painting, image, still, overall grayish, worst quality,
  low quality, JPEG compression residue, ugly, incomplete,
  extra fingers, poorly drawn hands, poorly drawn face, deformed,
  disfigured, malformed limbs, fingers merging, motionless image,
  chaotic background, three legs, crowded background with many people,
  walking backward
# Runtime and feature switches.
# NOTE(review): meanings are inferred from key names; confirm with the consumer.
silence_duration_s: 0.0
use_fsdp: false
tea_cache_l1_thresh: 0
# As written these describe one process on one node (rank 0 of world_size 1).
# NOTE(review): they may be overwritten at runtime by a launcher - confirm.
rank: 0
world_size: 1
local_rank: 0
device: cuda
num_nodes: 1
i2v: true
use_audio: true
random_prefix_frames: true
# Model overrides: a nested mapping with two integer fields.
# NOTE(review): which model class consumes these is not visible here.
model_config:
  in_dim: 33
  audio_hidden_size: 32
# LoRA settings.
train_architecture: lora
# One comma-separated string (no spaces) of module names; presumably split on
# "," by the consumer - TODO confirm.
lora_target_modules: q,k,v,o,ffn.0,ffn.2
init_lora_weights: kaiming
lora_rank: 128
# Written as a float (64.0), unlike the integer lora_rank above.
lora_alpha: 64.0