# Inference configuration, one key per line so that every setting is parsed.
# (Written on a single line, the first " #" comments out everything after it
# and the repeated ": " separators do not form a valid mapping.)

# --- Run / CLI arguments ---
config: configs/inference.yaml
input_file: examples/infer_samples.txt
debug: null
infer: false
hparams: ''
dtype: bf16

# --- Pretrained model paths (OmniAvatar-14B on Wan2.1-T2V-14B) ---
exp_path: pretrained_models/OmniAvatar-14B
text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
# This is the string 'None', not YAML null; quoted to make that explicit.
# NOTE(review): presumably the loader compares against this sentinel string;
# confirm before changing it to null.
image_encoder_path: 'None'
# Comma-separated list of shard files. Kept on one line on purpose: a folded
# or multi-line plain scalar would insert spaces into the value.
dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth

# Alternative (disabled): OmniAvatar-1.3B paths for the same five keys.
# exp_path: pretrained_models/OmniAvatar-1.3B
# text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
# image_encoder_path: 'None'
# dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
# vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth

wav2vec_path: pretrained_models/wav2vec2-base-960h

# --- Runtime ---
# Explicit null (the value was previously left empty, which also loads as null).
num_persistent_param_in_dit: null
reload_cfg: true
sp_size: 1
seed: 42

# --- Allowed image sizes (pairs of integers) ---
image_sizes_720:
  # - [400, 720]
  # - [720, 720]  # commented out due to the duration needed on HF
  - [720, 400]
image_sizes_1280:
  - [720, 720]
  - [528, 960]
  - [960, 528]
  - [720, 1280]
  - [1280, 720]
max_hw: 720
max_tokens: 40000

# --- Sampling ---
seq_len: 200
overlap_frame: 13
guidance_scale: 4.5
audio_scale: null
num_steps: 8
fps: 24
sample_rate: 16000
# Folded scalar: line breaks below become single spaces, so the loaded value
# is one comma-separated line with no trailing newline.
negative_prompt: >-
  Vivid color tones, background/camera moving quickly, screen switching,
  subtitles and special effects, mutation, overexposed, static, blurred
  details, subtitles, style, work, painting, image, still, overall grayish,
  worst quality, low quality, JPEG compression residue, ugly, incomplete,
  extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured,
  malformed limbs, fingers merging, motionless image, chaotic background,
  three legs, crowded background with many people, walking backward
silence_duration_s: 0.0
use_fsdp: false
tea_cache_l1_thresh: 0

# --- Process / device ---
rank: 0
world_size: 1
local_rank: 0
device: cuda
num_nodes: 1

# --- Model ---
i2v: true
use_audio: true
random_prefix_frames: true
# NOTE(review): nesting reconstructed; assumes only in_dim and
# audio_hidden_size belong to model_config and that the LoRA keys below are
# top-level. Confirm against the code that reads this file.
model_config:
  in_dim: 33
  audio_hidden_size: 32
train_architecture: lora
lora_target_modules: 'q,k,v,o,ffn.0,ffn.2'
init_lora_weights: kaiming
lora_rank: 128
lora_alpha: 64.0