---
# Fine-tuning run configuration.
#
# Every setting is written in block style, one key per line, with top-level
# keys kept in alphabetical order. Nested mappings (`data`,
# `noise_scheduler_kwargs`) and lists (`metadata_paths`, `trainable_modules`)
# use 2-space indentation.
# NOTE(review): the nesting was reconstructed from the alphabetical key order
# of a flattened original — confirm it against the schema the consumer expects.

# Optimizer hyperparameters (key names suggest Adam/AdamW — TODO confirm).
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-08
adam_weight_decay: 0.01

allow_tf32: true

# NOTE(review): checkpointing_steps (5000) is larger than max_train_steps
# (3500) set further down. If checkpoints are only written every
# `checkpointing_steps` steps, this run would never reach one — confirm that
# this is intended.
checkpointing_steps: 5000
checkpoints_total_limit: 20

# Dataset settings.
data:
  audio_margin: 2
  dynamic_past_frames: false
  # Presumably the frame size in pixels (height x width) — verify.
  height: 512
  metadata_paths:
    - data/embedding/metadata.jsonl
  n_sample_frames: 16
  num_past_frames: 16
  width: 512

dataloader_num_workers: 16
enable_xformers_memory_efficient_attention: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
learning_rate: 1.0e-05

# Presumably the parameters of the `logit_normal` weighting scheme selected
# by `weighting_scheme` at the bottom of this file — verify against the
# training code.
logit_mean: 0.0
logit_std: 1.0

lr_scheduler: constant
lr_warmup_steps: 0
max_grad_norm: 1.0

# NOTE(review): both max_train_steps and num_train_epochs (below) are set;
# which one bounds the run depends on the training script — confirm.
max_train_steps: 3500

mixed_precision: bf16
mode_scale: 1.29
model_name_or_path: memoavatar/memo

noise_scheduler_kwargs:
  num_train_timesteps: 1000

num_train_epochs: 700
output_dir: outputs/finetune
prefetch_factor: 4

# Explicit null: no checkpoint path is configured to resume from.
resume_from_checkpoint: null

robust_training: true
scale_lr: false
seed: 42
start_ratio: 0.05
tracker_project_name: memo

# Per-component training switches; only `train_diffusion_net` is enabled.
train_audio_proj: false
train_batch_size: 2
train_diffusion_net: true
train_image_proj: false
train_reference_net: false

# Presumably name filters selecting which sub-modules receive gradients —
# verify against the training code.
trainable_modules:
  - motion_modules
  - audio_modules

uncond_audio_ratio: 0.05
uncond_img_ratio: 0.05
use_8bit_adam: true
use_ema: false
vae: stabilityai/sd-vae-ft-mse
weighting_scheme: logit_normal