| defaults: | |
| - data: t5_clap # change here to load different data in testing (data.AudioCaps_test) | |
| - override hydra/job_logging: custom-simplest | |
| - _self_ | |
| hydra: | |
| run: | |
| dir: ./exps/${exp_id} | |
| output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra | |
| enable_email: False | |
| ## model | |
| model: meanaudio_mf | |
| text_encoder_name: t5_clap # [t5, clip, t5_clap, t5_clap_cat]: change here for different feature utils (only for runner-FeatureUtils/infer, not used when using a pre-computed dataset) | |
| concat_text_fc: False | |
| exp_id: default | |
| debug: False | |
| cudnn_benchmark: True | |
| compile: False # set compile to false by default | |
| amp: True | |
| weights: null | |
| # weights: null | |
| checkpoint: null | |
| seed: 14159265 | |
| num_workers: 10 # per-GPU | |
| pin_memory: False # set to True if your system can handle it, i.e., has enough memory | |
| # NOTE: This DOES NOT affect the model during inference in any way | |
| # they are just for the dataloader to fill in the missing data in multi-modal loading | |
| # to change the sequence length for the model, see networks.py | |
| data_dim: | |
| text_seq_len: 77 | |
| text_dim: 1024 | |
| text_c_dim: 512 # 1024 for pooled T5, 512 for CLAP | |
| # ema configuration | |
| ema: | |
| enable: True | |
| sigma_rels: [0.05, 0.1] | |
| update_every: 1 | |
| checkpoint_every: 10_000 | |
| checkpoint_folder: ${hydra:run.dir}/ema_ckpts | |
| default_output_sigma: 0.05 | |
| # sampling, only for flow matching | |
| sampling: | |
| mean: 0.0 | |
| scale: 1.0 | |
| min_sigma: 0.0 | |
| method: euler | |
| num_steps: 25 | |
| # classifier-free guidance | |
| null_condition_probability: 0.1 | |
| cfg_strength: 1 | |
| # checkpoint paths to external modules | |
| vae_16k_ckpt: ./weights/v1-16.pth | |
| vae_44k_ckpt: ./weights/v1-44.pth | |
| bigvgan_vocoder_ckpt: ./weights/best_netG.pt |