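"""Command-line argument definitions for the HunyuanVideo inference script.

(Descriptive docstring added for readability; the wording is inferred from the
parser description and argument groups below.)
"""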
import argparse
import re

from .constants import *
from .modules.models import HUNYUAN_VIDEO_CONFIG


def parse_args(namespace=None):
    parser = argparse.ArgumentParser(description="HunyuanVideo inference script")

    parser = add_network_args(parser)
    parser = add_extra_models_args(parser)
    parser = add_denoise_schedule_args(parser)
    parser = add_inference_args(parser)
    parser = add_parallel_args(parser)

    args = parser.parse_args(namespace=namespace)
    args = sanity_check_args(args)

    return args


def add_network_args(parser: argparse.ArgumentParser):
    group = parser.add_argument_group(title="HunyuanVideo network args")

    # Main model
    group.add_argument(
        "--model",
        type=str,
        choices=list(HUNYUAN_VIDEO_CONFIG.keys()),
        default="HYVideo-T/2-cfgdistill",
    )
    group.add_argument(
        "--latent-channels",
        type=int,
        default=16,
        help="Number of latent channels of DiT. If None, it will be determined by `vae`. If provided, "
        "it still needs to match the latent channels of the VAE model.",
    )
    group.add_argument(
        "--precision",
        type=str,
        default="bf16",
        choices=PRECISIONS,
        help="Precision mode. Options: fp32, fp16, bf16. Applied to the backbone model and optimizer.",
    )

    # RoPE
    group.add_argument(
        "--rope-theta", type=int, default=256, help="Theta used in RoPE."
    )
    return parser


def add_extra_models_args(parser: argparse.ArgumentParser):
    group = parser.add_argument_group(
        title="Extra models args, including VAE, text encoders and tokenizers"
    )
    # - VAE
    group.add_argument(
        "--vae",
        type=str,
        default="884-16c-hy",
        choices=list(VAE_PATH),
        help="Name of the VAE model.",
    )
    group.add_argument(
        "--vae-precision",
        type=str,
        default="fp16",
        choices=PRECISIONS,
        help="Precision mode for the VAE model.",
    )
    group.add_argument(
        "--vae-tiling",
        action="store_true",
        help="Enable tiling for the VAE model to save GPU memory.",
    )
    group.set_defaults(vae_tiling=True)

    # - Text encoder
    group.add_argument(
        "--text-encoder",
        type=str,
        default="llm",
        choices=list(TEXT_ENCODER_PATH),
        help="Name of the text encoder model.",
    )
    group.add_argument(
        "--text-encoder-precision",
        type=str,
        default="fp16",
        choices=PRECISIONS,
        help="Precision mode for the text encoder model.",
    )
    group.add_argument(
        "--text-states-dim",
        type=int,
        default=4096,
        help="Dimension of the text encoder hidden states.",
    )
    group.add_argument(
        "--text-len", type=int, default=256, help="Maximum length of the text input."
    )
    group.add_argument(
        "--tokenizer",
        type=str,
        default="llm",
        choices=list(TOKENIZER_PATH),
        help="Name of the tokenizer model.",
    )
    group.add_argument(
        "--prompt-template",
        type=str,
        default="dit-llm-encode",
        choices=PROMPT_TEMPLATE,
        help="Image prompt template for the decoder-only text encoder model.",
    )
    group.add_argument(
        "--prompt-template-video",
        type=str,
        default="dit-llm-encode-video",
        choices=PROMPT_TEMPLATE,
        help="Video prompt template for the decoder-only text encoder model.",
    )
    group.add_argument(
        "--hidden-state-skip-layer",
        type=int,
        default=2,
        help="Skip layer for hidden states.",
    )
    group.add_argument(
        "--apply-final-norm",
        action="store_true",
        help="Apply final normalization to the used text encoder hidden states.",
    )

    # - CLIP
    group.add_argument(
        "--text-encoder-2",
        type=str,
        default="clipL",
        choices=list(TEXT_ENCODER_PATH),
        help="Name of the second text encoder model.",
    )
    group.add_argument(
        "--text-encoder-precision-2",
        type=str,
        default="fp16",
        choices=PRECISIONS,
        help="Precision mode for the second text encoder model.",
    )
    group.add_argument(
        "--text-states-dim-2",
        type=int,
        default=768,
        help="Dimension of the second text encoder hidden states.",
    )
    group.add_argument(
        "--tokenizer-2",
        type=str,
        default="clipL",
        choices=list(TOKENIZER_PATH),
        help="Name of the second tokenizer model.",
    )
    group.add_argument(
        "--text-len-2",
        type=int,
        default=77,
        help="Maximum length of the second text input.",
    )
    return parser


def add_denoise_schedule_args(parser: argparse.ArgumentParser):
    group = parser.add_argument_group(title="Denoise schedule args")

    group.add_argument(
        "--denoise-type",
        type=str,
        default="flow",
        help="Denoise type for noised inputs.",
    )

    # Flow Matching
    group.add_argument(
        "--flow-shift",
        type=float,
        default=7.0,
        help="Shift factor for flow matching schedulers.",
    )
    group.add_argument(
        "--flow-reverse",
        action="store_true",
        help="If reverse, learning/sampling from t=1 -> t=0.",
    )
    group.add_argument(
        "--flow-solver",
        type=str,
        default="euler",
        help="Solver for flow matching.",
    )
    group.add_argument(
        "--use-linear-quadratic-schedule",
        action="store_true",
        help="Use linear quadratic schedule for flow matching, "
        "following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper).",
    )
    group.add_argument(
        "--linear-schedule-end",
        type=int,
        default=25,
        help="End step for linear quadratic schedule for flow matching.",
    )
    return parser


def add_inference_args(parser: argparse.ArgumentParser):
    group = parser.add_argument_group(title="Inference args")

    # ======================== Model loads ========================
    group.add_argument(
        "--model-base",
        type=str,
        default="ckpts",
        help="Root path of all the models, including t2v models and extra models.",
    )
    group.add_argument(
        "--dit-weight",
        type=str,
        default="ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
        help="Path to the HunyuanVideo model. If None, search the model under `args.model_base`. "
        "1. If it is a file, load the model directly. "
        "2. If it is a directory, search the model in the directory. Two naming patterns are supported: "
        "1) `pytorch_model_*.pt`; "
        "2) `*_model_states.pt`, where * can be `mp_rank_00`.",
    )
    group.add_argument(
        "--model-resolution",
        type=str,
        default="540p",
        choices=["540p", "720p"],
        help="Resolution variant of the model to load.",
    )
    group.add_argument(
        "--load-key",
        type=str,
        default="module",
        help="Key to load the model states. 'module' for the main model, 'ema' for the EMA model.",
    )
    group.add_argument(
        "--use-cpu-offload",
        action="store_true",
        help="Use CPU offload for the model load.",
    )

    # ======================== Inference general setting ========================
    group.add_argument(
        "--batch-size",
        type=int,
        default=1,
        help="Batch size for inference and evaluation.",
    )
    group.add_argument(
        "--infer-steps",
        type=int,
        default=50,
        help="Number of denoising steps for inference.",
    )
    group.add_argument(
        "--disable-autocast",
        action="store_true",
        help="Disable autocast for denoising loop and vae decoding in pipeline sampling.",
    )
    group.add_argument(
        "--save-path",
        type=str,
        default="./results",
        help="Path to save the generated samples.",
    )
    group.add_argument(
        "--save-path-suffix",
        type=str,
        default="",
        help="Suffix for the directory of saved samples.",
    )
    group.add_argument(
        "--name-suffix",
        type=str,
        default="",
        help="Suffix for the names of saved samples.",
    )
    group.add_argument(
        "--num-videos",
        type=int,
        default=1,
        help="Number of videos to generate for each prompt.",
    )
    # --- sample size ---
    group.add_argument(
        "--video-size",
        type=int,
        nargs="+",
        default=(720, 1280),
        help="Video size for inference. If a single value is provided, it will be used for both "
        "height and width. If two values are provided, they will be used for height and width "
        "respectively.",
    )
    group.add_argument(
        "--video-length",
        type=int,
        default=129,
        help="How many frames to sample from a video. If using a 3D VAE, the number should be 4n+1.",
    )
    # --- prompt ---
    group.add_argument(
        "--prompt",
        type=str,
        default=None,
        help="Prompt for sampling during evaluation.",
    )
    group.add_argument(
        "--seed-type",
        type=str,
        default="auto",
        choices=["file", "random", "fixed", "auto"],
        help="Seed type for evaluation. If file, use the seed from the CSV file. If random, generate a "
        "random seed. If fixed, use the fixed seed given by `--seed`. If auto, `csv` will use the "
        "seed column if available, otherwise use the fixed `seed` value. `prompt` will use the "
        "fixed `seed` value.",
    )
    group.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")

    # Classifier-Free Guidance
    group.add_argument(
        "--neg-prompt", type=str, default=None, help="Negative prompt for sampling."
    )
    group.add_argument(
        "--cfg-scale", type=float, default=1.0, help="Classifier free guidance scale."
    )
    group.add_argument(
        "--embedded-cfg-scale",
        type=float,
        default=6.0,
        help="Embedded classifier free guidance scale.",
    )
    group.add_argument(
        "--use-fp8",
        action="store_true",
        help="Enable FP8 for inference acceleration.",
    )
    group.add_argument(
        "--reproduce",
        action="store_true",
        help="Enable reproducibility by setting random seeds and deterministic algorithms.",
    )
    return parser


def add_parallel_args(parser: argparse.ArgumentParser):
    group = parser.add_argument_group(title="Parallel args")

    # ======================== Parallelism ========================
    group.add_argument(
        "--ulysses-degree",
        type=int,
        default=1,
        help="Ulysses degree.",
    )
    group.add_argument(
        "--ring-degree",
        type=int,
        default=1,
        help="Ring attention degree.",
    )
    return parser


def sanity_check_args(args):
    # VAE channels: names follow the pattern '<compression>-<channels>c-<tag>',
    # e.g. the default '884-16c-hy' has 16 latent channels.
    vae_pattern = r"\d{2,3}-\d{1,2}c-\w+"
    if not re.match(vae_pattern, args.vae):
        raise ValueError(
            f"Invalid VAE model: {args.vae}. Must be in the format of '{vae_pattern}'."
        )
    vae_channels = int(args.vae.split("-")[1][:-1])
    if args.latent_channels is None:
        args.latent_channels = vae_channels
    if vae_channels != args.latent_channels:
        raise ValueError(
            f"Latent channels ({args.latent_channels}) must match the VAE channels ({vae_channels})."
        )
    return args
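

# A minimal usage sketch (not part of the upstream module): parse the default
# arguments and print a few derived values. No checkpoints are touched at
# parse time, so this runs without the `ckpts/` directory being present.
if __name__ == "__main__":
    args = parse_args()
    print(f"model={args.model}, precision={args.precision}")
    print(f"vae={args.vae} -> latent_channels={args.latent_channels}")
    print(f"video_size={args.video_size}, video_length={args.video_length}")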

