' instead of "
hyvideo/constants.py  CHANGED  +90 -90
@@ -1,90 +1,90 @@
 import os
 import torch
 
 __all__ = [
     "C_SCALE",
     "PROMPT_TEMPLATE",
     "MODEL_BASE",
     "PRECISIONS",
     "NORMALIZATION_TYPE",
     "ACTIVATION_TYPE",
     "VAE_PATH",
     "TEXT_ENCODER_PATH",
     "TOKENIZER_PATH",
     "TEXT_PROJECTION",
     "DATA_TYPE",
     "NEGATIVE_PROMPT",
 ]
 
 PRECISION_TO_TYPE = {
-    'fp32': torch.float32,
-    'fp16': torch.float16,
-    'bf16': torch.bfloat16,
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
 }
 
 # =================== Constant Values =====================
 # Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
 # overflow error when tensorboard logging values.
 C_SCALE = 1_000_000_000_000_000
 
 # When using decoder-only models, we must provide a prompt template to instruct the text encoder
 # on how to generate the text.
 # --------------------------------------------------------------------
 PROMPT_TEMPLATE_ENCODE = (
     "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
     "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
     "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
 )
 PROMPT_TEMPLATE_ENCODE_VIDEO = (
     "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
     "1. The main content and theme of the video."
     "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
     "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
     "4. background environment, light, style and atmosphere."
     "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
     "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
 )
 
 NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
 
 PROMPT_TEMPLATE = {
     "dit-llm-encode": {
         "template": PROMPT_TEMPLATE_ENCODE,
         "crop_start": 36,
     },
     "dit-llm-encode-video": {
         "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
         "crop_start": 95,
     },
 }
 
 # ======================= Model ======================
 PRECISIONS = {"fp32", "fp16", "bf16"}
 NORMALIZATION_TYPE = {"layer", "rms"}
 ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
 
 # =================== Model Path =====================
 MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")
 
 # =================== Data =======================
 DATA_TYPE = {"image", "video", "image_video"}
 
 # 3D VAE
 VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
 
 # Text Encoder
 TEXT_ENCODER_PATH = {
     "clipL": f"{MODEL_BASE}/text_encoder_2",
     "llm": f"{MODEL_BASE}/text_encoder",
 }
 
 # Tokenizer
 TOKENIZER_PATH = {
     "clipL": f"{MODEL_BASE}/text_encoder_2",
     "llm": f"{MODEL_BASE}/text_encoder",
 }
 
 TEXT_PROJECTION = {
     "linear",  # Default, an nn.Linear() layer
     "single_refiner",  # Single TokenRefiner. Refer to LI-DiT
 }
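The changed hunk is behavior-neutral: in Python, 'fp16' and "fp16" are the same string, so the commit only normalizes the PRECISION_TO_TYPE literal to the double quotes used everywhere else in the module. For context, a minimal sketch of how PRECISION_TO_TYPE and PRECISIONS would typically be used together to resolve a precision flag into a torch dtype; the resolve_dtype helper is hypothetical, not part of this file:

    import torch

    from hyvideo.constants import PRECISION_TO_TYPE, PRECISIONS

    def resolve_dtype(precision: str) -> torch.dtype:
        # Hypothetical helper: validate a precision flag against the
        # supported set, then map it to the concrete torch dtype.
        if precision not in PRECISIONS:
            raise ValueError(f"precision must be one of {sorted(PRECISIONS)}, got {precision!r}")
        return PRECISION_TO_TYPE[precision]

    print(resolve_dtype("bf16"))  # torch.bfloat16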
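The crop_start values pair with the templates: the template wraps the user prompt in the LLM's chat markup before encoding, and crop_start is the number of leading tokens (the fixed system instruction) to drop from the encoder output afterwards, 36 for the image template and 95 for the video one. A minimal sketch under that reading, with a random tensor standing in for the real LLM encoder output:

    import torch

    from hyvideo.constants import PROMPT_TEMPLATE

    cfg = PROMPT_TEMPLATE["dit-llm-encode-video"]

    # Wrap the user prompt in the chat-style template.
    full_prompt = cfg["template"].format("A corgi surfing a wave at sunset.")

    # Stand-in for the LLM encoder output: (batch, seq_len, hidden_dim).
    hidden_states = torch.randn(1, 256, 4096)

    # Crop the first crop_start tokens, which cover the fixed system
    # instruction, keeping the embeddings from the user prompt onwards.
    text_embeds = hidden_states[:, cfg["crop_start"]:, :]
    print(text_embeds.shape)  # torch.Size([1, 161, 4096])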
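One operational note: MODEL_BASE is read from the environment once, at import time, and baked into VAE_PATH, TEXT_ENCODER_PATH, and TOKENIZER_PATH through f-strings, so any override must be in place before hyvideo.constants is first imported. A small usage sketch (the /data/ckpts path is just an example):

    import os

    # Must run before the first import of hyvideo.constants, because the
    # path dicts are built from MODEL_BASE at module import time.
    os.environ["MODEL_BASE"] = "/data/ckpts"  # example override

    from hyvideo.constants import MODEL_BASE, TEXT_ENCODER_PATH, VAE_PATH

    print(MODEL_BASE)                # /data/ckpts
    print(VAE_PATH["884-16c-hy"])    # /data/ckpts/hunyuan-video-t2v-720p/vae
    print(TEXT_ENCODER_PATH["llm"])  # /data/ckpts/text_encoder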