lvkaokao committed · 653f44e · 1 parent: dca5dbd

support fp32/fp16/bf16 eval.

Files changed:
- app.py +1 -1
- src/display/utils.py +18 -4
- src/leaderboard/read_evals.py +2 -2
- src/submission/check_validity.py +21 -3
- src/submission/submit.py +32 -4
app.py
CHANGED
@@ -572,7 +572,7 @@ with demo:
         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)",
                                              visible=not IS_PUBLIC)
         compute_type = gr.Dropdown(
-            choices=[i.value.name for i in ComputeDtype],
+            choices=[i.value.name for i in ComputeDtype if i.value.name != "All"],
             label="Compute dtype",
             multiselect=False,
             value="float16",
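The dropdown now hides the "All" pseudo-dtype, which exists only as a leaderboard filter value. A minimal runnable sketch of the pattern, with a hypothetical ComputeDtype stand-in mirroring the repo's Enum-of-ModelDetails convention:

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str

class ComputeDtype(Enum):  # hypothetical stand-in for src/display/utils.py
    fp16 = ModelDetails("float16")
    bf16 = ModelDetails("bfloat16")
    fp32 = ModelDetails("float32")
    all = ModelDetails("All")

# "All" is a filter option for the leaderboard UI, not a real compute dtype,
# so the submission form excludes it from the dropdown choices:
choices = [i.value.name for i in ComputeDtype if i.value.name != "All"]
print(choices)  # ['float16', 'bfloat16', 'float32']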
src/display/utils.py
CHANGED
@@ -242,6 +242,9 @@ class WeightDtype(Enum):
     int4 = ModelDetails("int4")
     nf4 = ModelDetails("nf4")
     fp4 = ModelDetails("fp4")
+    fp16 = ModelDetails("float16")
+    bf16 = ModelDetails("bfloat16")
+    fp32 = ModelDetails("float32")
 
     Unknown = ModelDetails("?")
 
@@ -260,6 +263,12 @@ class WeightDtype(Enum):
             return WeightDtype.fp4
         if weight_dtype in ["All"]:
             return WeightDtype.all
+        if weight_dtype in ["float16"]:
+            return WeightDtype.fp16
+        if weight_dtype in ["bfloat16"]:
+            return WeightDtype.bf16
+        if weight_dtype in ["float32"]:
+            return WeightDtype.fp32
         return WeightDtype.Unknown
 
 class ComputeDtype(Enum):
@@ -317,8 +326,9 @@ class Precision(Enum):
     qt_2bit = ModelDetails("2bit")
     qt_3bit = ModelDetails("3bit")
     qt_4bit = ModelDetails("4bit")
-
-
+    qt_8bit = ModelDetails("8bit")
+    qt_16bit = ModelDetails("16bit")
+    qt_32bit = ModelDetails("32bit")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -332,8 +342,12 @@ class Precision(Enum):
             return Precision.qt_3bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
-
-
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["16bit"]:
+            return Precision.qt_16bit
+        if precision in ["32bit"]:
+            return Precision.qt_32bit
         return Precision.Unknown
 
 
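These from_str helpers map the plain strings stored in request/result JSON back onto enum members. The repo spells the mapping out as chained ifs; a table-driven sketch (hypothetical, but behavior-equivalent for the members shown) makes the contract clear:

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str

class WeightDtype(Enum):
    fp16 = ModelDetails("float16")
    bf16 = ModelDetails("bfloat16")
    fp32 = ModelDetails("float32")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(weight_dtype):
        # Same contract as the chained ifs in the diff: match on the
        # display name, fall back to Unknown for anything unrecognized.
        for member in WeightDtype:
            if member.value.name == weight_dtype:
                return member
        return WeightDtype.Unknown

print(WeightDtype.from_str("bfloat16"))  # WeightDtype.bf16
print(WeightDtype.from_str("int9"))      # WeightDtype.Unknown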
src/leaderboard/read_evals.py
CHANGED
@@ -56,7 +56,7 @@ class EvalResult:
 
         # Precision
         precision = Precision.from_str(config.get("precision", "4bit"))
-        quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
+        quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ")))
         weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
         compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
         # double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
@@ -209,7 +209,7 @@ def get_request_file_for_model(requests_path, model_name,
         if (
             req_content["status"] in ["Finished"]
             and req_content["precision"] == precision.split(".")[-1]
-            and req_content["quant_type"] == quant_type
+            and str(req_content["quant_type"]) == quant_type
             and req_content["weight_dtype"] == weight_dtype.split(".")[-1]
             and req_content["compute_dtype"] == compute_dtype.split(".")[-1]
         ):
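Both str() coercions guard against non-string quant_type values: with this commit, unquantized fp16/bf16/fp32 submissions write quant_type as None, which lands in the request JSON as null and comes back from json loading as None rather than a string. Coercing keeps the comparison string-typed on both sides. A small sketch of the case being handled:

import json

req_content = json.loads('{"quant_type": null, "precision": "16bit"}')
# Without coercion, None never compares equal to the string stored in the
# matching result file; str() turns it into a comparable value.
print(req_content["quant_type"])       # None
print(str(req_content["quant_type"]))  # 'None'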
src/submission/check_validity.py
CHANGED
@@ -69,13 +69,27 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
         return True, "uses a gated model.", None
     return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
 
+
 def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
     safetensors = None
     try:
         safetensors = get_safetensors_metadata(model_info.id)
+        num_parameters = 0
+        mem = 0
+        for key in safetensors.parameter_count:
+            if key in ["F16", "BF16"]:
+                mem += safetensors.parameter_count[key] * 2
+            else:
+                mem += safetensors.parameter_count[key] * 4
+
+            num_parameters += safetensors.parameter_count[key]
+
+        params_b = round(num_parameters / 1e9, 2)
+        size_gb = round(mem / 1e9, 2)
+        return params_b, size_gb
     except Exception as e:
-        print(e)
+        print(str(e))
 
     if safetensors is not None:
         model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
@@ -87,9 +101,13 @@ def get_model_size(model_info: ModelInfo, precision: str):
     except AttributeError as e:
         return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
+    # size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
     # model_size = size_factor * model_size
-
+    if precision == "16bit":
+        size_gb = model_size * 2
+    else:
+        size_gb = model_size * 4
+    return model_size, size_gb
 
 KNOWN_SIZE_FACTOR = {
     "gptq": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 12},
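The new try branch estimates parameter count and weight memory directly from safetensors metadata, using the same simplification as the diff: 2 bytes per F16/BF16 parameter, 4 bytes per parameter otherwise. A worked sketch with made-up counts:

# Hypothetical parameter_count for a ~7B model stored mostly in F16:
parameter_count = {"F16": 6_738_415_616, "F32": 2_048}

# 2 bytes for half-precision keys, 4 bytes for everything else.
mem = sum(n * (2 if key in ("F16", "BF16") else 4)
          for key, n in parameter_count.items())
num_parameters = sum(parameter_count.values())

print(round(num_parameters / 1e9, 2))  # 6.74  (parameters, billions)
print(round(mem / 1e9, 2))             # 13.48 (estimated weight memory, GB)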
src/submission/submit.py
CHANGED
@@ -157,11 +157,36 @@ def add_new_eval(
     weight_dtype = "int2"
 
     if quant_type is None or quant_type == "":
-        return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
-
-    model_params, model_size = get_quantized_model_parameters_memory(model_info,
+        # return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
+        # for eval fp32/fp16/bf16
+        quant_type = None
+
+        if quant_type is None:
+            weight_dtype = str(getattr(model_config, "torch_dtype", "float16"))
+            if weight_dtype in ["torch.float16", "float16"]:
+                weight_dtype = "float16"
+                precision = "16bit"
+            elif weight_dtype in ["torch.bfloat16", "bfloat16"]:
+                weight_dtype = "bfloat16"
+                precision = "16bit"
+            elif weight_dtype in ["torch.float32", "float32"]:
+                weight_dtype = "float32"
+                precision = "32bit"
+            else:
+                weight_dtype = "?"
+                precision = "?"
+            model_type = "original"
+            model_params, model_size = get_model_size(model_info=model_info, precision=precision)
+        else:
+            model_params, model_size = get_quantized_model_parameters_memory(model_info,
                                                                              quant_method=quant_type.lower(),
                                                                              bits=precision)
+            model_type = "quantization"
+    else:
+        model_params, model_size = get_quantized_model_parameters_memory(model_info,
+                                                                         quant_method=quant_type.lower(),
+                                                                         bits=precision)
+        model_type = "quantization"
 
     if quant_type == "llama.cpp":
         hardware = "cpu"
@@ -170,6 +195,9 @@ def add_new_eval(
     else:
         hardware = "gpu"
 
+    if compute_dtype == "?":
+        compute_dtype = "float16"
+
     eval_entry = {
         "model": model,
         "revision": revision,
@@ -187,7 +215,7 @@ def add_new_eval(
         "hardware": hardware,
         "status": "Pending",
         "submitted_time": current_time,
-        "model_type":
+        "model_type": model_type,
         "job_id": -1,
         "job_start_time": None,
         "scripts": script
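The core of this change is the torch_dtype detection: an empty quant_type now means an unquantized submission, whose weight dtype and precision are inferred from the model config. Note that because quant_type is set to None unconditionally in that branch, only the inner `if quant_type is None:` path can run. A runnable sketch of the detection, where FakeConfig stands in for the transformers config object the submission flow already loads (the attribute may be a torch.dtype or a plain string, so both spellings are matched after str()):

class FakeConfig:  # hypothetical stand-in for a transformers model config
    torch_dtype = "bfloat16"

model_config = FakeConfig()
weight_dtype = str(getattr(model_config, "torch_dtype", "float16"))
if weight_dtype in ["torch.float16", "float16"]:
    weight_dtype, precision = "float16", "16bit"
elif weight_dtype in ["torch.bfloat16", "bfloat16"]:
    weight_dtype, precision = "bfloat16", "16bit"
elif weight_dtype in ["torch.float32", "float32"]:
    weight_dtype, precision = "float32", "32bit"
else:
    weight_dtype, precision = "?", "?"

print(weight_dtype, precision)  # bfloat16 16bit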