Update README.md
README.md
CHANGED
@@ -483,139 +483,3 @@ TODO: Add `gptqmodel.utils.eval` integration and auto-generation of eval table.
 
 ---
 *Generated and quantized using GPTQModel.*
-"""
-    readme_path = os.path.join(quantized_model_dir, "README.md")
-    with open(readme_path, "w") as f:
-        f.write(readme_content)
-    typer.echo("README.md created with detailed information.")
-
-
-@app.command()
-def main(
-        seq_len: int = typer.Option(4096, help="Sequence length for tokenization and calibration."),
-        nsamples: int = typer.Option(256, help="Number of samples to use for calibration."),
-        source_model: str = typer.Option("arcee-ai/Virtuoso-Medium-v2",
-                                         help="Source model HF repository identifier."),
-        calibration_dataset: str = typer.Option("wikitext/wikitext-2-raw-v1",
-                                                help="Calibration dataset identifier (in 'dataset/config' format) or local file path."),
-        hf_token: str = typer.Option(HF_TOKEN,
-                                     help="Hugging Face token for creating/updating your repo."),
-        upload_only: bool = typer.Option(False, help="Only upload the quantized model to the Hugging Face Hub."),
-        group_size: GroupSize = typer.Option(GroupSize.accurate, help="Group size for quantization accurate: 32, "
-                                                                      "balanced: 64, fast: 128. Default: accurate."),
-        mse: bool = typer.Option(True, help="Use mse instead of mae for the loss function."),
-):
-    # Prepare destination directory and model names.
-    model_name = source_model.split("/")[-1]
-    quantized_model_name = f"{model_name}_gptq_g{int(group_size.value)}_4bit"
-    quantized_model_dir = os.path.expanduser(os.path.join("~/models/quantized", quantized_model_name))
-
-    if not os.path.exists(quantized_model_dir) or not upload_only:
-        os.makedirs(quantized_model_dir, exist_ok=True)
-
-        typer.echo("Loading tokenizer from source model...")
-        tokenizer_obj = AutoTokenizer.from_pretrained(source_model, use_fast=True)
-
-        typer.echo("Loading calibration dataset...")
-        typer.echo(f"Calibration dataset: {calibration_dataset}")
-        calibration_data = get_calibration_dataset(tokenizer_obj, nsamples, seq_len, calibration_dataset)
-        if not calibration_data:
-            typer.echo("Calibration dataset is empty. Aborting.", err=True)
-            raise typer.Exit(code=1)
-
-        if mse:
-            # Fits mistral-small-24b particularly well, as well as the increased damp_percent
-            mse = 0.01
-            quantize_config = QuantizeConfig(bits=4, group_size=int(group_size.value), damp_percent=0.015, mse=mse)
-        else:
-            quantize_config = QuantizeConfig(bits=4, group_size=int(group_size.value), damp_percent=0.01)
-
-        device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        typer.echo(f"Loading model in {device} mode...")
-        model = GPTQModel.load(source_model, quantize_config)
-
-        typer.echo("Quantizing model...")
-        group_size_factor = int(128 / int(group_size.value))
-        model.quantize(calibration_data, auto_gc=False, batch_size=int((nsamples * 0.1) / group_size_factor))
-
-        # Retrieve Hugging Face user info for README generation.
-        package_versions = get_pinned_package_versions()
-        username = get_my_user(hf_token)
-        script_content = self_read_script()
-
-        typer.echo(f"Saving quantized model to {quantized_model_dir} using Transformers safe serialization...")
-        try:
-            model.save_pretrained(quantized_model_dir)
-            tokenizer_obj.save_pretrained(quantized_model_dir)
-        except Exception as ex:
-            typer.echo(f"Error during saving with safe_serialization: {ex}. Aborting.")
-            raise
-
-        typer.echo(f"Model saved to: {quantized_model_dir}")
-    else:
-        tokenizer_obj = AutoTokenizer.from_pretrained(source_model, use_fast=True)
-        package_versions = get_pinned_package_versions()
-        username = get_my_user(hf_token)
-        script_content = self_read_script()
-        device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-    # Load the quantized model for perplexity calculation
-    typer.echo("Loading quantized model for evaluation...")
-    model = GPTQModel.load(quantized_model_dir, device=device)
-
-    # Calculate perplexity with improved method
-    avg_ppl = calculate_avg_ppl(model, tokenizer_obj)
-    typer.echo(f"Final perplexity result: {avg_ppl}")
-
-    deps = Path("./pyproject.toml")
-    if deps.exists():
-        shutil.copy(deps, quantized_model_dir)
-
-    generate_readme(calibration_dataset, nsamples, quantized_model_dir,
-                    quantized_model_name, script_content, seq_len, source_model, username, avg_ppl)
-
-    typer.echo("Uploading to Hugging Face Hub...")
-    GPTQModel.push_to_hub(quantized_path=quantized_model_dir, private=False, repo_id=quantized_model_name,
-                          token=HF_TOKEN)
-
-    typer.echo(f"Model uploaded to Hugging Face repo: {quantized_model_name}")
-
-    # Run a quick inference demo
-    demo_input = tokenizer_obj("test is", return_tensors="pt").to(device)
-    generated_ids = model.generate(**demo_input)
-    output_text = tokenizer_obj.decode(generated_ids[0])
-    typer.echo(f"Inference demo output: {output_text}")
-    typer.echo(f"Final perplexity on test dataset: {avg_ppl}")
-
-
-if __name__ == "__main__":
-    app()
-
-```
-
-## Quantization Performance
-
-Average perplexity (PPL) on WikiText-2 test dataset: **N/A (Error: Invalid thread config: max_m_blocks = 0, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [7, 576, 576] and num_bits = 4, group_size = 32, has_act_order = 1, is_k_full = 1, max_shared_mem = 166912)**
-
-*Perplexity calculated using manual calculation method*
-
-## Disclaimer
-This model is for research purposes only. It may inherit limitations and biases from the original model and the quantization process. Please use responsibly and refer to the original model card for more details.
-
-## Contact
-For any questions or support, please visit [ConfidentialMind.com](https://www.confidentialmind.com) or contact us directly.
-
-## License
-This model inherits the license from the original model. Please refer to the original model card for more details.
-Original model card: `HuggingFaceTB/SmolLM-135M`
-
-## Author
-This model was quantized by [Jaro](https://www.linkedin.com/in/jaroai/)
-
-## Acknowledgements
-Quantization performed using the GPTQModel pipeline.
-
-TODO: Add `gptqmodel.utils.eval` integration and auto-generation of eval table.
-
----
-*Generated and quantized using GPTQModel.*
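The removed script relies on a `calculate_avg_ppl(model, tokenizer_obj)` helper, and the removed README notes that perplexity was computed with a "manual calculation method", but the helper itself is not part of this diff. Below is a minimal sketch of what such a manual WikiText-2 perplexity pass could look like, assuming the quantized model exposes a standard Hugging Face causal-LM forward that accepts `labels`; the function body is an illustration under those assumptions, not the author's actual implementation.

```python
import torch
from datasets import load_dataset


def calculate_avg_ppl(model, tokenizer, seq_len=4096, device="cuda:0"):
    """Assumed sketch: average perplexity over the WikiText-2 raw test split."""
    test_text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
    input_ids = tokenizer(test_text, return_tensors="pt").input_ids.to(device)

    nlls = []
    total_tokens = 0
    # Slide a non-overlapping window over the token stream and sum per-token NLLs.
    for start in range(0, input_ids.size(1) - 1, seq_len):
        chunk = input_ids[:, start:start + seq_len]
        if chunk.size(1) < 2:
            break  # a single leftover token yields no next-token prediction
        with torch.no_grad():
            # With labels == input_ids, a HF causal LM shifts labels internally and
            # returns the mean next-token cross-entropy for this chunk.
            loss = model(input_ids=chunk, labels=chunk).loss
        n_predicted = chunk.size(1) - 1
        nlls.append(loss.float() * n_predicted)
        total_tokens += n_predicted

    return torch.exp(torch.stack(nlls).sum() / total_tokens).item()
```

Non-overlapping windows keep the evaluation cheap; a strided, overlapping pass would give a slightly tighter estimate at higher compute cost.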