JustJaro committed · Commit 7f7d82c · verified · 1 Parent(s): c9a730b

Update README.md

Files changed (1)
  1. README.md +0 -136
README.md CHANGED
@@ -483,139 +483,3 @@ TODO: Add `gptqmodel.utils.eval` integration and auto-generation of eval table.
 
  ---
  *Generated and quantized using GPTQModel.*
- """
-     readme_path = os.path.join(quantized_model_dir, "README.md")
-     with open(readme_path, "w") as f:
-         f.write(readme_content)
-     typer.echo("README.md created with detailed information.")
-
-
- @app.command()
- def main(
-     seq_len: int = typer.Option(4096, help="Sequence length for tokenization and calibration."),
-     nsamples: int = typer.Option(256, help="Number of samples to use for calibration."),
-     source_model: str = typer.Option("arcee-ai/Virtuoso-Medium-v2",
-                                      help="Source model HF repository identifier."),
-     calibration_dataset: str = typer.Option("wikitext/wikitext-2-raw-v1",
-                                             help="Calibration dataset identifier (in 'dataset/config' format) or local file path."),
-     hf_token: str = typer.Option(HF_TOKEN,
-                                  help="Hugging Face token for creating/updating your repo."),
-     upload_only: bool = typer.Option(False, help="Only upload the quantized model to the Hugging Face Hub."),
-     group_size: GroupSize = typer.Option(GroupSize.accurate, help="Group size for quantization accurate: 32, "
-                                                                   "balanced: 64, fast: 128. Default: accurate."),
-     mse: bool = typer.Option(True, help="Use mse instead of mae for the loss function."),
- ):
-     # Prepare destination directory and model names.
-     model_name = source_model.split("/")[-1]
-     quantized_model_name = f"{model_name}_gptq_g{int(group_size.value)}_4bit"
-     quantized_model_dir = os.path.expanduser(os.path.join("~/models/quantized", quantized_model_name))
-
-     if not os.path.exists(quantized_model_dir) or not upload_only:
-         os.makedirs(quantized_model_dir, exist_ok=True)
-
-         typer.echo("Loading tokenizer from source model...")
-         tokenizer_obj = AutoTokenizer.from_pretrained(source_model, use_fast=True)
-
-         typer.echo("Loading calibration dataset...")
-         typer.echo(f"Calibration dataset: {calibration_dataset}")
-         calibration_data = get_calibration_dataset(tokenizer_obj, nsamples, seq_len, calibration_dataset)
-         if not calibration_data:
-             typer.echo("Calibration dataset is empty. Aborting.", err=True)
-             raise typer.Exit(code=1)
-
-         if mse:
-             # Fits mistral-small-24b particularly well, as well as the increased damp_percent
-             mse = 0.01
-             quantize_config = QuantizeConfig(bits=4, group_size=int(group_size.value), damp_percent=0.015, mse=mse)
-         else:
-             quantize_config = QuantizeConfig(bits=4, group_size=int(group_size.value), damp_percent=0.01)
-
-         device = "cuda:0" if torch.cuda.is_available() else "cpu"
-         typer.echo(f"Loading model in {device} mode...")
-         model = GPTQModel.load(source_model, quantize_config)
-
-         typer.echo("Quantizing model...")
-         group_size_factor = int(128 / int(group_size.value))
-         model.quantize(calibration_data, auto_gc=False, batch_size=int((nsamples * 0.1) / group_size_factor))
-
-         # Retrieve Hugging Face user info for README generation.
-         package_versions = get_pinned_package_versions()
-         username = get_my_user(hf_token)
-         script_content = self_read_script()
-
-         typer.echo(f"Saving quantized model to {quantized_model_dir} using Transformers safe serialization...")
-         try:
-             model.save_pretrained(quantized_model_dir)
-             tokenizer_obj.save_pretrained(quantized_model_dir)
-         except Exception as ex:
-             typer.echo(f"Error during saving with safe_serialization: {ex}. Aborting.")
-             raise
-
-         typer.echo(f"Model saved to: {quantized_model_dir}")
-     else:
-         tokenizer_obj = AutoTokenizer.from_pretrained(source_model, use_fast=True)
-         package_versions = get_pinned_package_versions()
-         username = get_my_user(hf_token)
-         script_content = self_read_script()
-         device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-     # Load the quantized model for perplexity calculation
-     typer.echo("Loading quantized model for evaluation...")
-     model = GPTQModel.load(quantized_model_dir, device=device)
-
-     # Calculate perplexity with improved method
-     avg_ppl = calculate_avg_ppl(model, tokenizer_obj)
-     typer.echo(f"Final perplexity result: {avg_ppl}")
-
-     deps = Path("./pyproject.toml")
-     if deps.exists():
-         shutil.copy(deps, quantized_model_dir)
-
-     generate_readme(calibration_dataset, nsamples, quantized_model_dir,
-                     quantized_model_name, script_content, seq_len, source_model, username, avg_ppl)
-
-     typer.echo("Uploading to Hugging Face Hub...")
-     GPTQModel.push_to_hub(quantized_path=quantized_model_dir, private=False, repo_id=quantized_model_name,
-                           token=HF_TOKEN)
-
-     typer.echo(f"Model uploaded to Hugging Face repo: {quantized_model_name}")
-
-     # Run a quick inference demo
-     demo_input = tokenizer_obj("test is", return_tensors="pt").to(device)
-     generated_ids = model.generate(**demo_input)
-     output_text = tokenizer_obj.decode(generated_ids[0])
-     typer.echo(f"Inference demo output: {output_text}")
-     typer.echo(f"Final perplexity on test dataset: {avg_ppl}")
-
-
- if __name__ == "__main__":
-     app()
-
- ```
-
- ## Quantization Performance
-
- Average perplexity (PPL) on WikiText-2 test dataset: **N/A (Error: Invalid thread config: max_m_blocks = 0, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [7, 576, 576] and num_bits = 4, group_size = 32, has_act_order = 1, is_k_full = 1, max_shared_mem = 166912)**
-
- *Perplexity calculated using manual calculation method*
-
- ## Disclaimer
- This model is for research purposes only. It may inherit limitations and biases from the original model and the quantization process. Please use responsibly and refer to the original model card for more details.
-
- ## Contact
- For any questions or support, please visit [ConfidentialMind.com](https://www.confidentialmind.com) or contact us directly.
-
- ## License
- This model inherits the license from the original model. Please refer to the original model card for more details.
- Original model card: `HuggingFaceTB/SmolLM-135M`
-
- ## Author
- This model was quantized by [Jaro](https://www.linkedin.com/in/jaroai/)
-
- ## Acknowledgements
- Quantization performed using the GPTQModel pipeline.
-
- TODO: Add `gptqmodel.utils.eval` integration and auto-generation of eval table.
-
- ---
- *Generated and quantized using GPTQModel.*
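
The removed hunk ends the embedded quantization script, which reports perplexity through a `calculate_avg_ppl` helper that is not part of this diff. As a reference for the "manual calculation method" the removed README mentions, here is a minimal sketch of a sliding-window perplexity computation on the WikiText-2 test split; the window size, stride, and `datasets` usage are assumptions, not the script's actual implementation:

```python
# Hypothetical reconstruction of the script's `calculate_avg_ppl` helper;
# window/stride values and the dataset split are assumptions.
import torch
from datasets import load_dataset


def calculate_avg_ppl(model, tokenizer, seq_len: int = 1024, stride: int = 512) -> float:
    """Sliding-window perplexity over the WikiText-2 raw test split."""
    test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
    device = next(model.parameters()).device
    n_total = encodings.input_ids.size(1)

    nll_sum, token_count, prev_end = 0.0, 0, 0
    for begin in range(0, n_total, stride):
        end = min(begin + seq_len, n_total)
        trg_len = end - prev_end  # number of new tokens scored in this window
        input_ids = encodings.input_ids[:, begin:end].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100  # ignore the overlapping prefix

        with torch.no_grad():
            # `loss` is the mean NLL over the unmasked target tokens
            loss = model(input_ids, labels=target_ids).loss

        nll_sum += loss.item() * trg_len
        token_count += trg_len
        prev_end = end
        if end == n_total:
            break

    return float(torch.exp(torch.tensor(nll_sum / token_count)))
```

Each window's mean loss is re-weighted by the number of newly scored tokens, so the overlapping prefixes of consecutive windows are not double-counted in the final average.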
 
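The **N/A** perplexity in the removed "Quantization Performance" section comes from a Marlin kernel failure: with `group_size = 32`, act-order enabled, and SmolLM-135M's 576-wide projections (MKN = [7, 576, 576]), the kernel finds no valid thread configuration. If reproducing the evaluation, loading the checkpoint with a non-Marlin backend is one way around it; a sketch assuming the `BACKEND` enum of recent `gptqmodel` releases and a hypothetical local path:

```python
# Sketch: load the quantized checkpoint with a non-Marlin kernel so tiny
# models (e.g. SmolLM-135M's 576-wide projections) can still be evaluated.
# BACKEND availability and values are assumptions about recent gptqmodel releases.
import torch
from gptqmodel import BACKEND, GPTQModel

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = GPTQModel.load(
    "~/models/quantized/SmolLM-135M_gptq_g32_4bit",  # hypothetical local path
    device=device,
    backend=BACKEND.TORCH,  # plain PyTorch kernels instead of Marlin
)
```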