metadata
library_name: peft
license: apache-2.0
pipeline_tag: text-generation
inference: false
tags:
- mistralai/Mistral-7B-Instruct-v0.1
- finetuned
pip install -q accelerate bitsandbytes trl datasets
pip install git+https://github.com/huggingface/transformers
pip install git+https://github.com/huggingface/peft.git
import os
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
pipeline,
logging,
)
from peft import LoraConfig, PeftModel
# Hub id of the base checkpoint that the LoRA adapter is applied to.
base_model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# ------------------------------------------------------------------------------
# bitsandbytes (4-bit quantization) settings
# ------------------------------------------------------------------------------
# Load the base model in 4-bit precision.
use_4bit = True
# Quantization type: "fp4" or "nf4".
bnb_4bit_quant_type = "nf4"
# Name of the torch dtype used for computation on the 4-bit model.
bnb_4bit_compute_dtype = "float16"
# Nested (double) quantization switch.
use_nested_quant = False

# The empty-string key places the entire model on GPU 0.
device_map = {"": 0}

# Resolve the dtype name to the torch dtype object (torch.float16 here).
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to the model loader further down.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
# Advisory check only: tell the user when the GPU could use bfloat16 instead of
# the configured float16 compute dtype. Nothing below depends on the result.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    # Guarded by is_available() so that a machine without CUDA skips the hint
    # instead of raising from get_device_capability().
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:  # major compute capability of the current device
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)
# Load the base checkpoint with the quantization settings defined above,
# requesting torch.float16 as the load dtype and placing it per device_map.
# NOTE(review): an earlier comment described this as an FP16 reload, but
# bnb_config asks for 4-bit loading -- confirm that merging the adapter into a
# quantized base model is intended.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Attach the LoRA adapter to the base model, then merge the adapter weights
# into it and keep the resulting unwrapped model.
model = PeftModel.from_pretrained(
    base_model, "Ashishkr/mistral-medical-consultation"
).merge_and_unload()
# Tokenizer of the base checkpoint. The end-of-sequence token is reused as the
# padding token, and padding is applied on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
# Generate a sampled reply from the merged model for one example user message.
prompt = "\ni have a neck pain since 2 days .\n"

# Generation settings are fixed when the pipeline is built: sampling enabled,
# temperature 0.9, max_length of 200.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.9,
    max_length=200,
)

# Wrap the message in the [INST] ... [/INST] markers before generating.
response = pipe("<s>[INST] " + prompt + " [/INST]")
print(response[0]["generated_text"])