How to enable the low-memory version?

#3
by aarunsoman - opened

# Snippet as posted in the question; the author reports that it does not work.
model = transformers.AutoModelForCausalLM.from_pretrained(
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3',
trust_remote_code=True,
# NOTE(review): the reply in this thread enables 8-bit loading with
# `load_in_8bit=True` inside a BitsAndBytesConfig, not `load_8bit` --
# presumably this keyword is the cause of the failure; confirm.
load_8bit=True,
# NOTE(review): `bfloat16` is unqualified and no import is shown in this
# snippet; the reply uses `torch.float16` -- confirm the intended dtype.
torch_dtype=bfloat16,
)
doesn't work

H2O.ai org
import torch
from h2oai_pipeline import H2OTextGenerationPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Example script: load the model (optionally 8-bit quantized) and generate
# an answer to a single prompt.

# Hub id shared by the tokenizer and the model, defined once so the two
# `from_pretrained` calls cannot drift apart.
MODEL_NAME = "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3"

# Optional quantization: set to False to skip 8-bit loading, in which case
# `quantization_config` is None and the model loads with `torch_dtype` only.
USE_8BIT = True

quantization_config = (
    BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_threshold=6.0,
    )
    if USE_8BIT
    else None
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
    padding_side="left",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda:0"},
    quantization_config=quantization_config,
).eval()
generate_text = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer)

res = generate_text(
    "Why is drinking water so healthy?",
    min_new_tokens=2,
    max_new_tokens=1024,
    do_sample=False,
    num_beams=1,
    # NOTE(review): with do_sample=False, temperature presumably has no
    # effect on the output; kept so the call matches the original -- confirm
    # against the transformers generation docs before removing.
    temperature=0.3,
    repetition_penalty=1.2,
    renormalize_logits=True,
)
print(res[0]["generated_text"])

Something like this should work. However, this model was trained in float16, so quantization will likely lead to degraded results.

ilu000 changed discussion status to closed

Sign up or log in to comment