import time

import streamlit as st
from transformers import AutoTokenizer, TextStreamer, pipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
model_basename = "Jackson2-gptq_model-4bit-128g"
use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)

# Load the 4-bit GPTQ weights onto the GPU.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=None,
)

user_input = st.text_input("Input a phrase")
prompt_template = f"USER: {user_input}\nASSISTANT:"

if st.button("Generate the prompt"):
    # Stream tokens to stdout as they are generated.
    streamer = TextStreamer(tokenizer)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.2,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    # Generate once and display the result (calling the pipeline twice would run inference twice).
    output = pipe(prompt_template)
    st.write(output[0]["generated_text"])
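Note that Streamlit re-executes the whole script on every interaction, so as written the 13B model is reloaded each time the button is clicked. A minimal sketch of one way to avoid that, assuming a recent Streamlit version that provides the st.cache_resource decorator and reusing the same model identifiers as above:

import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

@st.cache_resource
def load_model_and_tokenizer():
    # Cached across Streamlit reruns, so the GPTQ weights are loaded onto the GPU only once.
    tokenizer = AutoTokenizer.from_pretrained(
        "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ", use_fast=True, legacy=False
    )
    model = AutoGPTQForCausalLM.from_quantized(
        "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ",
        model_basename="Jackson2-gptq_model-4bit-128g",
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=False,
        quantize_config=None,
    )
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer()

The app is then started in the usual Streamlit way, e.g. streamlit run app.py (the filename here is only an assumption).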