import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch
import subprocess
import traceback
# Query free/total GPU memory via nvidia-smi; falls back to "N/A" when it is unavailable
def get_gpu_memory():
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"],
            text=True,
        )
        memory_info = [x.split(',') for x in result.strip().split('\n')]
        memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
    except (FileNotFoundError, subprocess.CalledProcessError):
        # nvidia-smi missing or failing (e.g. no GPU driver) -- report placeholders instead of crashing
        memory_info = [{"free": "N/A", "total": "N/A"}]
    return memory_info
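
# Illustrative example of the return value with one GPU attached (values are
# in MiB, as reported by nvidia-smi's "nounits" CSV format), e.g.:
#   [{"free": 14500, "total": 16128}]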
# Display GPU memory information before loading the model
gpu_memory_before = get_gpu_memory()
st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")
# Define pretrained model directory
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
# Check if CUDA is available and get the device
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Before allocating or loading the model, clear up memory if CUDA is available
if device == "cuda:0":
    torch.cuda.empty_cache()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token # Ensure padding token is set correctly for the model
# Load the quantized model once and cache it across Streamlit reruns
@st.cache_resource
def load_gptq_model():
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir,
        model_basename="Jackson2-4bit-128g-GPTQ",
        use_safetensors=True,
        device=device,
        disable_exllamav2=True,
    )
    model.eval()  # Set the model to inference mode
    return model
model_loaded = False
# Attempt to load the model, catch any OOM errors
try:
    model = load_gptq_model()
    model_loaded = True
except RuntimeError as e:
    if 'CUDA out of memory' in str(e):
        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
        st.stop()
    else:
        raise
if model_loaded:
    # Display GPU memory information after loading the model
    gpu_memory_after = get_gpu_memory()
    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")

    col1, col2 = st.columns(2)
    with col1:
        user_input = st.text_input("Input a phrase")
    with col2:
        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)

    # Generate button
    if st.button("Generate the prompt"):
        try:
            prompt_template = f'USER: {user_input}\nASSISTANT:'
            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
            inputs = inputs.to(device)  # Move inputs to the same device as the model

            # Generate text using torch.inference_mode for better performance during inference
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=max_token)

            # Cut the tokens at the input length to display only the generated text
            output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
            generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
            st.markdown(f"**Generated Text:**\n{generated_text}")
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e):
                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
                # Log the detailed error message
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
            else:
                # Log the error and re-raise it
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
                raise

        # Display GPU memory information after generation
        gpu_memory_after_generation = get_gpu_memory()
        st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")
# Local imports for this variant (pipeline and BaseQuantizeConfig are not
# imported at the top of the file)
from transformers import pipeline
from auto_gptq import BaseQuantizeConfig

tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)

quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False
)

model = AutoGPTQForCausalLM.from_quantized(
    local_folder,
    use_safetensors=True,
    strict=use_strict,
    model_basename=model_basename,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=quantize_config
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15
)

user_input = st.text_input("Input a phrase")
prompt_template = f'''USER: {user_input}
ASSISTANT:'''

# Generate output when the "Generate" button is pressed
if st.button("Generate the prompt"):
    output = pipe(prompt_template)[0]['generated_text']
    st.text_area("Prompt", value=output)