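"""Streamlit demo for the GPTQ-quantized "Jackson The Formalizer V2 13B" model.

Loads the 4-bit model with auto-gptq, reports free/total GPU memory (via nvidia-smi)
before loading, after loading, and after generation, and generates a completion for a
user-supplied prompt. Run with `streamlit run <this file>`.
"""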
import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch
import subprocess
import traceback
# Query free/total GPU memory via nvidia-smi
def get_gpu_memory():
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"],
            text=True,
        )
        memory_info = [x.split(',') for x in result.strip().split('\n')]
        memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
    except FileNotFoundError:
        memory_info = [{"free": "N/A", "total": "N/A"}]
    return memory_info
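# Example return value (one dict per visible GPU, values in MiB): [{"free": 14800, "total": 16384}]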
# Display GPU memory information before loading the model
gpu_memory_before = get_gpu_memory()
st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")
# Define pretrained model directory
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
# Check if CUDA is available and get the device
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Before allocating or loading the model, clear up memory if CUDA is available
if device == "cuda:0":
    torch.cuda.empty_cache()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token # Ensure padding token is set correctly for the model
# Cache the loader so the quantized weights are only loaded once per session
@st.cache_resource
def load_gptq_model():
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir,
        model_basename="Jackson2-4bit-128g-GPTQ",
        use_safetensors=True,
        device=device,
        disable_exllamav2=True,
    )
    model.eval()  # Set the model to inference mode
    return model
model_loaded = False  # Set to True only if the model loads without errors
# Attempt to load the model, catch any OOM errors
try:
    model = load_gptq_model()
    model_loaded = True
except RuntimeError as e:
    if 'CUDA out of memory' in str(e):
        st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
        st.stop()
    else:
        raise e
if model_loaded:
    # Display GPU memory information after loading the model
    gpu_memory_after = get_gpu_memory()
    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")

    col1, col2 = st.columns(2)
    with col1:
        user_input = st.text_input("Input a phrase")
    with col2:
        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)

    # Generate button
    if st.button("Generate the prompt"):
        try:
            # Prompt format used for this instruction-tuned model
            prompt_template = f'USER: {user_input}\nASSISTANT:'
            # No padding is needed for a single prompt; padding to max_length would make the
            # model generate after pad/eos tokens and waste compute
            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True)
            inputs = inputs.to(device)  # Move inputs to the same device as the model

            # Generate text using torch.inference_mode for better performance during inference
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=max_token)

            # Cut the tokens at the input length to display only the generated text
            output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
            generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
            st.markdown(f"**Generated Text:**\n\n{generated_text}")
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e):
                st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
                # Log the detailed error message
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
            else:
                # Log the error and re-raise it
                with open('error_log.txt', 'a') as f:
                    f.write(traceback.format_exc())
                raise e
    # Display GPU memory information after generation
    gpu_memory_after_generation = get_gpu_memory()
    st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")