import streamlit as st
from transformers import AutoTokenizer, TextStreamer, pipeline
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import snapshot_download
import os
import torch
import subprocess
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'  # => only makes sense with more than one GPU, since it tries to split allocations
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # => only relevant when more than one GPU is present
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # => example list of visible device indices
# Define pretrained and quantized model directories
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
#cwd = os.getcwd()
#quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"
# Check if the model directory is empty (i.e., model not downloaded yet)
#if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
# Create the cache directory if it doesn't exist
# os.makedirs(quantized_model_dir, exist_ok=True)
# snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)
#st.write(f'{os.listdir(quantized_model_dir)}')
#model_name_or_path = quantized_model_dir
model_basename = "Jackson2-4bit-128g-GPTQ"
# Before allocating or loading the model, clear up memory
#gc.collect()
#torch.cuda.empty_cache()
use_triton = False
if torch.cuda.is_available():
    torch.cuda.empty_cache()
#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
device = "cuda:0" if torch.cuda.is_available() else "cpu" # best configuration besides the auto option
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    model_basename=model_basename,
    use_safetensors=True,
    device=device,
    max_memory={0: "10GiB"}
)
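# Optional sanity check (a sketch, assuming the AutoGPTQ wrapper exposes the underlying
# transformers model as `.model`, as auto_gptq's AutoGPTQForCausalLM does): confirm the
# quantized weights actually landed on the selected device.
#st.write(f"Model loaded on: {next(model.model.parameters()).device}")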
viz = torch.cuda.memory_summary()
st.write(viz)
def run():
    """Return GPU status from nvidia-smi, or a directory listing if no GPU driver is found."""
    output: str = ""
    try:
        output = subprocess.check_output(["nvidia-smi"], text=True)
    except FileNotFoundError:
        output = subprocess.check_output(["ls", "-alh"], text=True)
    return f"#{output}"
st.write(run())
user_input = st.text_input("Input a phrase")
prompt_template = f'USER: {user_input}\nASSISTANT:'
if st.button("Generate the prompt"):
    inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True)
    #inputs = tokenizer(prompt_template, return_tensors='pt')
    #streamer = TextStreamer(tokenizer)
    #pipe = pipeline(
    #    "text-generation",
    #    model=model,
    #    tokenizer=tokenizer,
    #    streamer=streamer,
    #    max_new_tokens=512,
    #    temperature=0.2,
    #    top_p=0.95,
    #    repetition_penalty=1.15
    #)
    output = model.generate(**inputs.to(device), max_new_tokens=512)
    st.markdown(tokenizer.decode(output[0], skip_special_tokens=True))
    #st.write(output[0]['generated_text'])
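# To run this app locally (assuming the file is saved as app.py):
#   streamlit run app.py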