rodrigomasini committed
Commit ba18e04 · 1 Parent(s): d196de6

Delete app_v4.py

Files changed (1)
  1. app_v4.py +0 -101
app_v4.py DELETED
@@ -1,101 +0,0 @@
- import streamlit as st
- from transformers import AutoTokenizer
- from auto_gptq import AutoGPTQForCausalLM
- import torch
- import subprocess
- import traceback
-
- # Function to get memory info
- def get_gpu_memory():
-     try:
-         result = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"], text=True)
-         memory_info = [x.split(',') for x in result.strip().split('\n')]
-         memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
-     except FileNotFoundError:
-         memory_info = [{"free": "N/A", "total": "N/A"}]
-     return memory_info
-
- # Display GPU memory information before loading the model
- gpu_memory_before = get_gpu_memory()
- st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")
-
- # Define pretrained model directory
- pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
-
- # Check if CUDA is available and get the device
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- # Before allocating or loading the model, clear up memory if CUDA is available
- if device == "cuda:0":
-     torch.cuda.empty_cache()
-
- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
- tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set correctly for the model
-
- # Attempt to load the model, catch any OOM errors
- @st.cache_resource
- def load_gptq_model():
-     model = AutoGPTQForCausalLM.from_quantized(
-         pretrained_model_dir,
-         model_basename="Jackson2-4bit-128g-GPTQ",
-         use_safetensors=True,
-         device=device,
-         disable_exllamav2=True
-     )
-     model.eval()  # Set the model to inference mode
-     return model
-
- model_loaded = False
- # Attempt to load the model, catch any OOM errors
- try:
-     model = load_gptq_model()
-     model_loaded = True
- except RuntimeError as e:
-     if 'CUDA out of memory' in str(e):
-         st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
-         st.stop()
-     else:
-         raise e
-
- if model_loaded:
-     # Display GPU memory information after loading the model
-     gpu_memory_after = get_gpu_memory()
-     st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
-
-     col1, col2 = st.columns(2)
-     with col1:
-         user_input = st.text_input("Input a phrase")
-     with col2:
-         max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)
-
-     # Generate button
-     if st.button("Generate the prompt"):
-         try:
-             prompt_template = f'USER: {user_input}\nASSISTANT:'
-             inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
-             inputs = inputs.to(device)  # Move inputs to the same device as model
-             # Generate text using torch.inference_mode for better performance during inference
-             with torch.inference_mode():
-                 output = model.generate(**inputs, max_new_tokens=max_token)
-
-             # Cut the tokens at the input length to display only the generated text
-             output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
-             generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
-
-             st.markdown(f"**Generated Text:**\n{generated_text}")
-         except RuntimeError as e:
-             if 'CUDA out of memory' in str(e):
-                 st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
-                 # Log the detailed error message
-                 with open('error_log.txt', 'a') as f:
-                     f.write(traceback.format_exc())
-             else:
-                 # Log the error and re-raise it
-                 with open('error_log.txt', 'a') as f:
-                     f.write(traceback.format_exc())
-                 raise e
-
-         # Display GPU memory information after generation
-         gpu_memory_after_generation = get_gpu_memory()
-         st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")