rodrigomasini committed on
Commit 51717bc · 1 Parent(s): 6de173e

Update app_v4.py

Files changed (1)
  1. app_v4.py +15 -4
app_v4.py CHANGED
@@ -31,6 +31,7 @@ if device == "cuda:0":
 
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
+tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set correctly for the model
 
 # Attempt to load the model, catch any OOM errors
 @st.cache_resource
@@ -39,14 +40,17 @@ def load_gptq_model():
         pretrained_model_dir,
         model_basename="Jackson2-4bit-128g-GPTQ",
         use_safetensors=True,
-        device=device
+        device=device,
+        disable_exllamav2=True
     )
     model.eval()  # Set the model to inference mode
     return model
-
+
+model_loaded = False
 # Attempt to load the model, catch any OOM errors
 try:
     model = load_gptq_model()
+    model_loaded = True
 except RuntimeError as e:
     if 'CUDA out of memory' in str(e):
         st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
@@ -68,8 +72,15 @@ if model_loaded:
         prompt_template = f'USER: {user_input}\nASSISTANT:'
         inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
         inputs = inputs.to(device)  # Move inputs to the same device as model
-        output = model.generate(**inputs)
-        st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0])}")
+        # Generate text using torch.inference_mode for better performance during inference
+        with torch.inference_mode():
+            output = model.generate(**inputs, max_new_tokens=50)  # Adjust max_new_tokens if needed
+
+        # Cut the tokens at the input length to display only the generated text
+        output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
+        generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
+
+        st.markdown(f"**Generated Text:**\n{generated_text}")
     except RuntimeError as e:
         if 'CUDA out of memory' in str(e):
             st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")