rodrigomasini committed on
Commit 51717bc · 1 Parent(s): 6de173e

Update app_v4.py

Files changed (1)
  1. app_v4.py +15 -4
app_v4.py CHANGED
@@ -31,6 +31,7 @@ if device == "cuda:0":
 
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
+tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set correctly for the model
 
 # Attempt to load the model, catch any OOM errors
 @st.cache_resource
@@ -39,14 +40,17 @@ def load_gptq_model():
         pretrained_model_dir,
         model_basename="Jackson2-4bit-128g-GPTQ",
         use_safetensors=True,
-        device=device
+        device=device,
+        disable_exllamav2=True
     )
     model.eval()  # Set the model to inference mode
     return model
-
+
+model_loaded = False
 # Attempt to load the model, catch any OOM errors
 try:
     model = load_gptq_model()
+    model_loaded = True
 except RuntimeError as e:
     if 'CUDA out of memory' in str(e):
         st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
@@ -68,8 +72,15 @@ if model_loaded:
         prompt_template = f'USER: {user_input}\nASSISTANT:'
         inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
         inputs = inputs.to(device)  # Move inputs to the same device as model
-        output = model.generate(**inputs)
-        st.markdown(f"**Generated Text:**\n{tokenizer.decode(output[0])}")
+        # Generate text using torch.inference_mode for better performance during inference
+        with torch.inference_mode():
+            output = model.generate(**inputs, max_new_tokens=50)  # Adjust max_new_tokens if needed
+
+        # Cut the tokens at the input length to display only the generated text
+        output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
+        generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
+
+        st.markdown(f"**Generated Text:**\n{generated_text}")
     except RuntimeError as e:
         if 'CUDA out of memory' in str(e):
             st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")