SLM-RAG-Arena

Running on Zero

App Files Community

oliver-aizip committed on May 7

Commit

6c63a2d

1 Parent(s): a8243a3

vllm backend swap v1

Browse files

Files changed (2) hide show

requirements.txt +1 -0
utils/models.py +80 -62

requirements.txt CHANGED Viewed

@@ -6,3 +6,4 @@ numpy==1.26.4
 openai>=1.60.2
 torch>=2.5.1
 tqdm==4.67.1

 openai>=1.60.2
 torch>=2.5.1
 tqdm==4.67.1
+vllm>=0.8.5

utils/models.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
@@ -5,7 +8,7 @@ from .shared import generation_interrupt
 import threading
 import queue
 import time # Added for sleep
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
     "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
@@ -123,86 +126,101 @@ def run_inference(model_name, context, question, result_queue):
             if tokenizer.chat_template else False # Handle missing chat_template
         )
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        # Check interrupt before loading the model
-        if generation_interrupt.is_set():
-             result_queue.put("")
-             return
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
-        ).to(device)
-        model.eval() # Set model to evaluation mode
         text_input = format_rag_prompt(question, context, accepts_sys)
-        # Check interrupt before tokenization/template application
-        if generation_interrupt.is_set():
-             result_queue.put("")
-             return
-        actual_input = tokenizer.apply_chat_template(
-            text_input,
-            return_tensors="pt",
-            tokenize=True,
-            # Consider reducing max_length if context/question is very long
-            # max_length=tokenizer.model_max_length, # Use model's max length
-            # truncation=True, # Ensure truncation if needed
-            max_length=2048, # Keep original max_length for now
-            add_generation_prompt=True,
-        ).to(device)
-        # Ensure input does not exceed model max length after adding generation prompt
-        # This check might be redundant if tokenizer handles it, but good for safety
-        # if actual_input.shape[1] > tokenizer.model_max_length:
-        #    # Handle too long input - maybe truncate manually or raise error
-        #    print(f"Warning: Input length {actual_input.shape[1]} exceeds model max length {tokenizer.model_max_length}")
-        #    # Simple truncation (might lose important info):
-        #    # actual_input = actual_input[:, -tokenizer.model_max_length:]
-        input_length = actual_input.shape[1]
-        attention_mask = torch.ones_like(actual_input).to(device)
         # Check interrupt before generation
         if generation_interrupt.is_set():
             result_queue.put("")
             return
-        stopping_criteria = StoppingCriteriaList([InterruptCriteria(generation_interrupt)])
-        with torch.inference_mode():
-            outputs = model.generate(
-                actual_input,
-                attention_mask=attention_mask,
-                max_new_tokens=512,
-                pad_token_id=tokenizer.pad_token_id,
-                stopping_criteria=stopping_criteria,
-                do_sample=True, # Consider adding sampling parameters if needed
-                temperature=0.6,
-                top_p=0.9,
-            )
         # Check interrupt immediately after generation finishes or stops
-        if generation_interrupt.is_set():
-            result = "" # Discard potentially partial result if interrupted
-        else:
-            # Decode the generated tokens, excluding the input tokens
-            result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-        result_queue.put(result)
     except Exception as e:
         print(f"Error in inference thread for {model_name}: {e}")
         # Put error message in queue for the main thread to handle/display
-        result_queue.put(f"Error generating response: {str(e)[:100]}...")
     finally:
         # Clean up resources within the thread
         del model
         del tokenizer
-        del actual_input
         del outputs
         if torch.cuda.is_available():
             torch.cuda.empty_cache()

+import os
+os.environ['MKL_THREADING_LAYER'] = 'GNU'
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
 import threading
 import queue
 import time # Added for sleep
+from vllm import LLM, SamplingParams
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
     "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
             if tokenizer.chat_template else False # Handle missing chat_template
         )
+        # if tokenizer.pad_token is None:
+        #     tokenizer.pad_token = tokenizer.eos_token
+        # # Check interrupt before loading the model
+        # if generation_interrupt.is_set():
+        #      result_queue.put("")
+        #      return
+        # model = AutoModelForCausalLM.from_pretrained(
+        #     model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
+        # ).to(device)
+        # model.eval() # Set model to evaluation mode
         text_input = format_rag_prompt(question, context, accepts_sys)
+        # # Check interrupt before tokenization/template application
+        # if generation_interrupt.is_set():
+        #      result_queue.put("")
+        #      return
+        # actual_input = tokenizer.apply_chat_template(
+        #     text_input,
+        #     return_tensors="pt",
+        #     tokenize=True,
+        #     # Consider reducing max_length if context/question is very long
+        #     # max_length=tokenizer.model_max_length, # Use model's max length
+        #     # truncation=True, # Ensure truncation if needed
+        #     max_length=2048, # Keep original max_length for now
+        #     add_generation_prompt=True,
+        # ).to(device)
+        # # Ensure input does not exceed model max length after adding generation prompt
+        # # This check might be redundant if tokenizer handles it, but good for safety
+        # # if actual_input.shape[1] > tokenizer.model_max_length:
+        # #    # Handle too long input - maybe truncate manually or raise error
+        # #    print(f"Warning: Input length {actual_input.shape[1]} exceeds model max length {tokenizer.model_max_length}")
+        # #    # Simple truncation (might lose important info):
+        # #    # actual_input = actual_input[:, -tokenizer.model_max_length:]
+        # input_length = actual_input.shape[1]
+        # attention_mask = torch.ones_like(actual_input).to(device)
+        # # Check interrupt before generation
+        # if generation_interrupt.is_set():
+        #     result_queue.put("")
+        #     return
+        # stopping_criteria = StoppingCriteriaList([InterruptCriteria(generation_interrupt)])
+        # with torch.inference_mode():
+        #     outputs = model.generate(
+        #         actual_input,
+        #         attention_mask=attention_mask,
+        #         max_new_tokens=512,
+        #         pad_token_id=tokenizer.pad_token_id,
+        #         stopping_criteria=stopping_criteria,
+        #         do_sample=True, # Consider adding sampling parameters if needed
+        #         temperature=0.6,
+        #         top_p=0.9,
+        #     )
+        # # Check interrupt immediately after generation finishes or stops
+        # if generation_interrupt.is_set():
+        #     result = "" # Discard potentially partial result if interrupted
+        # else:
+        #     # Decode the generated tokens, excluding the input tokens
+        #     result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+        llm = LLM(model_name, dtype=torch.bfloat16, hf_token=True, enforce_eager=True)
+        params = SamplingParams(
+            max_tokens=512,
+            )
         # Check interrupt before generation
         if generation_interrupt.is_set():
             result_queue.put("")
             return
+        # Generate the response
+        outputs = llm.chat(
+            text_input,
+            sampling_params=params,
+            # stopping_criteria=StoppingCriteriaList([InterruptCriteria(generation_interrupt)]),
+        )
         # Check interrupt immediately after generation finishes or stops
+        result_queue.put(outputs[0].outputs[0].text)
     except Exception as e:
         print(f"Error in inference thread for {model_name}: {e}")
         # Put error message in queue for the main thread to handle/display
+        result_queue.put(f"Error generating response: {str(e)[:200]}...")
     finally:
         # Clean up resources within the thread
         del model
         del tokenizer
+        del text_input
         del outputs
         if torch.cuda.is_available():
             torch.cuda.empty_cache()