SLM-RAG-Arena

Running on Zero

App Files Files Community

SLM-RAG-Arena / utils /models.py

oliver-aizip

remove bitnet handling completely

c0fdd5a about 2 months ago

raw

history blame contribute delete

8.36 kB

	import os

	os.environ["MKL_THREADING_LAYER"] = "GNU"
	import spaces
	from peft import PeftModel
	import traceback

	import torch
	from transformers import (
	pipeline,
	AutoTokenizer,
	AutoModelForCausalLM,
	StoppingCriteria,
	)
	from .prompts import format_rag_prompt
	from .shared import generation_interrupt

	# Arena display name -> model identifier handed to the loaders in run_inference.
	# Insertion order is significant: model_names below exposes the keys in this order.
	models = {
	    # Qwen 2.5
	    "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
	    "Qwen2.5-3b-Instruct": "qwen/qwen2.5-3b-instruct",
	    # Llama 3.2
	    "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
	    "Llama-3.2-3b-Instruct": "meta-llama/llama-3.2-3b-instruct",
	    # Gemma
	    "Gemma-3-1b-it": "google/gemma-3-1b-it",
	    "Gemma-3-4b-it": "google/gemma-3-4b-it",
	    "Gemma-2-2b-it": "google/gemma-2-2b-it",
	    # Assorted small instruct models
	    "Phi-4-mini-instruct": "microsoft/phi-4-mini-instruct",
	    "Cogito-v1-preview-llama-3b": "deepcogito/cogito-v1-preview-llama-3b",
	    "IBM Granite-3.3-2b-instruct": "ibm-granite/granite-3.3-2b-instruct",
	    # Disabled entries, kept for reference only:
	    #   "Bitnet-b1.58-2B4T": "microsoft/bitnet-b1.58-2B-4T",
	    #   "MiniCPM3-RAG-LoRA": "openbmb/MiniCPM3-RAG-LoRA",
	    # Qwen 3
	    "Qwen3-0.6b": "qwen/qwen3-0.6b",
	    "Qwen3-1.7b": "qwen/qwen3-1.7b",
	    "Qwen3-4b": "qwen/qwen3-4b",
	    "SmolLM2-1.7b-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
	    "EXAONE-3.5-2.4B-instruct": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
	    "OLMo-2-1B-Instruct": "allenai/OLMo-2-0425-1B-Instruct",
	    "icecream-3b": "aizip-dev/icecream-3b",
	}

	# Tokenizers that have already been loaded, keyed by model identifier.
	tokenizer_cache = {}

	# Display names only, in declaration order.
	model_names = list(models)


	# Custom stopping criteria that checks the interrupt flag
	class InterruptCriteria(StoppingCriteria):
	    """Stopping criterion whose verdict is the state of an external event.

	    The wrapped object only needs to provide an ``is_set()`` method.
	    """

	    def __init__(self, interrupt_event):
	        # Hold on to the event itself so every call reads its current state.
	        self.interrupt_event = interrupt_event

	    def __call__(self, input_ids, scores, **kwargs):
	        """Return ``interrupt_event.is_set()``; ``input_ids`` and ``scores`` are ignored."""
	        stop_requested = self.interrupt_event.is_set()
	        return stop_requested


	@spaces.GPU
	def generate_summaries(example, model_a_name, model_b_name):
	    """
	    Produce one answer per model for ``example``, running model A and then model B.

	    The context is assembled from ``example["full_contexts"]`` (entries may be
	    strings or dicts carrying a "content" key); entries that do not already start
	    with "Document" get a "Document <n>:" header. The question is read from
	    ``example["question"]`` and defaults to an empty string.

	    Returns a ``(summary_a, summary_b)`` tuple. If the interrupt flag is set, the
	    summaries that have not been produced yet are returned as empty strings.
	    """
	    if generation_interrupt.is_set():
	        return "", ""

	    documents = example["full_contexts"] if "full_contexts" in example else None

	    if documents:
	        numbered_docs = []
	        for position, doc in enumerate(documents, start=1):
	            # Pull the text out of either a dict entry or a plain string;
	            # anything else contributes an empty body.
	            if isinstance(doc, dict) and "content" in doc:
	                body = doc["content"]
	            elif isinstance(doc, str):
	                body = doc
	            else:
	                body = ""

	            # Prefix a document number unless the text already carries one.
	            already_labelled = body.strip().startswith("Document")
	            if not already_labelled:
	                body = f"Document {position}:\n{body}"

	            numbered_docs.append(body)

	        context_text = "\n\n".join(numbered_docs)
	    else:
	        # Fall back to an empty context rather than raising.
	        print("Warning: No full context found in the example, using empty context")
	        context_text = ""

	    question = example.get("question", "")

	    if generation_interrupt.is_set():
	        return "", ""

	    # First model.
	    summary_a = run_inference(models[model_a_name], context_text, question)

	    if generation_interrupt.is_set():
	        return summary_a, ""

	    # Second model.
	    summary_b = run_inference(models[model_b_name], context_text, question)

	    return summary_a, summary_b


	def _load_tokenizer(model_name):
	    """Return the tokenizer for ``model_name``, loading and caching it on first use."""
	    if model_name in tokenizer_cache:
	        return tokenizer_cache[model_name]

	    # icecream is run as an adapter on top of Llama-3.2-3B-Instruct (see
	    # run_inference), so it uses that model's tokenizer.
	    tokenizer_source = model_name
	    if "icecream" in model_name.lower():
	        tokenizer_source = "meta-llama/llama-3.2-3b-instruct"

	    tokenizer = AutoTokenizer.from_pretrained(
	        tokenizer_source, padding_side="left", token=True
	    )
	    tokenizer_cache[model_name] = tokenizer
	    return tokenizer


	@spaces.GPU
	def run_inference(model_name, context, question):
	    """
	    Run inference using the specified model.

	    Args:
	        model_name: Model identifier to load (one of the values of ``models``).
	        context: Context text handed to ``format_rag_prompt``.
	        question: Question handed to ``format_rag_prompt``.

	    Returns:
	        The generated text; an empty string if the interrupt flag is set before
	        generation starts; or an "Error generating response: ..." message if
	        loading or generation raised an exception.
	    """
	    # Check interrupt at the beginning
	    if generation_interrupt.is_set():
	        return ""

	    result = ""
	    max_new_tokens = 512
	    # Bound up front so the cleanup in ``finally`` can always release them.
	    pipe = model = base_model = None

	    chat_template_kwargs = {"add_generation_prompt": True}
	    if "qwen3" in model_name.lower():
	        # Make sure Qwen3 does not use thinking mode.
	        print(
	            f"Recognized {model_name} as a Qwen3 model. Setting enable_thinking=False."
	        )
	        chat_template_kwargs["enable_thinking"] = False

	    try:
	        tokenizer = _load_tokenizer(model_name)

	        # A missing chat template is treated as "no system role support".
	        template = tokenizer.chat_template
	        accepts_sys = bool(template) and "System role not supported" not in template

	        if tokenizer.pad_token is None:
	            tokenizer.pad_token = tokenizer.eos_token

	        # Check interrupt before loading the model
	        if generation_interrupt.is_set():
	            return ""

	        print("REACHED HERE BEFORE pipe")
	        print(f"Loading model {model_name}...")
	        if "icecream" not in model_name.lower():
	            pipe = pipeline(
	                "text-generation",
	                model=model_name,
	                tokenizer=tokenizer,
	                device_map="cuda",
	                trust_remote_code=True,
	                torch_dtype=torch.bfloat16,
	                model_kwargs={
	                    "attn_implementation": "eager",
	                },
	            )
	        else:
	            base_model = AutoModelForCausalLM.from_pretrained(
	                "meta-llama/llama-3.2-3b-instruct",
	                device_map="cuda",
	                torch_dtype=torch.bfloat16,
	                trust_remote_code=True,
	            )
	            model = PeftModel.from_pretrained(
	                base_model,
	                "aizip-dev/icecream-3b",
	                device_map="cuda",
	                torch_dtype=torch.bfloat16,
	            )

	        text_input = format_rag_prompt(question, context, accepts_sys)

	        if "Gemma-3".lower() in model_name.lower():
	            # Check interrupt before generation
	            if generation_interrupt.is_set():
	                return ""

	            print("REACHED HERE BEFORE GEN")
	            # NOTE(review): `generation_kwargs` is forwarded to the pipeline call
	            # unchanged from the original code; confirm the pipeline consumes it.
	            result = pipe(
	                text_input,
	                max_new_tokens=max_new_tokens,
	                generation_kwargs={"skip_special_tokens": True},
	            )[0]["generated_text"]

	            # The prompt is passed un-templated here; the reply is read from the
	            # "content" of the last element of the returned "generated_text".
	            result = result[-1]["content"]
	        elif "icecream" in model_name.lower():
	            print("ICECREAM")
	            model_inputs = tokenizer.apply_chat_template(
	                text_input,
	                tokenize=True,
	                return_tensors="pt",
	                return_dict=True,
	                **chat_template_kwargs,
	            )

	            model_inputs = model_inputs.to(model.device)
	            prompt_tokens_length = model_inputs.input_ids.shape[1]

	            with torch.inference_mode():
	                # Check interrupt before generation
	                if generation_interrupt.is_set():
	                    return ""

	                output_sequences = model.generate(
	                    **model_inputs,
	                    max_new_tokens=max_new_tokens,
	                )

	            # Keep only the tokens produced after the prompt.
	            generated_token_ids = output_sequences[0][prompt_tokens_length:]
	            result = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
	        else:  # For other models
	            formatted = pipe.tokenizer.apply_chat_template(
	                text_input,
	                tokenize=False,
	                **chat_template_kwargs,
	            )
	            input_length = len(formatted)

	            # Check interrupt before generation
	            if generation_interrupt.is_set():
	                return ""

	            outputs = pipe(
	                formatted,
	                max_new_tokens=max_new_tokens,
	                generation_kwargs={"skip_special_tokens": True},
	            )
	            # Drop the first len(formatted) characters, i.e. the prompt when the
	            # pipeline echoes it back ahead of the completion.
	            result = outputs[0]["generated_text"][input_length:]

	    except Exception as e:
	        print(f"Error in inference for {model_name}: {e}")
	        print(traceback.format_exc())
	        result = f"Error generating response: {str(e)[:200]}..."

	    finally:
	        # Clean up resources. Drop the model references first: memory that is
	        # still in use by live tensors cannot be released by empty_cache().
	        pipe = model = base_model = None
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	    return result