Spaces:

LemiSt
/

SmolLM-135M-instruct-de

Sleeping

App Files Files Community

SmolLM-135M-instruct-de / app.py

LenDigLearn

updated default sampling values

e65a3d7 9 months ago

raw

history blame contribute delete

3.5 kB

	import queue
	import gradio as gr
	import torch
	import threading
	from transformers import AutoTokenizer, AutoModelForCausalLM

	"""
	For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
	"""
	# Repository id used for both the tokenizer and the model weights below.
	checkpoint = "LemiSt/SmolLM-135M-instruct-de-merged"

	# Both objects are created once at import time and shared by every request.
	tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	model = AutoModelForCausalLM.from_pretrained(
	    checkpoint,
	    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
	)


	class CustomIterable:
	    """Blocking, single-use iterator fed by a producer through a queue.

	    A producer calls :meth:`put` for every item and :meth:`end` once it is
	    finished; a consumer iterates over the instance and blocks until the
	    next item (or the end marker) arrives.

	    The very first item handed to :meth:`put` is discarded -- presumably
	    the prompt tokens that ``generate`` pushes to its streamer before any
	    newly generated token (TODO confirm against the streamer contract).
	    ``None`` is reserved as the end marker and therefore cannot be
	    streamed as a regular item.
	    """

	    def __init__(self):
	        self._queue = queue.Queue()  # Thread-safe hand-off between producer and consumer
	        self.first = True  # True until the first put() call has been swallowed
	        self._exhausted = False  # Set once the end marker has been consumed

	    def put(self, item):
	        """Queue *item* for the consumer, discarding the very first call."""
	        if self.first:
	            self.first = False
	        else:
	            self._queue.put(item)

	    def end(self):
	        """Signal that no more elements will be added."""
	        self._queue.put(None)  # Sentinel value marking the end of the stream

	    def __iter__(self):
	        """Return the iterator (the instance itself)."""
	        return self

	    def __next__(self):
	        """Return the next queued element, blocking until one is available.

	        Raises:
	            StopIteration: when the end marker is read, and on every call
	                made after that.
	        """
	        # Without this guard, asking a finished stream for another item
	        # would wait forever on an empty queue instead of reporting
	        # exhaustion as the iterator protocol requires.
	        if self._exhausted:
	            raise StopIteration

	        # A blocking get() without a timeout never raises queue.Empty, so
	        # no handler is needed around it.
	        item = self._queue.get()

	        if item is None:  # Sentinel value to end the iteration
	            self._exhausted = True
	            raise StopIteration

	        return item

	def respond(
	    message,
	    history: list[tuple[str, str]],
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	    top_k,
	    repetition_penalty
	):
	    """Stream the model's reply to *message* as a growing string.

	    Args:
	        message: The new user message.
	        history: Earlier turns; index 0 of each entry is read as the user
	            text and index 1 as the assistant text. Empty entries are
	            skipped.
	        system_message: Content of the leading system message.
	        max_tokens: Passed to ``generate`` as ``max_new_tokens``.
	        temperature: Sampling temperature passed to ``generate``.
	        top_p: ``top_p`` value passed to ``generate``.
	        top_k: ``top_k`` value passed to ``generate``.
	        repetition_penalty: ``repetition_penalty`` passed to ``generate``.

	    Yields:
	        The reply decoded so far, once per item received from the streamer.

	    Raises:
	        Exception: Whatever ``model.generate`` raised in the worker thread
	            is re-raised here after the thread has finished.
	    """
	    messages = [{"role": "system", "content": system_message}]

	    for val in history:
	        if val[0]:
	            messages.append({"role": "user", "content": val[0]})
	        if val[1]:
	            messages.append({"role": "assistant", "content": val[1]})

	    messages.append({"role": "user", "content": message})

	    streamer = CustomIterable()

	    inputs = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt", add_generation_prompt=True)
	    generation_kwargs = {
	        "max_new_tokens": max_tokens,
	        "do_sample": True,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": top_k,
	        "repetition_penalty": repetition_penalty,
	        "streamer": streamer,
	    }
	    failures = []

	    def _generate():
	        # Runs in the worker thread. If generation fails, the stream is
	        # closed explicitly so the consumer loop below cannot block
	        # forever, and the error is kept for re-raising in the caller.
	        try:
	            model.generate(inputs, **generation_kwargs)
	        except Exception as exc:
	            failures.append(exc)
	            streamer.end()

	    thread = threading.Thread(target=_generate)
	    thread.start()

	    # Decode the accumulated ids on every step rather than each step on its
	    # own: a character whose bytes are spread over several tokens cannot be
	    # decoded correctly from any single one of them.
	    generated_ids = []
	    for token in streamer:
	        # assumes each streamed item is a tensor (or sequence) of token ids
	        # -- TODO confirm against the streamer contract of generate()
	        generated_ids.extend(torch.as_tensor(token).flatten().tolist())
	        yield tokenizer.decode(generated_ids, skip_special_tokens=True)

	    thread.join()

	    if failures:
	        raise failures[0]

	"""
	For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
	"""
	# Extra controls of the chat UI, listed in the same order as the parameters
	# that `respond` declares after `message` and `history`.
	_additional_inputs = [
	    gr.Textbox(label="System message", value="Du bist ein hilfreicher Assistent."),
	    gr.Slider(label="Max new tokens", minimum=1, maximum=1024, step=1, value=512),
	    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.4),
	    gr.Slider(label="Top-p (nucleus sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.9),
	    gr.Slider(label="Top-k", minimum=16, maximum=1024, step=1, value=512),
	    gr.Slider(label="Repetition penalty", minimum=0.1, maximum=2.0, step=0.05, value=1.1),
	]

	# Chat front end that calls `respond` with the controls defined above.
	demo = gr.ChatInterface(respond, additional_inputs=_additional_inputs)


	if __name__ == "__main__":
	    # Launch the app only when the file is executed as a script.
	    demo.launch()