Spaces:

Akjava
/

chat-phi-4-deepseek-R1K-RL-EZO

Runtime error

App Files Files Community

chat-phi-4-deepseek-R1K-RL-EZO / app.py

Akjava

Update app.py

0fb257c verified 10 months ago

raw

history blame

2.97 kB

	import spaces
	import os
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
	from transformers import TextIteratorStreamer
	from threading import Thread

	import gradio as gr

	# ---------------------------------------------------------------------
	# Runtime configuration
	# ---------------------------------------------------------------------

	# Text-generation pipeline; stays None unless the model is loaded eagerly
	# at import time (the ``not is_hugging_face`` path further down the file).
	text_generator = None

	# When True, generate_text() loads the model on every request instead of
	# once at import time (the original author's note: needed for ZeroGPU).
	is_hugging_face = True

	# Checkpoint to serve. "AXCXEPT/phi-4-deepseek-R1K-RL-EZO" used to be
	# assigned first and was immediately overridden by this value.
	model_id = "AXCXEPT/phi-4-open-R1-Distill-EZOv1"

	# Access token for the Hub. Previously this value was read and then
	# unconditionally replaced by None, so a configured secret was never used.
	# An unset or empty variable still yields None, exactly as before.
	huggingface_token = os.getenv("HUGGINGFACE_TOKEN") or None

	# Target device. "auto" used to be assigned first and was overridden.
	device = "cuda"

	# Weight dtype. torch.bfloat16 used to be assigned first and was overridden.
	dtype = torch.float16

	if not huggingface_token:
	    # Deliberately non-fatal: the app only logs a hint instead of raising
	    # ValueError("HUGGINGFACE_TOKEN environment variable is not set").
	    print("no HUGGINGFACE_TOKEN if you need set secret ")








	# The tokenizer is loaded once at import time and shared by every request.
	tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

	print(model_id, device, dtype)
	histories = []  # not referenced anywhere in the visible part of this file

	if not is_hugging_face:
	    # Eager path: load the model and build a pipeline once, at import time.
	    model = AutoModelForCausalLM.from_pretrained(
	        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
	    )
	    # pipeline has not to(device), so placement is passed via device_map.
	    # NOTE(review): ``stream=True`` is kept from the original call; confirm
	    # that the text-generation pipeline actually accepts this keyword.
	    text_generator = pipeline(
	        "text-generation",
	        model=model,
	        tokenizer=tokenizer,
	        torch_dtype=dtype,
	        device_map=device,
	        stream=True,
	    )

	    if next(model.parameters()).is_cuda:
	        print("The model is on a GPU")
	    else:
	        print("The model is on a CPU")

	    # A CUDA device usually stringifies with an index ("cuda:0"), so an
	    # exact comparison against "cuda" misreports a GPU pipeline as CPU.
	    if str(text_generator.device).strip().startswith("cuda"):
	        print("The pipeline is using a GPU")
	    else:
	        print("The pipeline is using a CPU")

	    # NOTE(review): the original indentation was lost; this message is
	    # assumed to belong to the eager-initialisation branch - confirm.
	    print("initialized")


	def generate_text(messages, max_new_tokens=200):
	    """Stream the model's reply to a chat transcript.

	    Args:
	        messages: Chat history as a list of ``{"role": ..., "content": ...}``
	            dicts; it is rendered with the tokenizer's chat template.
	        max_new_tokens: Upper bound on generated tokens (default 200, the
	            value that used to be hard-coded).

	    Yields:
	        The reply accumulated so far, growing with every streamed chunk.

	    Raises:
	        Exception: Whatever ``generate`` raised in the worker thread is
	            re-raised here once the stream has ended.
	    """
	    if is_hugging_face:
	        # need everytime initialize for ZeroGPU
	        lm = AutoModelForCausalLM.from_pretrained(
	            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
	        )
	    else:
	        # Use the model loaded at import time. The original assigned to a
	        # local named ``model`` in the branch above, which shadowed this
	        # global and raised UnboundLocalError on this path.
	        lm = model

	    prompt = tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )
	    # Follow the model's actual device instead of the configured string so
	    # that a device_map such as "auto" keeps working.
	    inputs = tokenizer(prompt, return_tensors="pt").to(lm.device)

	    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
	    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
	    failures = []

	    def _run_generation():
	        # Runs in the worker thread; records a failure and closes the stream
	        # so the consuming loop below cannot block forever.
	        try:
	            lm.generate(**generation_kwargs)
	        except Exception as exc:
	            failures.append(exc)
	            streamer.end()

	    worker = Thread(target=_run_generation)
	    worker.start()

	    generated_output = ""
	    for new_text in streamer:
	        generated_output += new_text
	        yield generated_output

	    worker.join()
	    if failures:
	        # Surface the worker's error in the caller's thread so that the
	        # caller's own exception handling can see it.
	        raise failures[0]


	# Function attribute set by the original author; nothing in the visible file
	# reads it - presumably a marker for ZeroGPU tooling (TODO confirm).
	generate_text.zerogpu = True



	@spaces.GPU(duration=60)
	def call_generate_text(message, history):
	    """Chat callback: stream the reply to ``message`` given ``history``.

	    The new user turn is appended to a copy of ``history`` (the argument
	    itself is not mutated) and the partial replies produced by
	    ``generate_text`` are forwarded one by one. If a ``RuntimeError``
	    escapes, it is logged and an empty string is yielded instead.
	    """
	    user_turn = {"role": "user", "content": message}
	    conversation = history + [user_turn]
	    try:
	        for partial_reply in generate_text(conversation):
	            yield partial_reply
	    except RuntimeError as err:
	        print(f"An unexpected error occurred: {err}")
	        yield ""

	# Gradio chat front end wired to the streaming callback; it is configured
	# with type="messages" and only launched when the file is run as a script.
	demo = gr.ChatInterface(fn=call_generate_text, type="messages")

	if __name__ == "__main__":
	    demo.launch()