# fingpt / app.py
import gradio as gr
# import torch
# from transformers import (
#     AutoModel,
#     AutoTokenizer,
#     BitsAndBytesConfig,
#     pipeline,
# )
# from peft import (
#     TaskType,
#     LoraConfig,
#     PeftModel,
#     get_peft_model,
#     set_peft_model_state_dict,
#     prepare_model_for_kbit_training,
# )
# model_dir = "finetuned_model/checkpoint-50"
# tokenizer = AutoTokenizer.from_pretrained(model_dir)  # superseded by the base-model tokenizer loaded below
#
# # load our finetuned model: a LoRA adapter on top of the quantized base model
# base_model = "THUDM/chatglm2-6b"
# peft_model = "./finetuned_model_bak"
# # Quantization
# q_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16,
# )
# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# model = AutoModel.from_pretrained(base_model, trust_remote_code=True, quantization_config=q_config, device_map="cuda")
# model = PeftModel.from_pretrained(model, peft_model)  # attach the LoRA adapter weights
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     device_map="auto",
# )
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     # Combine the chat history with the new message.
#     # To work well with the model, the conversation usually has to be converted into a
#     # specific chat-template format. Different models use different templates (LLaMA 2 and
#     # Zephyr are common examples); adjust this to the template your fine-tuned model expects.
#     formatted_messages = [{"role": "system", "content": system_message}]
#     for user_msg, bot_msg in history:
#         formatted_messages.append({"role": "user", "content": user_msg})
#         formatted_messages.append({"role": "assistant", "content": bot_msg})
#     formatted_messages.append({"role": "user", "content": message})
#     # Prepare the input with tokenizer.apply_chat_template.
#     # add_generation_prompt=True tells the tokenizer to append the special token that cues
#     # the model to generate a response.
#     prompt = tokenizer.apply_chat_template(
#         formatted_messages,
#         tokenize=False,
#         add_generation_prompt=True,
#     )
#     # Call the pipeline to generate a completion.
#     # Note: the parameter names here do not match the Gradio inputs exactly and have to be mapped.
#     outputs = pipe(
#         prompt,
#         max_new_tokens=max_tokens,  # the pipeline expects max_new_tokens
#         do_sample=True,             # enable sampling
#         temperature=temperature,
#         top_p=top_p,
#         eos_token_id=tokenizer.eos_token_id,  # stop generating when the eos token is produced
#         pad_token_id=tokenizer.pad_token_id,
#     )
#     # Parse the pipeline output: outputs is a list of dicts containing the generated text.
#     generated_text = outputs[0]["generated_text"]
#
#     # Find where the original prompt ends so only the model-generated part is returned.
#     response_start = generated_text.find(prompt) + len(prompt)
#     response = generated_text[response_start:].strip()
#
#     # Yield the response. This is a simplified, non-streaming version; true streaming
#     # requires more involved changes, e.g. running model.generate in a background thread
#     # with a transformers TextIteratorStreamer (see the sketch below).
#     yield response
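
# A minimal streaming sketch, assuming the commented-out tokenizer/model above are loaded:
# transformers' TextIteratorStreamer yields text as it is generated, so the Gradio callback
# can yield partial responses. The helper name respond_stream is hypothetical, not part of
# the original app.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
#     formatted_messages = [{"role": "system", "content": system_message}]
#     for user_msg, bot_msg in history:
#         formatted_messages.append({"role": "user", "content": user_msg})
#         formatted_messages.append({"role": "assistant", "content": bot_msg})
#     formatted_messages.append({"role": "user", "content": message})
#     prompt = tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
#
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs,
#         streamer=streamer,
#         max_new_tokens=max_tokens,
#         do_sample=True,
#         temperature=temperature,
#         top_p=top_p,
#     )
#     # run generation in a background thread so the streamer can be consumed here
#     Thread(target=model.generate, kwargs=generation_kwargs).start()
#
#     partial = ""
#     for new_text in streamer:
#         partial += new_text
#         yield partial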
# Placeholder bot while the fine-tuned pipeline above is disabled: simply echo the message back.
def echo(message, history):
    return message
demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="Echo Bot")
demo.launch()
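
# If the fine-tuned pipeline above is re-enabled, the interface could be pointed at respond()
# instead of echo, exposing its extra arguments via ChatInterface's additional_inputs.
# This is a commented-out sketch, not the current app; the labels and default values below
# are assumptions.
#
# demo = gr.ChatInterface(
#     fn=respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a helpful financial assistant.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
#     ],
#     title="FinGPT Chat",
# )
# demo.launch()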