import gradio as gr
# import torch
# from transformers import (
#     AutoModel,
#     AutoTokenizer,
#     BitsAndBytesConfig,
#     pipeline,
# )
# from peft import PeftModel
# # Load our fine-tuned model: the ChatGLM2 base weights plus the LoRA adapter
# base_model = "THUDM/chatglm2-6b"
# peft_model = "./finetuned_model_bak"  # LoRA adapter dir, e.g. "finetuned_model/checkpoint-50"
# # Quantization: 4-bit NF4 with double quantization so the 6B model fits in limited GPU memory
# q_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16,
# )
# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# model = AutoModel.from_pretrained(
#     base_model, trust_remote_code=True, quantization_config=q_config, device_map="cuda"
# )
# model = PeftModel.from_pretrained(model, peft_model)
# # Wrap the model and tokenizer in a text-generation pipeline.
# # The model is already placed on the GPU above, so no device_map is passed here.
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
# )
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     # Combine the conversation history with the new message.
#     # To suit the model, the conversation usually has to be converted into a specific
#     # chat template; different models use different templates (e.g. LLaMA 2 or Zephyr),
#     # so adjust this to match the template your fine-tuned model actually uses.
#     formatted_messages = [{"role": "system", "content": system_message}]
#     for user_msg, bot_msg in history:
#         formatted_messages.append({"role": "user", "content": user_msg})
#         formatted_messages.append({"role": "assistant", "content": bot_msg})
#     formatted_messages.append({"role": "user", "content": message})
#     # Use tokenizer.apply_chat_template to prepare the input;
#     # add_generation_prompt=True tells the tokenizer to append the special tokens
#     # that cue the model to generate a response.
#     prompt = tokenizer.apply_chat_template(
#         formatted_messages,
#         tokenize=False,
#         add_generation_prompt=True,
#     )
#     # Call the pipeline's generation method.
#     # Note: the parameter names do not exactly match the Gradio inputs, so map them here.
#     outputs = pipe(
#         prompt,
#         max_new_tokens=max_tokens,  # the pipeline expects max_new_tokens
#         do_sample=True,             # enable sampling
#         temperature=temperature,
#         top_p=top_p,
#         eos_token_id=tokenizer.eos_token_id,  # stop generating once the EOS token appears
#         pad_token_id=tokenizer.pad_token_id,
#     )
#     # Parse the pipeline output: outputs is a list of dicts, and by default
#     # generated_text contains the prompt followed by the model's continuation,
#     # so strip the prompt prefix to keep only the generated response.
#     generated_text = outputs[0]["generated_text"]
#     response = generated_text[len(prompt):].strip()
#     # Yield the response. This is a simplified, non-streaming version;
#     # true token-by-token streaming needs a different setup, e.g. running
#     # model.generate with a TextIteratorStreamer (see the sketch below).
#     yield response
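# # A minimal streaming sketch, assuming it can reuse the `model` and `tokenizer`
# # objects loaded above (the function name and parameter handling are illustrative,
# # not part of the original app). transformers' TextIteratorStreamer yields text
# # chunks while generation runs in a background thread, which maps naturally onto
# # a Gradio generator that yields the growing partial response.
# from threading import Thread
# from transformers import TextIteratorStreamer
# def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
#     messages = [{"role": "system", "content": system_message}]
#     for user_msg, bot_msg in history:
#         messages.append({"role": "user", "content": user_msg})
#         messages.append({"role": "assistant", "content": bot_msg})
#     messages.append({"role": "user", "content": message})
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs,
#         streamer=streamer,
#         max_new_tokens=max_tokens,
#         do_sample=True,
#         temperature=temperature,
#         top_p=top_p,
#     )
#     # Run generate in a worker thread so the streamer can be consumed here
#     Thread(target=model.generate, kwargs=generation_kwargs).start()
#     partial = ""
#     for chunk in streamer:
#         partial += chunk
#         yield partial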
# Placeholder bot: simply echoes the user's message back
def echo(message, history):
    return message
demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="Echo Bot")
demo.launch()
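# # The live demo above uses `echo` as a placeholder. Once the model-loading block
# # is uncommented, the interface can be wired to `respond` instead; this is a sketch,
# # and the title, slider ranges, defaults, and labels below are illustrative.
# demo = gr.ChatInterface(
#     fn=respond,
#     title="Finetuned ChatGLM2 Bot",
#     additional_inputs=[
#         gr.Textbox(value="You are a helpful assistant.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )
# demo.launch()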