import gradio as gr
# import torch
# from transformers import (
#     AutoModel,
#     AutoTokenizer,
#     BitsAndBytesConfig,
#     pipeline
# )
# from peft import (
#     PeftModel,  # needed below to load the LoRA adapter for inference
#     TaskType,
#     LoraConfig,
#     get_peft_model,
#     set_peft_model_state_dict,
#     prepare_model_for_kbit_training,
#     prepare_model_for_int8_training,
# )
#
# model_dir = "finetuned_model/checkpoint-50"
# tokenizer = AutoTokenizer.from_pretrained(model_dir)
#
# # Load our finetuned model
# base_model = "THUDM/chatglm2-6b"
# peft_model = "./finetuned_model_bak"
#
# # Quantization
# q_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16,
# )
# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# model = AutoModel.from_pretrained(
#     base_model,
#     trust_remote_code=True,
#     quantization_config=q_config,
#     device_map="cuda",
# )
# model = PeftModel.from_pretrained(model, peft_model)
#
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     device_map="auto",
# )
#
#
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     # Combine the conversation history with the new message.
#     # To suit the model, the conversation usually has to be converted into a specific
#     # chat template. Different models use different templates (LLaMA 2 and Zephyr are
#     # examples); adjust this to match the template your fine-tuned model expects.
#     formatted_messages = [{"role": "system", "content": system_message}]
#     for user_msg, bot_msg in history:
#         formatted_messages.append({"role": "user", "content": user_msg})
#         formatted_messages.append({"role": "assistant", "content": bot_msg})
#     formatted_messages.append({"role": "user", "content": message})
#
#     # Prepare the input with tokenizer.apply_chat_template.
#     # add_generation_prompt=True tells the tokenizer to append the special token(s)
#     # that cue the model to generate a response.
#     prompt = tokenizer.apply_chat_template(
#         formatted_messages,
#         tokenize=False,
#         add_generation_prompt=True,
#     )
#
#     # Call the pipeline to generate. Note that the parameter names do not match
#     # Gradio's inputs exactly and have to be mapped.
#     outputs = pipe(
#         prompt,
#         max_new_tokens=max_tokens,  # max_new_tokens is the pipeline's parameter name
#         do_sample=True,  # enable sampling
#         temperature=temperature,
#         top_p=top_p,
#         eos_token_id=tokenizer.eos_token_id,  # stop generating at the eos token
#         pad_token_id=tokenizer.pad_token_id,
#     )
#
#     # Parse the pipeline output: outputs is a list containing the generated text.
#     generated_text = outputs[0]["generated_text"]
#     # Find where the original prompt ends so only the model's response is extracted.
#     response_start = generated_text.find(prompt) + len(prompt)
#     response = generated_text[response_start:].strip()
#
#     # Yield the response. This is a simplified, non-streaming version; streaming would
#     # require more involved changes, e.g. a transformers TextIteratorStreamer with
#     # model.generate, or ChatGLM's stream_chat method.
#     yield response


def echo(message, history):
    return message


demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="Echo Bot")

demo.launch()
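
# A hedged sketch (left commented out, not part of the running app) of how the `respond`
# function above could replace `echo` once the model-loading code is re-enabled. The control
# labels and default values here are assumptions, not taken from the original script; they map
# to respond's system_message / max_tokens / temperature / top_p parameters via
# gr.ChatInterface's additional_inputs, which Gradio passes after message and history.
#
# demo = gr.ChatInterface(
#     fn=respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a helpful assistant.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
#     ],
#     title="Finetuned ChatGLM2-6B",
# )
# demo.launch()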