import gradio as gr
# import torch
# from transformers import (
#     AutoModel,
#     AutoTokenizer,
#     BitsAndBytesConfig,
#     pipeline
# )
# from peft import (
#     PeftModel,
#     TaskType,
#     LoraConfig,
#     get_peft_model,
#     set_peft_model_state_dict,
#     prepare_model_for_kbit_training,
# )

# model_dir = "finetuned_model/checkpoint-50"

# # Load our fine-tuned model: the ChatGLM2 base model plus a local LoRA adapter.
# # (The tokenizer is loaded from base_model below, so no separate load from
# # model_dir is needed here.)
# base_model = "THUDM/chatglm2-6b"
# peft_model = "./finetuned_model_bak"

# # Quantization
# q_config = BitsAndBytesConfig(load_in_4bit=True,
#                                 bnb_4bit_quant_type='nf4',
#                                 bnb_4bit_use_double_quant=True,
#                                 bnb_4bit_compute_dtype=torch.float16
#                                 )

# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# model = AutoModel.from_pretrained(base_model, trust_remote_code=True, quantization_config=q_config, device_map="cuda")

# model = PeftModel.from_pretrained(model, peft_model)


# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     device_map="auto" 
# )

# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     # Combine the conversation history with the new message.
#     # To suit the model, the conversation usually has to be converted into a
#     # specific chat-template format. Different models use different templates
#     # (e.g. LLaMA 2 or Zephyr); adjust this to the template your fine-tuned
#     # model was actually trained with.
    
#     formatted_messages = [{"role": "system", "content": system_message}]
    
#     for user_msg, bot_msg in history:
#         formatted_messages.append({"role": "user", "content": user_msg})
#         formatted_messages.append({"role": "assistant", "content": bot_msg})

#     formatted_messages.append({"role": "user", "content": message})
    
#     # Use tokenizer.apply_chat_template to prepare the input.
#     # add_generation_prompt=True tells the tokenizer to append the special
#     # token that cues the model to generate a response.
#     prompt = tokenizer.apply_chat_template(
#         formatted_messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
    
#     # Call the pipeline to generate text.
#     # Note: the parameter names do not match the Gradio inputs exactly and
#     # have to be mapped.
#     outputs = pipe(
#         prompt,
#         max_new_tokens=max_tokens, # max_new_tokens is the pipeline's name for this setting
#         do_sample=True, # enable sampling
#         temperature=temperature,
#         top_p=top_p,
#         eos_token_id=tokenizer.eos_token_id, # stop generation at the eos token
#         pad_token_id=tokenizer.pad_token_id,
#     )
    
#     # Parse the pipeline output: outputs is a list containing the generated text.
#     generated_text = outputs[0]["generated_text"]
    
#     # Find where the original prompt ends so we can extract only the
#     # model-generated part of the text.
#     response_start = generated_text.find(prompt) + len(prompt)
#     response = generated_text[response_start:].strip()

#     # Yield the response. This is a simplified, non-streaming version;
#     # true streaming output would require more involved changes to the
#     # pipeline or the model itself, e.g. using model.generate together with
#     # a streaming helper such as transformers.TextIteratorStreamer.
#     yield response
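
# A minimal sketch (left commented out, like the pipeline code above) of how
# respond could be wired into the Gradio UI once the model code is enabled.
# The additional_inputs below map to respond's extra parameters; the slider
# ranges and defaults are assumptions, not values from the original setup.
# demo = gr.ChatInterface(
#     fn=respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a helpful assistant.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
#     ],
#     title="Finetuned ChatGLM2 Bot",
# )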
  
# Placeholder bot: echo the user's message back until the fine-tuned model
# pipeline above is enabled.
def echo(message, history):
    return message

demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="Echo Bot")
demo.launch()