MegaTronX committed
Commit b55d8a4 · verified · 1 Parent(s): fc8d00d

Update app.py

Files changed (1)
  1. app.py +107 -1
app.py CHANGED
@@ -1,3 +1,109 @@
+import spaces
+import json
+import subprocess
+from llama_cpp import Llama
+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+from llama_cpp_agent.providers import LlamaCppPythonProvider
+from llama_cpp_agent.chat_history import BasicChatHistory
+from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
+from huggingface_hub import hf_hub_download
 
-gr.Interface.load("models/MegaTronX/Odyssey-SelectolaxQLoRA").launch()
+
+hf_hub_download(
+    repo_id="MegaTronX/Odyssey-SelectolaxLoRA-F32-GGUF",
+    filename="Odyssey-SelectolaxQLoRA-f32.gguf",
+    local_dir="./models"
+)
+
+
+@spaces.GPU(duration=120)  # duration: max seconds of ZeroGPU time allocated per call
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    model,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repeat_penalty,
+):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=True,
+        n_gpu_layers=81,
+        n_batch=1024,
+        n_ctx=8192,
+    )
+    provider = LlamaCppPythonProvider(llm)
+
+    agent = LlamaCppAgent(
+        provider,
+        system_prompt=f"{system_message}",
+        predefined_messages_formatter_type=chat_template,
+        debug_output=True
+    )
+
+    settings = provider.get_provider_default_settings()
+    settings.temperature = temperature
+    settings.top_k = top_k
+    settings.top_p = top_p
+    settings.max_tokens = max_tokens
+    settings.repeat_penalty = repeat_penalty
+    settings.stream = True
+
+    messages = BasicChatHistory()
+
+    for msg in history:  # replay prior turns into the agent's chat history
+        user = {
+            'role': Roles.user,
+            'content': msg[0]
+        }
+        assistant = {
+            'role': Roles.assistant,
+            'content': msg[1]
+        }
+        messages.add_message(user)
+        messages.add_message(assistant)
+
+    stream = agent.get_chat_response(
+        message,
+        llm_sampling_settings=settings,
+        chat_history=messages,
+        returns_streaming_generator=True,
+        print_output=False
+    )
+
+    outputs = ""  # accumulate streamed tokens; each yield is the full text so far
+    for output in stream:
+        outputs += output
+        yield outputs
+
+
+def create_interface(model_name):
+    return gr.ChatInterface(
+        respond,
+        additional_inputs=[
+            gr.Textbox(value=model_name, label="Model", interactive=False),
+            gr.Textbox(value="", label="System Message"),
+            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+            gr.Slider(minimum=0.1, maximum=4.0, value=0.3, step=0.1, label="Temperature"),
+            gr.Slider(minimum=0.1, maximum=1.0, value=0.80, step=0.05, label="Top-p"),
+            gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
+            gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.1, label="Repetition penalty"),
+        ],
+        submit_btn="Send",
+        title=model_name,
+    )
+
+interface = create_interface("Odyssey-SelectolaxQLoRA-f32.gguf")
+
+demo = gr.Blocks()
+with demo:
+    interface.render()
+
+if __name__ == "__main__":
+    demo.launch()
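
Since respond() yields the accumulated reply as a plain Python generator, it can be smoke-tested outside Gradio. A minimal sketch, assuming the GGUF file has already been downloaded to ./models by the hf_hub_download call above and that llama-cpp-python and llama-cpp-agent are installed; the prompt and sampling values below are illustrative, not part of the commit:

    # Hypothetical local check of the streaming generator in app.py.
    from app import respond  # importing app.py also triggers the model download

    last = ""
    for partial in respond(
        message="Hello! What can you do?",
        history=[],                                # no prior turns
        model="Odyssey-SelectolaxQLoRA-f32.gguf",
        system_message="You are a helpful assistant.",
        max_tokens=256,
        temperature=0.3,
        top_p=0.8,
        top_k=40,
        repeat_penalty=1.0,
    ):
        last = partial                             # each yield is the full text so far
    print(last)

Each yield carries the whole response accumulated so far rather than a single new token, which is the streaming contract gr.ChatInterface expects from its callback.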