HuatuoGPT-o1-7B-GGUF-Demo-Q4

Running

File size: 2,771 Bytes

fc46f2c
 
 
ebe9f12
372a5eb
ebe9f12
fc46f2c
 
 
e40962e
 
fc46f2c
 
 
 
372a5eb
b388fe7
 
fc46f2c
 
 
 
 
 
 
 
3f93878
 
 
 
fc46f2c
3f93878
 
 
fc46f2c
3f93878
fc46f2c
 
 
 
 
3f93878
 
fc46f2c
 
 
3f93878
 
fc46f2c
 
 
 
3f93878
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc46f2c
 
372a5eb
 
fc46f2c
3f93878
 
fc46f2c
 
3f93878
fc46f2c

import os
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download #, login

# login(os.getenv("HF_TOKEN"))  # no longer needed: it's public now

# Fetch the GGUF weights from the Hugging Face Hub and load them with
# llama.cpp. REPO_ID, MODEL_FILE and N_CTX can be overridden through
# environment variables.
# Alternative (v0.1) build noted by the original author:
#   repo "bartowski/HuatuoGPT-o1-7B-v0.1-GGUF",
#   file "HuatuoGPT-o1-7B-v0.1-Q4_0.gguf"
model = Llama(
    model_path=hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "bartowski/HuatuoGPT-o1-7B-GGUF"),
        filename=os.environ.get("MODEL_FILE", "HuatuoGPT-o1-7B-Q4_K_M.gguf"),
    ),
    # Without an explicit n_ctx, llama-cpp-python uses a 512-token context
    # window, which caps prompt + reply below the UI's "Max Tokens" range
    # (512-4096) and makes longer conversations fail. 8192 leaves room for a
    # 4096-token reply plus the chat history.
    n_ctx=int(os.environ.get("N_CTX", "8192")),
)

# Markdown shown at the top of the page; one tuple entry per rendered line.
DESCRIPTION = "\n".join(
    (
        "",
        "# FreedomIntelligence/HuatuoGPT-o1-7B | Duplicate the space and set it to private for faster & personal inference for free.",
        "HuatuoGPT-o1 is a medical LLM designed for advanced medical reasoning.",
        "It generates a complex thought process, reflecting and refining its reasoning, before providing a final response. ",
        "",
        '**To start a new chat**, click "clear" and start a new dialog.',
        "",
    )
)

# Markdown footer shown at the bottom of the page.
LICENSE = "\n--- Apache 2.0 License ---\n"

def user(message, history):
    """Record the submitted message as a new user turn.

    Args:
        message: Text taken from the input box.
        history: Chat transcript as a list of ``{"role", "content"}`` dicts.
            ``None`` is treated as an empty transcript.

    Returns:
        A ``(textbox_value, history)`` pair: an empty string, which clears
        the input box, and a new list holding the previous turns followed by
        the user's message. The ``history`` argument itself is not mutated.
    """
    new_turn = {"role": "user", "content": message}
    # `history or []` guards against a missing transcript, where
    # `None + [...]` would raise TypeError.
    return "", [*(history or []), new_turn]

def generate_text(history, max_tokens=512, temperature=0.9, top_p=0.95):
    """Stream the model's reply to the latest message in ``history``.

    Args:
        history: Chat transcript as a list of dicts with ``"role"`` and
            ``"content"`` keys. The last entry is sent as the pending user
            message; only ``role`` and ``content`` of each entry are
            forwarded to the model.
        max_tokens: Passed to ``model.create_chat_completion``.
        temperature: Passed to ``model.create_chat_completion``.
        top_p: Passed to ``model.create_chat_completion``.

    Yields:
        The same ``history`` list (mutated in place) after each streamed
        chunk, with the assistant reply accumulating in its final entry.
        Nothing is yielded when ``history`` is empty or ``None``.
    """
    # Nothing to answer: bail out before touching the model instead of
    # failing with IndexError on history[-1].
    if not history:
        return

    *earlier, latest = history
    messages = [{"role": turn["role"], "content": turn["content"]} for turn in earlier]
    # The final entry is always sent as the user turn, whatever role it stores.
    messages.append({"role": "user", "content": latest["content"]})

    response = model.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stream=True,
    )

    # Placeholder that the streamed text is appended to.
    reply = {"role": "assistant", "content": ""}
    history.append(reply)

    for streamed in response:
        delta = streamed["choices"][0].get("delta", {})
        # A chunk may omit "content" or carry it as None; both count as "".
        reply["content"] += delta.get("content") or ""
        yield history

# Build the page top to bottom: description, chat window, input box, clear
# button, collapsible sampling controls, example prompts, license footer.
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    # Sampling controls, collapsed by default.
    with gr.Accordion("Adjust Parameters", open=False):
        max_tokens = gr.Slider(
            label="Max Tokens",
            minimum=512,
            maximum=4096,
            value=1024,
            step=1,
        )
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=1.5,
            value=0.9,
            step=0.1,
        )
        top_p = gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
        )

    # On submit: first record the user's turn and empty the textbox (unqueued),
    # then stream the model's reply into the chat window.
    _on_submit = msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    )
    _on_submit.then(
        fn=generate_text,
        inputs=[chatbot, max_tokens, temperature, top_p],
        outputs=chatbot,
    )

    # The clear button resets the chat window by sending it None.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

    gr.Examples(
        label="Examples",
        inputs=msg,
        examples=[
            ["How many r's are in the word strawberry?"],
            ["How to stop a cough?"],
            ["How do I relieve feet pain?"],
        ],
    )

    gr.Markdown(LICENSE)

# Start the web server only when run as a script.
if __name__ == "__main__":
    demo.launch()