# try-this-model / app.py
# m8than's picture
# Update app.py
# 2433428 verified
from openai import OpenAI
import gradio as gr
import os
import json
import functools
import random
import datetime
from transformers import AutoTokenizer
# Tokenizer for the Reflection model. respond() uses its chat template to turn
# the message list into a raw prompt string for that one model.
reflection_tokenizer = AutoTokenizer.from_pretrained("mattshumer/Reflection-Llama-3.1-70B")

# Featherless API key, read from the environment (None when the variable is unset).
api_key = os.environ.get('FEATHERLESS_API_KEY')

# OpenAI SDK client pointed at the Featherless endpoint instead of the default one.
client = OpenAI(
    base_url="https://api.featherless.ai/v1",
    api_key=api_key
)

# System message that respond() places at the start of every conversation it sends.
SYSTEM_PROMPT = """You are a world-class AI system, capable of complex reasoning and reflection. Reason through the query inside <thinking> tags, and then provide your final response inside <output> tags. If you detect that you made a mistake in your reasoning at any point, correct yourself inside <reflection> tags."""
# Prefix removed from the Reflection model's streamed reply before display.
# NOTE(review): presumably the model emits this header itself because the
# prompt is rendered without a generation prompt -- confirm.
_ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Attribution headers sent with every inference request.
_REQUEST_HEADERS = {
    'HTTP-Referer': 'https://huggingface.co/spaces/featherless-ai/try-this-model',
    'X-Title': "HF's missing inference widget",
}


def _strip_assistant_header(text):
    """Return *text* without the leading assistant header, if it has one."""
    if text.startswith(_ASSISTANT_HEADER):
        return text[len(_ASSISTANT_HEADER):]
    if _ASSISTANT_HEADER.startswith(text):
        # Only (part of) the header has arrived so far: nothing to show yet.
        return ""
    # The reply does not begin with the header; show it untouched rather than
    # chopping off real content.
    return text


def respond(message, history, model):
    """Stream the model's reply to *message*, yielding the text received so far.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
        model: Id of the model to query.

    Yields:
        The accumulated assistant reply after each streamed chunk that
        carries text.
    """
    history_openai_format = [{"role": "system", "content": SYSTEM_PROMPT}]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    if model == "mattshumer/Reflection-Llama-3.1-70B":
        # chat/completions not working for this model;
        # apply chat template locally
        prompt = reflection_tokenizer.apply_chat_template(
            history_openai_format,
            tokenize=False,
        )
        response = client.completions.create(
            model=model,
            prompt=prompt,
            temperature=1.0,
            stream=True,
            max_tokens=2000,
            extra_headers=_REQUEST_HEADERS,
        )

        partial_message = ""
        for chunk in response:
            # Skip chunks without choices instead of failing on choices[0].
            if not chunk.choices:
                continue
            text = chunk.choices[0].text
            if text is not None:
                partial_message += text
                yield _strip_assistant_header(partial_message)
    else:
        response = client.chat.completions.create(
            model=model,
            messages=history_openai_format,
            temperature=1.0,
            stream=True,
            max_tokens=2000,
            extra_headers=_REQUEST_HEADERS,
        )

        partial_message = ""
        for chunk in response:
            # Skip chunks without choices instead of failing on choices[0].
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                partial_message += content
                yield partial_message
# SVG markup of the logo, embedded inline in the "Inference by" footer link.
# Read through a context manager so the file handle is closed promptly.
with open('./logo.svg', 'r', encoding='utf-8') as f_logo:
    logo = f_logo.read()

# Model catalogue keyed by model class; each value holds that class's model ids.
with open('./model-cache.json', 'r', encoding='utf-8') as f_model_cache:
    model_cache = json.load(f_model_cache)
# Model classes whose models are offered in the dropdown.
_ENABLED_MODEL_CLASSES = (
    "mistral-v02-7b-std-lc",
    "llama3-8b-8k",
    "llama2-solar-10b7-4k",
    "mistral-nemo-12b-lc",
    "llama2-13b-4k",
    "llama3-15b-8k",
)

# Model classes that are known but deliberately left out of the dropdown.
_DISABLED_MODEL_CLASSES = (
    "qwen2-32b-lc",
    "llama3-70b-8k",
    "qwen2-72b-lc",
    "mixtral-8x22b-lc",
    "llama3-405b-lc",
)

# Model class -> whether its models are offered (True) or hidden (False).
model_class_filter = {
    **dict.fromkeys(_ENABLED_MODEL_CLASSES, True),
    **dict.fromkeys(_DISABLED_MODEL_CLASSES, False),
}
def build_model_choices():
    """Return the ``(label, model_id)`` pairs offered in the model dropdown.

    Every model of each class enabled in ``model_class_filter`` is listed,
    labelled ``"<model_id> (<model_class>)"``. Classes missing from the
    filter are reported on stdout and skipped; classes mapped to ``False``
    are skipped silently. The Reflection model is always appended last.
    """
    choices = []
    for class_name, model_ids in model_cache.items():
        if class_name in model_class_filter:
            if model_class_filter[class_name]:
                choices.extend(
                    (f"{model_id} ({class_name})", model_id)
                    for model_id in model_ids
                )
        else:
            print(f"Warning: new model class {class_name}. Treating as blacklisted")

    # and add one more ...
    extra_class = "llama3-70b-8k"
    extra_id = "mattshumer/Reflection-Llama-3.1-70B"
    choices.append((f"{extra_id} ({extra_class})", extra_id))
    return choices


# Dropdown choices, computed once at import time.
model_choices = build_model_choices()
def initial_model(referer=None):
    """Return the model id the dropdown should start on.

    ``referer`` is accepted but currently ignored: the Reflection model is
    always returned.
    """
    # NOTE: the previous selection logic is disabled. It returned a fixed model
    # for the local dev referer, the model named by a huggingface.co referer
    # when present in model_cache, and otherwise a per-day random pick from
    # model_choices seeded by the RANDOM_SEED environment variable.
    default_model_id = "mattshumer/Reflection-Llama-3.1-70B"
    return default_model_id
# Title text handed to gr.Blocks when the page is built below.
title_text="HuggingFace's missing inference widget"

# Custom stylesheet passed to gr.Blocks via css=. It colours the logo mark and
# applies the full-height flex layout from the Gradio issue linked inside the
# string.
css = """
.logo-mark { fill: #ffe184; }
/* from https://github.com/gradio-app/gradio/issues/4001
* necessary as putting ChatInterface in gr.Blocks changes behaviour
*/
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 100vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto;}
"""
# Assemble the page: heading, model picker with a model-card button, the chat
# interface wired to respond(), and the "Inference by" footer.
# The title is passed by keyword because the first positional parameter of
# gr.Blocks is the theme, not the title.
with gr.Blocks(title=title_text, css=css) as demo:
    gr.HTML("""
<h1 align="center">HuggingFace's missing inference widget</h1>
<h2 align="center">
Please select your model from the list 👇
</h2>
""")

    with gr.Row():
        model_selector = gr.Dropdown(
            label="Select your Model",
            # Reuse the list built at import time instead of rebuilding it.
            choices=model_choices,
            # Passed as a callable, not a fixed value.
            value=initial_model,
            scale=4,
        )
        # Client-side only (fn=None): opens the selected model's card in a new tab.
        gr.Button(
            value="Visit Model Card ↗️",
            scale=1,
        ).click(
            inputs=[model_selector],
            js="(model_selection) => { window.open(`https://huggingface.co/${model_selection}`, '_blank') }",
            fn=None,
        )

    # The selected model is passed to respond() as its third argument.
    gr.ChatInterface(
        respond,
        additional_inputs=[model_selector],
        head="""
<script>console.log("Hello from gradio!")</script>
""",
        concurrency_limit=5,
    )

    gr.HTML(f"""
<p align="center">
Inference by <a href="https://featherless.ai">{logo}</a>
</p>
""")

    def update_initial_model_choice(request: gr.Request):
        """Return the initial model for this page load, given its referer header."""
        return initial_model(request.headers.get('referer'))

    # On page load, set the dropdown from the request's referer.
    demo.load(update_initial_model_choice, outputs=model_selector)

demo.launch()