import os
import time
import threading
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
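
# Model checkpoint to serve and the local directory its snapshot is stored in.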
MODEL_REPO = "daniel-dona/gemma-3-270m-it"
LOCAL_DIR = os.path.join(os.getcwd(), "local_model")
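
# Enable hf_transfer for faster downloads and pin the BLAS/OpenMP thread count to the available CPU cores.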
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ.setdefault("OMP_NUM_THREADS", str(os.cpu_count() or 1))
os.environ.setdefault("MKL_NUM_THREADS", os.environ["OMP_NUM_THREADS"])
os.environ.setdefault("OMP_PROC_BIND", "TRUE")
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
torch.set_num_interop_threads(1)
torch.set_float32_matmul_precision("high")
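
# Download the model snapshot into local_dir, retrying with exponential backoff, and return the local path.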
def ensure_local_model(repo_id: str, local_dir: str, tries: int = 3, sleep_s: float = 3.0) -> str:
    os.makedirs(local_dir, exist_ok=True)
    for i in range(tries):
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                local_dir_use_symlinks=False,
                resume_download=True,
                allow_patterns=["*.json", "*.model", "*.safetensors", "*.bin", "*.txt", "*.py"]
            )
            return local_dir
        except Exception:
            if i == tries - 1:
                raise
            time.sleep(sleep_s * (2 ** i))
    return local_dir
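
# Load tokenizer and model strictly from the local snapshot; fp32 on CPU, no device_map sharding.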
model_path = ensure_local_model(MODEL_REPO, LOCAL_DIR)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    torch_dtype=torch.float32,
    device_map=None
)
model.eval()
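
# Build a plain-text chat prompt from the history, dropping the oldest turns until it fits within max_ctx_tokens.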
def build_prompt(message, history, system_message, max_ctx_tokens=1024):
    msgs = [{"role": "system", "content": system_message}]
    for u, a in history:
        if u:
            msgs.append({"role": "user", "content": u})
        if a:
            msgs.append({"role": "assistant", "content": a})
    msgs.append({"role": "user", "content": message})
    chat_template = """{% for m in messages %}
{{ m['role'] }}: {{ m['content'] }}
{% endfor %}
Assistant:"""
    while True:
        text = tokenizer.apply_chat_template(
            msgs,
            chat_template=chat_template,
            tokenize=False,
            add_generation_prompt=True
        )
        if len(tokenizer(text, add_special_tokens=False).input_ids) <= max_ctx_tokens:
            return text
        if len(msgs) <= 2:
            # Nothing left to trim (only the system prompt and the current message); return as-is.
            return text
        # Drop the oldest non-system user/assistant pair and re-check the token budget.
        for i in range(1, len(msgs)):
            if msgs[i]["role"] != "system":
                del msgs[i:i+2]
                break
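
# Streaming handler for gr.ChatInterface: generation runs in a worker thread while decoded text is yielded as it arrives.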
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    text = build_prompt(message, history, system_message)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    do_sample = bool(temperature and temperature > 0.0)
    gen_kwargs = dict(
        max_new_tokens=max_tokens,
        do_sample=do_sample,
        top_p=top_p,
        temperature=temperature if do_sample else None,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    try:
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    except TypeError:
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = threading.Thread(
        target=model.generate,
        kwargs={**inputs, **{k: v for k, v in gen_kwargs.items() if v is not None}, "streamer": streamer}
    )
    partial_text = ""
    token_count = 0
    start_time = None
    with torch.inference_mode():
        thread.start()
        try:
            for chunk in streamer:
                if start_time is None:
                    start_time = time.time()
                partial_text += chunk
                token_count += 1
                yield partial_text
        finally:
            thread.join()
    duration = max(1e-6, time.time() - start_time) if start_time else 0.0
    tps = (token_count / duration) if duration > 0 else 0.0
    yield partial_text + f"\n\n⚡ Speed: {tps:.2f} tokens/s"
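
# Chat UI with the system prompt and sampling parameters exposed as additional inputs.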
demo = gr.ChatInterface(
    respond_stream,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.0, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
    ]
)
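
# Warm up with a single-token generation so the first user request does not pay lazy-initialization cost, then launch the app.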
if __name__ == "__main__":
    with torch.inference_mode():
        _ = model.generate(
            **tokenizer(["Hi"], return_tensors="pt").to(model.device),
            max_new_tokens=1, do_sample=False, use_cache=True
        )
    demo.queue(max_size=32).launch()