daltron committed on
Commit 8e477e5 · verified · 1 Parent(s): fb73398

Update app.py

Files changed (1): app.py +105 -30
app.py CHANGED
@@ -1,25 +1,42 @@
 import os
 import gradio as gr
 import torch
-from transformers import pipeline
+from threading import Thread
+from transformers import (
+    pipeline,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TextIteratorStreamer,
+)
 from openai import OpenAI

+# -------------------------
+# Runtime tuning for 2 vCPU Spaces
+# -------------------------
+try:
+    torch.set_num_threads(min(2, os.cpu_count() or 2))
+    torch.set_num_interop_threads(1)
+except Exception:
+    pass
+
 # -------------------------
 # Model choices
 # -------------------------
 MODEL_OPTIONS = [
     "GPT-1 (openai-gpt) - local",
     "GPT-2 (gpt2) - local",
+    "DistilGPT-2 (distilgpt2) - local (fast)",
     "GPT-3.5 (gpt-3.5-turbo) - OpenAI",
 ]

 MODEL_MAP = {
     "GPT-1 (openai-gpt) - local": {"kind": "hf", "id": "openai-gpt"},
     "GPT-2 (gpt2) - local": {"kind": "hf", "id": "gpt2"},
+    "DistilGPT-2 (distilgpt2) - local (fast)": {"kind": "hf", "id": "distilgpt2"},
     "GPT-3.5 (gpt-3.5-turbo) - OpenAI": {"kind": "openai-chat", "id": "gpt-3.5-turbo"},
 }

-# Cache pipelines for HF models so we only load once
+# Cache for loaded Hugging Face models/pipelines
 HF_PIPELINES = {}

 # OpenAI client (only if key exists)
@@ -28,65 +45,118 @@ OPENAI_CLIENT = OpenAI(api_key=OPENAI_KEY) if OPENAI_KEY else None


 def get_hf_pipeline(model_id: str):
-    """Create/fetch a lightweight text-generation pipeline for CPU/GPU."""
+    """Create/fetch a lightweight text-generation pipeline for CPU/GPU with cached weights."""
     if model_id in HF_PIPELINES:
         return HF_PIPELINES[model_id]

     device = 0 if torch.cuda.is_available() else -1
+
+    # Prefer safetensors, load once
+    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+    mdl = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        low_cpu_mem_usage=True,
+        torch_dtype=torch.float32, # CPU-safe
+    )
+
+    # Some older models (e.g., GPT-1/2) have no pad token
+    if tok.pad_token_id is None and tok.eos_token_id is not None:
+        tok.pad_token = tok.eos_token
+
     gen = pipeline(
         "text-generation",
-        model=model_id,
+        model=mdl,
+        tokenizer=tok,
         device=device,
     )
     HF_PIPELINES[model_id] = gen
     return gen


-def generate(model_choice, prompt, max_new_tokens, temperature, top_p, seed):
-    if not prompt.strip():
-        return "Please enter a prompt."
+def generate_stream(model_choice, prompt, max_new_tokens, temperature, top_p, seed):
+    """Stream tokens for both HF and OpenAI for faster perceived latency."""
+    prompt = (prompt or "").strip()
+    if not prompt:
+        yield "Please enter a prompt."
+        return

     info = MODEL_MAP[model_choice]
     kind = info["kind"]
     model_id = info["id"]

-    if seed is not None and int(seed) >= 0:
-        torch.manual_seed(int(seed))
-
     try:
+        if seed is not None and int(seed) >= 0:
+            torch.manual_seed(int(seed))
+
         if kind == "hf":
             gen = get_hf_pipeline(model_id)
-            out = gen(
-                prompt,
+            tok = gen.tokenizer
+            mdl = gen.model
+
+            streamer = TextIteratorStreamer(
+                tok, skip_prompt=True, skip_special_tokens=True
+            )
+
+            inputs = tok(prompt, return_tensors="pt")
+            if torch.cuda.is_available():
+                inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+            generate_kwargs = dict(
+                **inputs,
                 max_new_tokens=int(max_new_tokens),
-                do_sample=temperature > 0,
+                do_sample=float(temperature) > 0.0,
                 temperature=max(1e-6, float(temperature)),
                 top_p=float(top_p),
-                pad_token_id=gen.tokenizer.eos_token_id,
-                return_full_text=False, # don't echo the prompt
+                pad_token_id=tok.eos_token_id,
+                eos_token_id=tok.eos_token_id,
+                streamer=streamer,
             )
-            return out[0]["generated_text"]
+
+            # Run generation in a thread so we can iterate streamer
+            thread = Thread(target=mdl.generate, kwargs=generate_kwargs)
+            thread.start()
+
+            out = ""
+            for token_text in streamer:
+                out += token_text
+                yield out
+            return

         if kind == "openai-chat":
             if OPENAI_CLIENT is None:
-                return "⚠️ To use GPT-3.5, set OPENAI_API_KEY in your Space (Settings → Variables & secrets)."
-            resp = OPENAI_CLIENT.chat.completions.create(
+                yield "⚠️ To use GPT-3.5, set OPENAI_API_KEY in your Space (Settings → Variables & secrets)."
+                return
+
+            stream = OPENAI_CLIENT.chat.completions.create(
                 model=model_id,
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=int(max_new_tokens),
                 temperature=float(temperature),
                 top_p=float(top_p),
+                stream=True,
             )

-            return (resp.choices[0].message.content or "").strip()
+            out = ""
+            for chunk in stream:
+                delta = ""
+                try:
+                    # v1 SDK streaming shape
+                    delta = chunk.choices[0].delta.content or ""
+                except Exception:
+                    # fallback if SDK variant differs
+                    delta = getattr(chunk.choices[0], "text", "") or ""
+                if delta:
+                    out += delta
+                    yield out
+            return

-        return f"Unknown model kind: {kind}"
+        yield f"Unknown model kind: {kind}"

     except Exception as e:
-        return f"❌ Error from {model_choice} ({model_id}): {str(e)}"
+        yield f"❌ Error from {model_choice} ({model_id}): {str(e)}"


 def maybe_warn(choice):
-    """Show a small banner if user picked GPT-3.5 without an API key set."""
     info = MODEL_MAP[choice]
     needs_key = (info["kind"] == "openai-chat") and (OPENAI_CLIENT is None)
     if needs_key:
@@ -99,30 +169,35 @@ with gr.Blocks(title="Mini GPT Playground") as demo:
         """
         # Mini GPT Playground
         Type a prompt and choose a model.
-        **Local (HF):** GPT-1 / GPT-2 — runs in this Space container with `transformers`.
-        **OpenAI (API):** GPT-3.5 — requires `OPENAI_API_KEY`.
+        **Local (HF):** GPT-1 / GPT-2 / DistilGPT-2 — runs in this Space container.
+        **OpenAI (API):** GPT-3.5 — requires `OPENAI_API_KEY`.
+        *(Tip: DistilGPT-2 is much faster on CPU.)*
         """
     )

     with gr.Row():
-        model_choice = gr.Dropdown(MODEL_OPTIONS, value=MODEL_OPTIONS[1], label="Model")
-        max_new_tokens = gr.Slider(1, 512, value=128, step=1, label="Max new tokens")
+        model_choice = gr.Dropdown(MODEL_OPTIONS, value="DistilGPT-2 (distilgpt2) - local (fast)", label="Model")
+        max_new_tokens = gr.Slider(1, 512, value=96, step=1, label="Max new tokens") # lower default for speed
     with gr.Row():
         temperature = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="Temperature")
-        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="Top-p")
+        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
         seed = gr.Number(value=42, precision=0, label="Seed (≥0 to fix sampling)")

-    prompt = gr.Textbox(lines=6, label="Prompt", placeholder="Write a short story about a curious robot...")
+    prompt = gr.Textbox(lines=6, label="Prompt", placeholder="Write a short story about a curious robot")
     warn = gr.Markdown("", visible=False)

     generate_btn = gr.Button("Generate", variant="primary")
     output = gr.Textbox(lines=12, label="Output")

     model_choice.change(maybe_warn, inputs=[model_choice], outputs=[warn])
+
+    # Streamed generation
     generate_btn.click(
-        generate,
+        fn=generate_stream,
         inputs=[model_choice, prompt, max_new_tokens, temperature, top_p, seed],
         outputs=[output],
     )

-demo.queue(max_size=16).launch()
+# Keep concurrency low on 2 vCPU; smaller queue reduces tail latency
+demo.queue(concurrency_count=1, max_size=8, status_update_rate=75).launch()
+
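The heart of this change is threaded generation with transformers' TextIteratorStreamer. A minimal standalone sketch of that pattern, assuming distilgpt2 on CPU (the model name, prompt, and sampling values below are illustrative, not taken verbatim from the commit):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("distilgpt2")
mdl = AutoModelForCausalLM.from_pretrained("distilgpt2")

# skip_prompt=True keeps the echoed prompt out of the streamed text
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
inputs = tok("Write a short story about a curious robot", return_tensors="pt")

# generate() blocks until finished, so it runs in a background thread
# while the main thread reads decoded chunks off the streamer
kwargs = dict(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    pad_token_id=tok.eos_token_id,
    streamer=streamer,
)
Thread(target=mdl.generate, kwargs=kwargs).start()

text = ""
for piece in streamer:
    text += piece
    print(piece, end="", flush=True)

In the Space itself, generate_stream plays the role of this loop: because it is a generator, Gradio streams each yielded string into the Output textbox as it arrives.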