app.py CHANGED
@@ -2,30 +2,23 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import torch
 
-# Load model and tokenizer with trust_remote_code=True
 model_id = "PowerInfer/SmallThinker-21BA3B-Instruct"
 
-tokenizer = AutoTokenizer.from_pretrained(
-    model_id,
-    trust_remote_code=True  # Required for models with custom code
-)
-
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map="
-    torch_dtype=torch.
-    trust_remote_code=True
+    device_map="auto",  # Let HF decide best device
+    torch_dtype=torch.float16,  # Use float16 for speed if GPU available
+    trust_remote_code=True
 )
 
-# Create text generation pipeline
 generator = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
-    device
+    device=0 if torch.cuda.is_available() else -1
 )
 
-# Define the chat function
 def chat(prompt, max_new_tokens=256, temperature=0.7):
     output = generator(
         prompt,
@@ -36,8 +29,8 @@ def chat(prompt, max_new_tokens=256, temperature=0.7):
     )
     return output[0]["generated_text"]
 
-#
-gr.Interface(
+# Define the interface but do NOT launch manually
+demo = gr.Interface(
     fn=chat,
     inputs=[
         gr.Textbox(label="Prompt", lines=4, placeholder="Ask anything..."),
@@ -46,5 +39,8 @@ gr.Interface(
     ],
     outputs=gr.Textbox(label="Response"),
     title="💬 SmallThinker-21BA3B-Instruct",
-    description="Run PowerInfer/SmallThinker-21BA3B-Instruct
-)
+    description="Run PowerInfer/SmallThinker-21BA3B-Instruct"
+)
+
+if __name__ == "__main__":
+    demo.launch()