Spaces: Running on Zero
Upload app.py with huggingface_hub
app.py CHANGED
@@ -1,50 +1,50 @@
 import gradio as gr
 from transformers import pipeline
 import torch
+import spaces

 # Initialize the model pipeline
-model_id = "facebook/MobileLLM-
+model_id = "facebook/MobileLLM-1B"
 pipe = pipeline(
     "text-generation",
     model=model_id,
-    torch_dtype=
+    torch_dtype=torch.float16,
     device_map="auto",
 )

+@spaces.GPU(duration=120)
 def respond(message, history):
-    #
-
-
-    # Add conversation history
+    # Build prompt from history
+    prompt = ""
     for user_msg, assistant_msg in history:
         if user_msg:
-
+            prompt += f"User: {user_msg}\n"
         if assistant_msg:
-
+            prompt += f"Assistant: {assistant_msg}\n"

     # Add current message
-
+    prompt += f"User: {message}\nAssistant: "

-    # Generate response
-
-
-
+    # Generate response with streaming
+    response = ""
+    for token in pipe(
+        prompt,
+        max_new_tokens=256,
         temperature=0.7,
         do_sample=True,
         pad_token_id=pipe.tokenizer.eos_token_id,
-
-
-
-
-
-
-    return assistant_response
+        return_full_text=False,
+        stream=True,
+    ):
+        chunk = token[0]["generated_text"]
+        response = chunk
+        yield response

 # Create the chat interface
 demo = gr.ChatInterface(
     fn=respond,
     title="MobileLLM Chat",
-    description="Chat with Facebook's MobileLLM-
+    description="Chat with Facebook's MobileLLM-1B model",
     examples=[
         "Write a Python function that returns the square of a number.",
         "Compute: 1-2+3-4+5- ... +99-100.",
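One caveat with the new generation loop: stream=True is not a documented keyword of the transformers text-generation pipeline, so the for token in pipe(...) call may raise an error or simply return the full completion in one chunk rather than streaming it. The usual streaming pattern with transformers is a TextIteratorStreamer consumed while generation runs in a background thread. Below is a minimal sketch of how respond could be written that way, reusing the pipe object and prompt format from the file above; it is an untested alternative under those assumptions, not the method committed here.

from threading import Thread

import spaces
from transformers import TextIteratorStreamer

@spaces.GPU(duration=120)
def respond(message, history):
    # Same plain-text prompt format as in app.py above
    prompt = ""
    for user_msg, assistant_msg in history:
        if user_msg:
            prompt += f"User: {user_msg}\n"
        if assistant_msg:
            prompt += f"Assistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant: "

    # Streamer yields decoded text pieces as they are generated
    streamer = TextIteratorStreamer(
        pipe.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    inputs = pipe.tokenizer(prompt, return_tensors="pt").to(pipe.model.device)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        pad_token_id=pipe.tokenizer.eos_token_id,
        streamer=streamer,
    )

    # Run generate() in a background thread so the streamer can be consumed here
    Thread(target=pipe.model.generate, kwargs=generation_kwargs).start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response  # ChatInterface re-renders the partial reply on each yield

Because respond is a generator, gr.ChatInterface shows the assistant reply growing token by token, which matches the streaming intent of this commit.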