import gradio as gr
import torch
import os
import time
# --- Try to import ctransformers for GGUF, provide a helpful message if not found ---
# We try to import ctransformers first as it's the preferred method for ZeroCPU efficiency.
try:
    from ctransformers import AutoModelForCausalLM as AutoModelForCausalLM_GGUF
    # We still need AutoTokenizer from transformers for standard tokenizing
    from transformers import AutoTokenizer, AutoModelForCausalLM
    GGUF_AVAILABLE = True
except ImportError:
    GGUF_AVAILABLE = False
    print("WARNING: 'ctransformers' not found. This app relies on it for efficient CPU inference.")
    print("Please install it with: pip install ctransformers transformers")
    # If ctransformers isn't available, fall back to standard transformers loading, which is slower on CPU.
    from transformers import AutoTokenizer, AutoModelForCausalLM
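
# NOTE (assumption): the Space is expected to declare roughly the following
# dependencies in requirements.txt; exact versions/pins are up to the deployer:
#   gradio
#   torch
#   transformers
#   ctransformers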
# --- Configuration for Models and Generation ---
# Original model (for reference, or if a GPU is detected, though ZeroCPU is target)
ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
# !!! IMPORTANT !!! For efficient ZeroCPU (CPU-only) inference,
# a GGUF quantized model is HIGHLY RECOMMENDED.
# SmolLM2-360M-Instruct does NOT have a readily available GGUF version from common providers.
# Therefore, for ZeroCPU deployment, this app will use a common, small GGUF model by default.
# If you find a GGUF for SmolLM2 later, you can update these:
GGUF_MODEL_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" # Recommended GGUF placeholder for ZeroCPU
GGUF_MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # Corresponding GGUF file name
# --- Generation Parameters ---
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.7
TOP_K = 50
TOP_P = 0.95
DO_SAMPLE = True # Important for varied responses
# Global model and tokenizer variables
model = None
tokenizer = None
model_is_gguf = False  # Tracks whether the loaded model is a ctransformers GGUF model
device = "cpu"  # Explicitly set to CPU for ZeroCPU deployment
# --- Model Loading Function ---
def load_model_for_zerocpu():
    global model, tokenizer, model_is_gguf, device

    # Attempt to load the GGUF model first for efficiency on ZeroCPU
    if GGUF_AVAILABLE:
        print(f"Attempting to load GGUF model '{GGUF_MODEL_ID}' (file: '{GGUF_MODEL_FILENAME}') for ZeroCPU...")
        try:
            model = AutoModelForCausalLM_GGUF.from_pretrained(
                GGUF_MODEL_ID,
                model_file=GGUF_MODEL_FILENAME,
                model_type="llama",  # TinyLlama is Llama-architecture; adjust if you swap in another GGUF model
                gpu_layers=0  # Ensures it runs entirely on CPU, not GPU
            )
            # Load the tokenizer from the original SmolLM2 model; ctransformers tokenizes
            # internally, so this mainly keeps the tokenizer global populated for the chat path.
            tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            model_is_gguf = True
            print(f"GGUF model '{GGUF_MODEL_ID}' loaded successfully for CPU.")
            return  # Exit early if the GGUF model loaded successfully
        except Exception as e:
            print(f"WARNING: Could not load GGUF model '{GGUF_MODEL_ID}' from '{GGUF_MODEL_FILENAME}': {e}")
            print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (slower without GGUF quantization).")
            # Fall through to the standard Hugging Face loading path below
    else:
        print("WARNING: ctransformers is not available. Loading the standard Hugging Face model directly.")

    # Fallback: load the standard Hugging Face model (slower on CPU without GGUF quantization)
    print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
    try:
        model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
        tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.to(device)  # Explicitly move the model to CPU
        model_is_gguf = False
        print(f"Standard model '{ORIGINAL_MODEL_ID}' loaded successfully on CPU.")
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load standard model '{ORIGINAL_MODEL_ID}' on CPU: {e}")
        print("Please ensure the model ID is correct, you have enough RAM, and dependencies are installed.")
        model = None      # Indicate failure to load
        tokenizer = None  # Indicate failure to load
# --- Inference Function for Gradio ChatInterface ---
def predict_chat(message: str, history: list):
    # 'history' is a list of [user_message, bot_message] pairs
    # 'message' is the current user input
    if model is None or tokenizer is None:
        yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
        return

    # Build the full conversation history in the chat-template message format
    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
    for human_msg, ai_msg in history:
        messages.append({"role": "user", "content": human_msg})
        messages.append({"role": "assistant", "content": ai_msg})
    messages.append({"role": "user", "content": message})  # Add the current user message

    generated_text = ""
    start_time = time.time()  # Start timing for the current turn
    if model_is_gguf:  # The loaded model is a ctransformers GGUF model
        # For ctransformers (GGUF), manually construct a simple prompt string
        prompt_input = ""
        for msg in messages:
            if msg["role"] == "system":
                prompt_input += f"{msg['content']}\n"
            elif msg["role"] == "user":
                prompt_input += f"User: {msg['content']}\n"
            elif msg["role"] == "assistant":
                prompt_input += f"Assistant: {msg['content']}\n"
        prompt_input += "Assistant:"  # Cue the model to generate the assistant's response
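        # Illustrative example of the prompt built above (assumed two-turn history):
        #   You are a friendly chatbot.
        #   User: What is the capital of France?
        #   Assistant: The capital of France is Paris.
        #   User: <current message>
        #   Assistant: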
        # Stream tokens from the GGUF model. ctransformers streams via the model's
        # __call__ with stream=True and has no 'do_sample' argument; sampling is
        # controlled by temperature/top_k/top_p.
        for token in model(
            prompt_input,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
            repetition_penalty=1.1,  # Common setting for GGUF chat models
            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],  # Common stop sequences
            stream=True
        ):
            generated_text += token
            yield generated_text  # Yield the partial response for streaming in Gradio
    else:  # A standard Hugging Face transformers model was loaded (slower on CPU)
        # Apply the tokenizer's chat template
        input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

        # Generate the response
        outputs = model.generate(
            inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
            do_sample=DO_SAMPLE,
            pad_token_id=tokenizer.pad_token_id  # Important for generation
        )
        # Decode only the newly generated tokens
        generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
        yield generated_text  # Yield the full response at once (transformers' generate doesn't stream by default)

    end_time = time.time()
    print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
# --- Gradio Interface Setup ---
if __name__ == "__main__":
    # Load the model globally when the Gradio app starts
    load_model_for_zerocpu()

    # Opening message shown in the chat window before the user types anything
    initial_chatbot_message = (
        "Hello! I'm an AI assistant. I'm currently running in a CPU-only "
        "environment for efficient demonstration. How can I help you today?"
    )

    demo = gr.ChatInterface(
        fn=predict_chat,  # The function that handles chat prediction
        # gr.ChatInterface has no 'initial_chatbot_message' argument, so the opening
        # message is pre-seeded as the Chatbot's starting value (Gradio 4.x pair format).
        chatbot=gr.Chatbot(height=500, value=[[None, initial_chatbot_message]]),
        textbox=gr.Textbox(
            placeholder="Ask me a question...",
            container=False,
            scale=7
        ),
        title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
        description=(
            f"This Space demonstrates an LLM for efficient CPU-only inference. "
            f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
            f"like TinyLlama) because it runs far better on CPU than `{ORIGINAL_MODEL_ID}` "
            f"without GGUF quantization. Expect varied responses each run due to sampled generation."
        ),
        theme="soft",
        examples=[  # Pre-defined examples for quick testing
            ["What is the capital of France?"],
            ["Can you tell me a fun fact about outer space?"],
            ["What's the best way to stay motivated?"],
        ],
        cache_examples=False,  # Ensures examples run inference each time instead of using a cache
        clear_btn="Clear Chat"  # Button to clear the conversation (Gradio 4.x)
    )

    # Launch the Gradio app
    # `share=True` creates a public link (useful for local testing, not needed on HF Spaces)
    # `server_name="0.0.0.0"` and `server_port=7860` are the typical defaults on HF Spaces
    demo.launch()