import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# List of available SmilyAI Sam models (adjust as needed)
MODELS = [
    "Smilyai-labs/Sam-reason-S1",
    "Smilyai-labs/Sam-reason-S1.5",
    "Smilyai-labs/Sam-reason-S2",
    "Smilyai-labs/Sam-reason-S3",
    "Smilyai-labs/Sam-reason-v1",
    "Smilyai-labs/Sam-reason-v2",
    "Smilyai-labs/Sam-reason-A1",
    "Smilyai-labs/Sam-flash-mini-v1",
]

device = "cuda" if torch.cuda.is_available() else "cpu"

# Global vars to hold the currently selected model and tokenizer
model = None
tokenizer = None


def load_model(model_name):
    """Load the selected checkpoint and move it to the available device."""
    global model, tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model.eval()
    return f"Loaded model: {model_name}"


def generate_stream(prompt, max_new_tokens=100, temperature=0.7, top_p=0.9):
    """Sample tokens one at a time and yield the running text after each token."""
    global model, tokenizer
    if model is None or tokenizer is None:
        yield "Model not loaded. Please select a model first."
        return

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated_ids = input_ids
    output_text = tokenizer.decode(input_ids[0])

    # Generate tokens one by one
    for _ in range(max_new_tokens):
        # Only the forward pass is needed for sampling, so skip gradient tracking
        with torch.no_grad():
            outputs = model(generated_ids)
        logits = outputs.logits

        # Take the logits for the last position and apply temperature scaling
        next_token_logits = logits[:, -1, :] / temperature

        # Top-p (nucleus) filtering: keep the smallest set of tokens whose
        # cumulative probability exceeds top_p
        sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
        cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

        # Mark tokens with cumulative probability above top_p for removal
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask right so the most likely token is always kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Map the mask back to vocabulary order and mask out the removed tokens
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, sorted_indices, sorted_indices_to_remove
        )
        filtered_logits = next_token_logits.masked_fill(indices_to_remove, -float("Inf"))

        # Sample the next token from the filtered distribution
        probabilities = torch.softmax(filtered_logits, dim=-1)
        next_token = torch.multinomial(probabilities, num_samples=1)

        generated_ids = torch.cat([generated_ids, next_token], dim=-1)
        new_token_text = tokenizer.decode(next_token[0])
        output_text += new_token_text
        yield output_text

        # Stop if the EOS token was generated
        if next_token.item() == tokenizer.eos_token_id:
            break


def on_model_change(model_name):
    return load_model(model_name)


with gr.Blocks() as demo:
    gr.Markdown("# SmilyAI Sam Models — Manual Token Streaming Generator")

    with gr.Row():
        model_selector = gr.Dropdown(choices=MODELS, value=MODELS[0], label="Select Model")
        status = gr.Textbox(label="Status", interactive=False)

    prompt_input = gr.Textbox(lines=3, placeholder="Enter your prompt here...", label="Prompt")
    output_box = gr.Textbox(label="Generated Text", lines=15, interactive=False)
    generate_btn = gr.Button("Generate")

    # Load the default model at startup so the app is usable immediately
    status.value = load_model(MODELS[0])

    model_selector.change(on_model_change, inputs=model_selector, outputs=status)

    def generate_func(prompt):
        if not prompt.strip():
            yield "Please enter a prompt."
            return
        yield from generate_stream(prompt)

    generate_btn.click(generate_func, inputs=prompt_input, outputs=output_box)

demo.launch()
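
# ---------------------------------------------------------------------------
# Optional alternative (untested sketch, not wired into the UI above): the same
# token-by-token streaming can be obtained with transformers' built-in
# TextIteratorStreamer, which runs model.generate() in a background thread and
# yields decoded text chunks as they are produced. The function name
# generate_stream_builtin and its parameters are illustrative assumptions, not
# part of the original app; it reuses the global `model`/`tokenizer` loaded by
# load_model().
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   def generate_stream_builtin(prompt, max_new_tokens=100, temperature=0.7, top_p=0.9):
#       inputs = tokenizer(prompt, return_tensors="pt").to(device)
#       streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#       thread = Thread(
#           target=model.generate,
#           kwargs=dict(
#               **inputs,
#               streamer=streamer,
#               max_new_tokens=max_new_tokens,
#               do_sample=True,
#               temperature=temperature,
#               top_p=top_p,
#           ),
#       )
#       thread.start()
#       text = prompt
#       for chunk in streamer:
#           text += chunk
#           yield text
# ---------------------------------------------------------------------------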