Spestly committed
Commit 6263300 · verified · 1 Parent(s): a2633ea

Update app.py

Files changed (1)
  1. app.py +112 -259
app.py CHANGED
@@ -4,11 +4,6 @@ import torch
  import time
  import spaces
  import re
- import logging
-
- # Configure logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
 
  # Model configurations
  MODELS = {
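Note: this hunk drops the module-level logger in favor of bare print() calls (seen in the new generate_response below). On Spaces both end up in the container logs, but timestamps and log levels are lost. A minimal sketch of what keeping a logger would look like (illustrative only, not part of this commit):

    import logging

    # Same information as the print() calls, but with level and timestamp.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info("Loading %s ...", "Spestly/Athena-R3X-4B")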
@@ -23,110 +18,72 @@ MODELS = {
      "Athena-1 7B": "Spestly/Athena-1-7B"
  }
 
- # Models that need the enable_thinking parameter
- THINKING_ENABLED_MODELS = ["Spestly/Athena-R3X-4B"]
-
- # Cache for loaded models
- loaded_models = {}
-
- @spaces.GPU
- def load_model(model_id):
-     """Load model and tokenizer once and cache them"""
-     try:
-         if model_id not in loaded_models:
-             logger.info(f"🚀 Loading {model_id}...")
-             start_time = time.time()
-
-             tokenizer = AutoTokenizer.from_pretrained(model_id)
-             if tokenizer.pad_token is None:
-                 tokenizer.pad_token = tokenizer.eos_token
-
-             model = AutoModelForCausalLM.from_pretrained(
-                 model_id,
-                 torch_dtype=torch.float16,
-                 device_map="auto",
-                 trust_remote_code=True
-             )
-
-             load_time = time.time() - start_time
-             logger.info(f"✅ Model loaded in {load_time:.2f}s")
-             loaded_models[model_id] = (model, tokenizer, load_time)
-
-         return loaded_models[model_id]
-     except Exception as e:
-         logger.error(f"Error loading model {model_id}: {str(e)}")
-         raise gr.Error(f"Failed to load model {model_id}. Please try another model.")
-
  @spaces.GPU
  def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
-     """Generate response using the specified model"""
-     try:
-         model, tokenizer, _ = load_model(model_id)
-
-         # Build messages in proper chat format
-         messages = []
-         system_prompt = (
-             "You are Athena, a helpful, harmless, and honest AI assistant. "
-             "You provide clear, accurate, and concise responses to user questions. "
-             "You are knowledgeable across many domains and always aim to be respectful and helpful. "
-             "You are finetuned by Aayan Mishra"
-         )
-         messages.append({"role": "system", "content": system_prompt})
-
-         # Add conversation history
-         for msg in conversation:
-             messages.append(msg)
-
-         # Add current user message
-         messages.append({"role": "user", "content": user_message})
-
-         # Check if this model needs the enable_thinking parameter
-         if model_id in THINKING_ENABLED_MODELS:
-             prompt = tokenizer.apply_chat_template(
-                 messages,
-                 tokenize=False,
-                 add_generation_prompt=True,
-                 enable_thinking=True
-             )
-         else:
-             prompt = tokenizer.apply_chat_template(
-                 messages,
-                 tokenize=False,
-                 add_generation_prompt=True
-             )
-
-         inputs = tokenizer(prompt, return_tensors="pt")
-         device = next(model.parameters()).device
-         inputs = {k: v.to(device) for k, v in inputs.items()}
-
-         generation_start = time.time()
-         with torch.no_grad():
-             outputs = model.generate(
-                 **inputs,
-                 max_new_tokens=max_length,
-                 temperature=temperature,
-                 do_sample=True,
-                 top_p=0.9,
-                 pad_token_id=tokenizer.eos_token_id,
-                 eos_token_id=tokenizer.eos_token_id
-             )
-
-         generation_time = time.time() - generation_start
-         response = tokenizer.decode(
-             outputs[0][inputs['input_ids'].shape[-1]:],
-             skip_special_tokens=True
-         ).strip()
-
-         logger.info(f"Generation time: {generation_time:.2f}s")
-         return response, generation_time
-
-     except Exception as e:
-         logger.error(f"Error in generate_response: {str(e)}")
-         raise gr.Error(f"Error generating response: {str(e)}")
+     """Generate response using ZeroGPU - all CUDA operations happen here"""
+     print(f"🚀 Loading {model_id}...")
+     start_time = time.time()
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.float16,
+         device_map="auto",
+         trust_remote_code=True
+     )
+     load_time = time.time() - start_time
+     print(f"✅ Model loaded in {load_time:.2f}s")
+
+     # Build messages in proper chat format (OpenAI-style messages)
+     messages = []
+     system_prompt = (
+         "You are Athena, a helpful, harmless, and honest AI assistant. "
+         "You provide clear, accurate, and concise responses to user questions. "
+         "You are knowledgeable across many domains and always aim to be respectful and helpful. "
+         "You are finetuned by Aayan Mishra"
+     )
+     messages.append({"role": "system", "content": system_prompt})
+
+     # Add conversation history
+     for msg in conversation:
+         messages.append(msg)
+
+     # Add current user message
+     messages.append({"role": "user", "content": user_message})
+
+     prompt = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+     inputs = tokenizer(prompt, return_tensors="pt")
+     device = next(model.parameters()).device
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+     generation_start = time.time()
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_length,
+             temperature=temperature,
+             do_sample=True,
+             top_p=0.9,
+             pad_token_id=tokenizer.eos_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+     generation_time = time.time() - generation_start
+     response = tokenizer.decode(
+         outputs[0][inputs['input_ids'].shape[-1]:],
+         skip_special_tokens=True
+     ).strip()
+     print(f"Generation time: {generation_time:.2f}s")
+     return response, load_time, generation_time
 
  def format_response_with_thinking(response):
      """Format response to handle <think></think> tags"""
+     # Check if response contains thinking tags
      if '<think>' in response and '</think>' in response:
+         # Split the response into parts
          pattern = r'(.*?)(<think>(.*?)</think>)(.*)'
          match = re.search(pattern, response, re.DOTALL)
 
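Note: with load_model and its loaded_models cache gone, the new generate_response reloads the tokenizer and model weights on every request (from the local HF cache after the first download). That fits ZeroGPU's short-lived workers, but repeated calls in a still-warm worker now pay the full load each time. The THINKING_ENABLED_MODELS branch is also gone, so apply_chat_template is no longer called with enable_thinking=True for Athena-R3X-4B; whether that model still emits <think> blocks now depends on its chat template's default. A minimal cache that would fit the new code (hypothetical sketch; the names _model_cache and _load are not in this commit):

    # Hypothetical: reuse weights already loaded in this worker process.
    _model_cache = {}

    def _load(model_id):
        if model_id not in _model_cache:
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True,
            )
            _model_cache[model_id] = (model, tokenizer)
        return _model_cache[model_id]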
@@ -135,66 +92,53 @@ def format_response_with_thinking(response):
              thinking_content = match.group(3).strip()
              after_thinking = match.group(4).strip()
 
+             # Create HTML with collapsible thinking section
              html = f"{before_thinking}\n"
              html += f'<div class="thinking-container">'
-             html += f'<button class="thinking-toggle"><div class="thinking-icon"></div> Thinking completed <span class="dropdown-arrow">▼</span></button>'
+             html += f'<button class="thinking-toggle" onclick="this.nextElementSibling.classList.toggle(\'hidden\'); this.textContent = this.textContent === \'Show reasoning\' ? \'Hide reasoning\' : \'Show reasoning\'">Show reasoning</button>'
              html += f'<div class="thinking-content hidden">{thinking_content}</div>'
              html += f'</div>\n'
              html += after_thinking
 
              return html
 
+     # If no thinking tags, return the original response
      return response
 
- def validate_input(message):
-     """Validate user input"""
-     if not message or not message.strip():
-         raise gr.Error("Message cannot be empty")
-     if len(message) > 2000:
-         raise gr.Error("Message too long (max 2000 characters)")
-     return message
-
  def chat_submit(message, history, conversation_state, model_name, max_length, temperature):
      """Process a new message and update the chat history"""
+     if not message.strip():
+         return "", history, conversation_state
+
+     model_id = MODELS.get(model_name, MODELS["Athena-R3X 4B"])
      try:
-         # Validate input
-         message = validate_input(message)
+         # Print debug info to help diagnose issues
+         print(f"Processing message: {message}")
+         print(f"Selected model: {model_name} ({model_id})")
 
-         # Get model ID
-         model_id = MODELS.get(model_name, MODELS["Athena-R3X 4B"])
-
-         # Show generating message
-         yield "", history + [(message, "Generating response...")], conversation_state, gr.update(visible=True)
-
-         # Generate response
-         response, generation_time = generate_response(
+         response, load_time, generation_time = generate_response(
              model_id, conversation_state, message, max_length, temperature
          )
 
-         # Update conversation state
+         # Update the conversation state with the raw response
          conversation_state.append({"role": "user", "content": message})
          conversation_state.append({"role": "assistant", "content": response})
 
-         # Limit conversation history to last 10 exchanges
-         if len(conversation_state) > 20: # 10 user + 10 assistant messages
-             conversation_state = conversation_state[-20:]
-
          # Format the response for display
          formatted_response = format_response_with_thinking(response)
 
          # Update the visible chat history
-         updated_history = history[:-1] + [(message, formatted_response)]
-
-         yield "", updated_history, conversation_state, gr.update(visible=False)
+         history.append((message, formatted_response))
+         print(f"Response added to history. Current length: {len(history)}")
+
+         return "", history, conversation_state
      except Exception as e:
-         logger.error(f"Error in chat_submit: {str(e)}")
+         import traceback
+         print(f"Error in chat_submit: {str(e)}")
+         print(traceback.format_exc())
          error_message = f"Error: {str(e)}"
-         yield error_message, history, conversation_state, gr.update(visible=False)
-
- def clear_conversation():
-     """Clear the conversation history"""
-     return [], [], gr.update(visible=False)
+         history.append((message, error_message))
+         return "", history, conversation_state
 
  css = """
  .message {
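Note: chat_submit changes from a generator (yield) to a plain function, so the interim "Generating response..." placeholder and the progress widget disappear, and gr.Error is replaced by appending the error text to the chat. The 20-message cap on conversation_state is also dropped, so the prompt grows without bound over a long session. Restoring the cap inside the new function would be small (hypothetical sketch; MAX_MESSAGES is not in this commit):

    MAX_MESSAGES = 20  # 10 user + 10 assistant messages, as in the old code
    if len(conversation_state) > MAX_MESSAGES:
        del conversation_state[:-MAX_MESSAGES]  # trim in place, keep the most recent tail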
@@ -206,140 +150,47 @@ css = """
      margin: 10px 0;
  }
  .thinking-toggle {
-     background-color: rgba(30, 30, 40, 0.8);
-     border: none;
-     border-radius: 25px;
-     padding: 8px 15px;
+     background-color: #f1f1f1;
+     border: 1px solid #ddd;
+     border-radius: 4px;
+     padding: 5px 10px;
      cursor: pointer;
-     font-size: 0.95em;
-     margin-bottom: 8px;
-     color: white;
-     display: flex;
-     align-items: center;
-     gap: 8px;
-     box-shadow: 0 2px 5px rgba(0,0,0,0.2);
-     transition: background-color 0.2s;
-     width: auto;
-     max-width: 280px;
- }
- .thinking-toggle:hover {
-     background-color: rgba(40, 40, 50, 0.9);
- }
- .thinking-icon {
-     width: 16px;
-     height: 16px;
-     border-radius: 50%;
-     background-color: #6366f1;
-     position: relative;
-     overflow: hidden;
- }
- .thinking-icon::after {
-     content: "";
-     position: absolute;
-     top: 50%;
-     left: 50%;
-     width: 60%;
-     height: 60%;
-     background-color: #a5b4fc;
-     transform: translate(-50%, -50%);
-     border-radius: 50%;
- }
- .dropdown-arrow {
-     font-size: 0.7em;
-     margin-left: auto;
-     transition: transform 0.3s;
+     font-size: 0.9em;
+     margin-bottom: 5px;
+     color: #555;
  }
  .thinking-content {
-     background-color: rgba(30, 30, 40, 0.8);
-     border-left: 2px solid #6366f1;
-     padding: 15px;
+     background-color: #f9f9f9;
+     border-left: 3px solid #ccc;
+     padding: 10px;
      margin-top: 5px;
-     margin-bottom: 15px;
      font-size: 0.95em;
-     color: #e2e8f0;
+     color: #555;
      font-family: monospace;
      white-space: pre-wrap;
      overflow-x: auto;
-     border-radius: 5px;
-     line-height: 1.5;
  }
  .hidden {
      display: none;
  }
- .progress-container {
-     text-align: center;
-     margin: 10px 0;
-     color: #6366f1;
- }
  """
 
- js = """
- function setupThinkingToggle() {
-     document.querySelectorAll('.thinking-toggle').forEach(button => {
-         if (!button.dataset.listenerAdded) {
-             button.addEventListener('click', function() {
-                 const content = this.nextElementSibling;
-                 content.classList.toggle('hidden');
-                 const arrow = this.querySelector('.dropdown-arrow');
-                 arrow.textContent = content.classList.contains('hidden') ? '▼' : '▲';
-             });
-             button.dataset.listenerAdded = 'true';
-         }
-     });
- }
-
- document.addEventListener('DOMContentLoaded', () => {
-     setupThinkingToggle();
-
-     const observer = new MutationObserver((mutations) => {
-         setupThinkingToggle();
-     });
-
-     observer.observe(document.body, {
-         childList: true,
-         subtree: true
-     });
- });
- """
+ theme = gr.themes.Soft()
 
- # Create Gradio interface
- with gr.Blocks(title="Athena Playground Chat", css=css, js=js) as demo:
+ with gr.Blocks(title="Athena Playground Chat", css=css, theme=theme) as demo:
      gr.Markdown("# 🚀 Athena Playground Chat")
      gr.Markdown("*Powered by HuggingFace ZeroGPU*")
 
      # State to keep track of the conversation for the model
      conversation_state = gr.State([])
 
-     # Hidden progress indicator
-     progress = gr.HTML(
-         """<div class="progress-container">Generating response...</div>""",
-         visible=False
-     )
-
-     # Chatbot component
-     chatbot = gr.Chatbot(
-         height=500,
-         label="Athena",
-         render_markdown=True,
-         elem_classes=["chatbot"]
-     )
+     chatbot = gr.Chatbot(height=500, label="Athena", render_markdown=True)
 
-     # Input and send button row
      with gr.Row():
-         user_input = gr.Textbox(
-             label="Your message",
-             scale=8,
-             autofocus=True,
-             placeholder="Type your message here...",
-             lines=2
-         )
-         send_btn = gr.Button(
-             value="Send",
-             scale=1,
-             variant="primary"
-         )
+         user_input = gr.Textbox(label="Your message", scale=8, autofocus=True, placeholder="Type your message here...")
+         send_btn = gr.Button(value="Send", scale=1, variant="primary")
 
-     # Clear button
+     # Clear button for resetting the conversation
      clear_btn = gr.Button("Clear Conversation")
 
      # Configuration controls
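Note: the removed js block attached the toggle behaviour globally via a MutationObserver; the new markup inlines the handler on the button itself (see the format_response_with_thinking hunk above), so gr.Blocks no longer needs the js= argument. The inline f-string is dense; an equivalent way to build the same markup in Python (hypothetical refactor; toggle_js and button_html are not in this commit):

    # Same behaviour as the committed one-liner, split for readability.
    toggle_js = (
        "this.nextElementSibling.classList.toggle('hidden'); "
        "this.textContent = this.textContent === 'Show reasoning' "
        "? 'Hide reasoning' : 'Show reasoning'"
    )
    button_html = f'<button class="thinking-toggle" onclick="{toggle_js}">Show reasoning</button>'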
@@ -362,25 +213,28 @@ with gr.Blocks(title="Athena Playground Chat", css=css, js=js) as demo:
          info="Higher values = more creative responses"
      )
 
-     # Connect the interface components
-     submit_event = user_input.submit(
-         fn=chat_submit,
+     # Function to clear the conversation
+     def clear_conversation():
+         return [], []
+
+     # Connect the interface components - note the specific ordering
+     user_input.submit(
+         chat_submit,
          inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
-         outputs=[user_input, chatbot, conversation_state, progress]
+         outputs=[user_input, chatbot, conversation_state]
      )
 
-     send_click = send_btn.click(
-         fn=chat_submit,
+     # Make sure send button uses the exact same function with the same parameter ordering
+     send_btn.click(
+         chat_submit,
          inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
-         outputs=[user_input, chatbot, conversation_state, progress]
+         outputs=[user_input, chatbot, conversation_state]
      )
 
-     clear_btn.click(
-         fn=clear_conversation,
-         outputs=[chatbot, conversation_state, progress]
-     )
+     # Connect clear button
+     clear_btn.click(clear_conversation, outputs=[chatbot, conversation_state])
 
-     # Examples
+     # Add examples if desired
      gr.Examples(
          examples=[
              "What is artificial intelligence?",
@@ -388,15 +242,14 @@ with gr.Blocks(title="Athena Playground Chat", css=css, js=js) as demo:
              "Write a short poem about technology",
              "What are some ethical concerns about AI?"
          ],
-         inputs=user_input
+         inputs=[user_input]
      )
 
      gr.Markdown("""
      ### About the Thinking Tags
      Some Athena models (particularly R3X series) include reasoning in `<think></think>` tags.
-     Click on "Thinking completed" to view the model's thought process behind its answers.
+     Click "Show reasoning" to see the model's thought process behind its answers.
      """)
 
  if __name__ == "__main__":
-     demo.queue()
-     demo.launch(debug=True)
+     demo.launch(debug=True) # Enable debug mode for better error reporting
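Note: demo.queue() is also dropped, so requests are no longer serialized through Gradio's queue; on ZeroGPU the queue is commonly what meters concurrent GPU calls, so this may be worth re-checking. For local testing the pipeline can be exercised directly, since the spaces package documents @spaces.GPU as having no effect outside a ZeroGPU environment (hypothetical smoke test, not part of the commit):

    if __name__ == "__main__":
        # Exercise the new three-value return without launching the UI.
        reply, load_s, gen_s = generate_response(
            "Spestly/Athena-R3X-4B", [], "Hello!", max_length=32, temperature=0.7
        )
        print(f"loaded in {load_s:.1f}s, generated in {gen_s:.1f}s: {reply}")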
 