Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -4,6 +4,11 @@ import torch
 import time
 import spaces
 import re

 # Model configurations
 MODELS = {
@@ -21,82 +26,107 @@ MODELS = {
 # Models that need the enable_thinking parameter
 THINKING_ENABLED_MODELS = ["Spestly/Athena-R3X-4B"]

 @spaces.GPU
 def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
-    """Generate response using
-
-
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True
-    )
-    load_time = time.time() - start_time
-    print(f"✅ Model loaded in {load_time:.2f}s")

-    [… remaining removed lines of the old generate_response body not shown …]

 def format_response_with_thinking(response):
     """Format response to handle <think></think> tags"""
-    # Check if response contains thinking tags
     if '<think>' in response and '</think>' in response:
-        # Split the response into parts
         pattern = r'(.*?)(<think>(.*?)</think>)(.*)'
         match = re.search(pattern, response, re.DOTALL)

@@ -105,7 +135,6 @@ def format_response_with_thinking(response):
         thinking_content = match.group(3).strip()
         after_thinking = match.group(4).strip()

-        # Create HTML with collapsible thinking section
         html = f"{before_thinking}\n"
         html += f'<div class="thinking-container">'
         html += f'<button class="thinking-toggle"><div class="thinking-icon"></div> Thinking completed <span class="dropdown-arrow">▼</span></button>'
@@ -115,43 +144,57 @@

         return html

-    # If no thinking tags, return the original response
     return response

 def chat_submit(message, history, conversation_state, model_name, max_length, temperature):
     """Process a new message and update the chat history"""
-    # For debugging - print when the function is called
-    print(f"chat_submit function called with message: '{message}'")
-
-    if not message or not message.strip():
-        print("Empty message, returning without processing")
-        return "", history, conversation_state
-
-    model_id = MODELS.get(model_name, MODELS["Athena-R3X 4B"])
     try:
-
             model_id, conversation_state, message, max_length, temperature
         )

-        # Update
         conversation_state.append({"role": "user", "content": message})
         conversation_state.append({"role": "assistant", "content": response})

         # Format the response for display
         formatted_response = format_response_with_thinking(response)

         # Update the visible chat history
-        history
-
-        return "", history, conversation_state
     except Exception as e:
-
-        print(f"Error in chat_submit: {str(e)}")
-        print(traceback.format_exc())
         error_message = f"Error: {str(e)}"
-        history.
-

 css = """
 .message {
@@ -223,53 +266,39 @@ css = """
 .hidden {
     display: none;
 }
 """

-# Add JavaScript to make the thinking buttons work
 js = """
 function setupThinkingToggle() {
     document.querySelectorAll('.thinking-toggle').forEach(button => {
-        if (!button.
             button.addEventListener('click', function() {
                 const content = this.nextElementSibling;
                 content.classList.toggle('hidden');
                 const arrow = this.querySelector('.dropdown-arrow');
-
-                    arrow.textContent = '▼';
-                    arrow.style.transform = '';
-                } else {
-                    arrow.textContent = '▲';
-                    arrow.style.transform = 'rotate(0deg)';
-                }
             });
-            button.
         }
     });
 }

-// Setup a mutation observer to watch for changes in the DOM
-const observer = new MutationObserver(function(mutations) {
-    setupThinkingToggle();
-});
-
-// Start observing after DOM is loaded
 document.addEventListener('DOMContentLoaded', () => {
     setupThinkingToggle();
-
-    [… remaining removed lines not shown …]
-        observer.observe(document.body, {
-            childList: true,
-            subtree: true
-        });
-        }
-    }, 1000);
 });
 """

@@ -281,6 +310,12 @@ with gr.Blocks(title="Athena Playground Chat", css=css, js=js) as demo:
     # State to keep track of the conversation for the model
     conversation_state = gr.State([])

     # Chatbot component
     chatbot = gr.Chatbot(
         height=500,
@@ -327,28 +362,22 @@
         info="Higher values = more creative responses"
     )

-    #
-
-        return [], []
-
-    # Connect the interface components with explicit handlers
-    submit_click = user_input.submit(
         fn=chat_submit,
         inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
-        outputs=[user_input, chatbot, conversation_state]
     )

-    # Connect send button explicitly
     send_click = send_btn.click(
         fn=chat_submit,
         inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
-        outputs=[user_input, chatbot, conversation_state]
     )

-    # Clear conversation
     clear_btn.click(
         fn=clear_conversation,
-        outputs=[chatbot, conversation_state]
     )

     # Examples
@@ -369,6 +398,5 @@
     """)

 if __name__ == "__main__":
-    # Enable queue and debugging
     demo.queue()
     demo.launch(debug=True)

@@ -4,6 +4,11 @@ import torch
 import time
 import spaces
 import re
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 # Model configurations
 MODELS = {
@@ -21,82 +26,107 @@ MODELS = {
 # Models that need the enable_thinking parameter
 THINKING_ENABLED_MODELS = ["Spestly/Athena-R3X-4B"]

+# Cache for loaded models
+loaded_models = {}
+
+@spaces.GPU
+def load_model(model_id):
+    """Load model and tokenizer once and cache them"""
+    try:
+        if model_id not in loaded_models:
+            logger.info(f"🚀 Loading {model_id}...")
+            start_time = time.time()
+
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
+            )
+
+            load_time = time.time() - start_time
+            logger.info(f"✅ Model loaded in {load_time:.2f}s")
+            loaded_models[model_id] = (model, tokenizer, load_time)
+
+        return loaded_models[model_id]
+    except Exception as e:
+        logger.error(f"Error loading model {model_id}: {str(e)}")
+        raise gr.Error(f"Failed to load model {model_id}. Please try another model.")
+
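The module-level loaded_models dict gives a simple cache-on-first-use: only the first request for a given model_id pays the from_pretrained cost, and later calls reuse the stored (model, tokenizer, load_time) tuple. A minimal standalone sketch of the same pattern (the expensive_load helper and the timings are illustrative, not part of app.py):

import time

_cache = {}

def expensive_load(name):
    """Stand-in for AutoModelForCausalLM.from_pretrained / AutoTokenizer.from_pretrained."""
    time.sleep(1.0)  # pretend this is a slow download and load
    return f"model object for {name}"

def get_model(name):
    if name not in _cache:          # only the first call pays the loading cost
        _cache[name] = expensive_load(name)
    return _cache[name]

t0 = time.time(); get_model("Spestly/Athena-R3X-4B"); print(f"first call:  {time.time() - t0:.2f}s")
t0 = time.time(); get_model("Spestly/Athena-R3X-4B"); print(f"second call: {time.time() - t0:.2f}s")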
 @spaces.GPU
 def generate_response(model_id, conversation, user_message, max_length=512, temperature=0.7):
+    """Generate response using the specified model"""
+    try:
+        model, tokenizer, _ = load_model(model_id)

+        # Build messages in proper chat format
+        messages = []
+        system_prompt = (
+            "You are Athena, a helpful, harmless, and honest AI assistant. "
+            "You provide clear, accurate, and concise responses to user questions. "
+            "You are knowledgeable across many domains and always aim to be respectful and helpful. "
+            "You are finetuned by Aayan Mishra"
+        )
+        messages.append({"role": "system", "content": system_prompt})

+        # Add conversation history
+        for msg in conversation:
+            messages.append(msg)

+        # Add current user message
+        messages.append({"role": "user", "content": user_message})

+        # Check if this model needs the enable_thinking parameter
+        if model_id in THINKING_ENABLED_MODELS:
+            prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=True
+            )
+        else:
+            prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )

+        inputs = tokenizer(prompt, return_tensors="pt")
+        device = next(model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        generation_start = time.time()
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_length,
+                temperature=temperature,
+                do_sample=True,
+                top_p=0.9,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
+            )
+
+        generation_time = time.time() - generation_start
+        response = tokenizer.decode(
+            outputs[0][inputs['input_ids'].shape[-1]:],
+            skip_special_tokens=True
+        ).strip()
+
+        logger.info(f"Generation time: {generation_time:.2f}s")
+        return response, generation_time
+
+    except Exception as e:
+        logger.error(f"Error in generate_response: {str(e)}")
+        raise gr.Error(f"Error generating response: {str(e)}")

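model.generate returns the prompt tokens and the newly generated tokens as one sequence, which is why the decode above slices from inputs['input_ids'].shape[-1] onward. A small illustration of that slice with made-up token IDs:

import torch

prompt_ids = torch.tensor([[101, 7592, 2088, 102]])                            # pretend prompt tokens
full_output = torch.tensor([[101, 7592, 2088, 102, 2023, 2003, 1996, 3437]])   # prompt + new tokens

new_tokens = full_output[0][prompt_ids.shape[-1]:]
print(new_tokens.tolist())  # [2023, 2003, 1996, 3437], only the generated part gets decoded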
 def format_response_with_thinking(response):
     """Format response to handle <think></think> tags"""
     if '<think>' in response and '</think>' in response:
         pattern = r'(.*?)(<think>(.*?)</think>)(.*)'
         match = re.search(pattern, response, re.DOTALL)

@@ -105,7 +135,6 @@ def format_response_with_thinking(response):
         thinking_content = match.group(3).strip()
         after_thinking = match.group(4).strip()

         html = f"{before_thinking}\n"
         html += f'<div class="thinking-container">'
         html += f'<button class="thinking-toggle"><div class="thinking-icon"></div> Thinking completed <span class="dropdown-arrow">▼</span></button>'

@@ -115,43 +144,57 @@

         return html

     return response

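For reference, this is how the <think> pattern above splits a response; the sample text is invented:

import re

pattern = r'(.*?)(<think>(.*?)</think>)(.*)'
response = "Sure.<think>The user wants a haiku, so keep it to 5-7-5.</think>Here is a haiku about rain."

match = re.search(pattern, response, re.DOTALL)
before_thinking = match.group(1).strip()   # "Sure."  (text shown before the collapsible block)
thinking_content = match.group(3).strip()  # the hidden reasoning placed inside the toggle
after_thinking = match.group(4).strip()    # "Here is a haiku about rain."  (the visible answer)
print(before_thinking, thinking_content, after_thinking, sep="\n")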
+def validate_input(message):
+    """Validate user input"""
+    if not message or not message.strip():
+        raise gr.Error("Message cannot be empty")
+    if len(message) > 2000:
+        raise gr.Error("Message too long (max 2000 characters)")
+    return message
+
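Raising gr.Error inside a handler lets Gradio surface the message to the user instead of failing silently. A quick illustration of the helper's behaviour with made-up inputs:

import gradio as gr

def validate_input(message):
    if not message or not message.strip():
        raise gr.Error("Message cannot be empty")
    if len(message) > 2000:
        raise gr.Error("Message too long (max 2000 characters)")
    return message

print(validate_input("Hello Athena"))   # passes through unchanged
try:
    validate_input("   ")
except gr.Error as err:
    print(err)                          # "Message cannot be empty"; shown as an error popup in the UI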
 def chat_submit(message, history, conversation_state, model_name, max_length, temperature):
     """Process a new message and update the chat history"""
     try:
+        # Validate input
+        message = validate_input(message)
+
+        # Get model ID
+        model_id = MODELS.get(model_name, MODELS["Athena-R3X 4B"])
+
+        # Show generating message
+        yield "", history + [(message, "Generating response...")], conversation_state, gr.update(visible=True)
+
+        # Generate response
+        response, generation_time = generate_response(
             model_id, conversation_state, message, max_length, temperature
         )

+        # Update conversation state
         conversation_state.append({"role": "user", "content": message})
         conversation_state.append({"role": "assistant", "content": response})

+        # Limit conversation history to last 10 exchanges
+        if len(conversation_state) > 20:  # 10 user + 10 assistant messages
+            conversation_state = conversation_state[-20:]
+
         # Format the response for display
         formatted_response = format_response_with_thinking(response)

         # Update the visible chat history
+        updated_history = history[:-1] + [(message, formatted_response)]
+
+        yield "", updated_history, conversation_state, gr.update(visible=False)

     except Exception as e:
+        logger.error(f"Error in chat_submit: {str(e)}")
         error_message = f"Error: {str(e)}"
+        yield error_message, history, conversation_state, gr.update(visible=False)
+
+def clear_conversation():
+    """Clear the conversation history"""
+    return [], [], gr.update(visible=False)

css = """
|
200 |
.message {
|
|
|
266 |
.hidden {
|
267 |
display: none;
|
268 |
}
|
269 |
+
.progress-container {
|
270 |
+
text-align: center;
|
271 |
+
margin: 10px 0;
|
272 |
+
color: #6366f1;
|
273 |
+
}
|
274 |
"""
|
275 |
|
|
|
 js = """
 function setupThinkingToggle() {
     document.querySelectorAll('.thinking-toggle').forEach(button => {
+        if (!button.dataset.listenerAdded) {
             button.addEventListener('click', function() {
                 const content = this.nextElementSibling;
                 content.classList.toggle('hidden');
                 const arrow = this.querySelector('.dropdown-arrow');
+                arrow.textContent = content.classList.contains('hidden') ? '▼' : '▲';
             });
+            button.dataset.listenerAdded = 'true';
         }
     });
 }

 document.addEventListener('DOMContentLoaded', () => {
     setupThinkingToggle();
+
+    const observer = new MutationObserver((mutations) => {
+        setupThinkingToggle();
+    });
+
+    observer.observe(document.body, {
+        childList: true,
+        subtree: true
+    });
 });
 """

@@ -281,6 +310,12 @@ with gr.Blocks(title="Athena Playground Chat", css=css, js=js) as demo:
     # State to keep track of the conversation for the model
     conversation_state = gr.State([])

+    # Hidden progress indicator
+    progress = gr.HTML(
+        """<div class="progress-container">Generating response...</div>""",
+        visible=False
+    )
+
     # Chatbot component
     chatbot = gr.Chatbot(
         height=500,
@@ -327,28 +362,22 @@
         info="Higher values = more creative responses"
     )

+    # Connect the interface components
+    submit_event = user_input.submit(
         fn=chat_submit,
         inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
+        outputs=[user_input, chatbot, conversation_state, progress]
     )

     send_click = send_btn.click(
         fn=chat_submit,
         inputs=[user_input, chatbot, conversation_state, model_choice, max_length, temperature],
+        outputs=[user_input, chatbot, conversation_state, progress]
     )

     clear_btn.click(
         fn=clear_conversation,
+        outputs=[chatbot, conversation_state, progress]
     )

     # Examples
@@ -369,6 +398,5 @@
     """)

 if __name__ == "__main__":
     demo.queue()
     demo.launch(debug=True)