import os, logging, gradio as gr
from pydoc import html
from openai import OpenAI
from gateway import request_generation
from utils import LATEX_DELIMS

openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

logging.basicConfig(level=logging.INFO)

def format_final(analysis_text: str, visible_text: str) -> str:
    """Render final message with collapsible analysis + normal Markdown answer."""
    reasoning_safe = html.escape((analysis_text or "").strip())
    response = (visible_text or "").strip()
    # Collapsible analysis, normal markdown answer
    return (
        "<details><summary><strong>🤔 Analysis</strong></summary>\n"
        "<pre style='white-space:pre-wrap;'>"
        f"{reasoning_safe}"
        "</pre>\n</details>\n\n"
        "**💬 Response:**\n\n"
        f"{response}"
    )

def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
    if not message.strip():
        yield "Please enter a prompt."
        return

    # Flatten gradio history
    msgs = []
    for h in history:
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u: msgs.append({"role": "user", "content": u})
            if a: msgs.append({"role": "assistant", "content": a})

    logging.info(f"[User] {message}")
    logging.info(f"[System] {system_prompt} | Temp={temperature}")

    tools = [{"type": "web_search_preview"}] if enable_browsing else None
    tool_choice = "auto" if enable_browsing else None

    in_analysis = False
    in_visible  = False

    raw_analysis = ""   
    raw_visible  = ""   

    raw_started = False
    last_flush_len = 0

    def make_raw_preview() -> str:
        return (
            "```text\n"
            "Analysis (live):\n"
            f"{raw_analysis}\n\n"
            "Response (draft):\n"
            f"{raw_visible}\n"
            "```"
        )

    try:
        for chunk in request_generation(
            api_key=openai_api_key, api_base=openai_api_base,
            message=message, system_prompt=system_prompt,
            model_name=model_name, chat_history=msgs,
            temperature=temperature, reasoning_effort=reasoning_effort,
            max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
        ):
            if chunk == "analysis":
                in_analysis, in_visible = True, False
                if not raw_started:
                    raw_started = True
                    yield make_raw_preview()
                continue

            if chunk == "assistantfinal":
                in_analysis, in_visible = False, True
                if not raw_started:
                    raw_started = True
                    yield make_raw_preview()
                continue

            if in_analysis:
                raw_analysis += chunk
            elif in_visible:
                raw_visible += chunk
            else:
                raw_visible += chunk

            total_len = len(raw_analysis) + len(raw_visible)
            if total_len - last_flush_len >= 120 or "\n" in chunk:
                last_flush_len = total_len
                yield make_raw_preview()

        final_markdown = format_final(raw_analysis, raw_visible)

        if final_markdown.count("$") % 2:
            final_markdown += "$"

        # This replaces the raw preview in-place with the pretty final message
        yield final_markdown

    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"


chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
        gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"),
        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
    ],
    stop_btn=True,
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are advantages of AMD Instinct MI300X GPU?"],
        ["Derive the gradient of softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title=" GPT-OSS-120B on AMD MI300X",
    description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License. ***DISCLAIMER:*** Analysis is provided along with final response to allow users to gain insight into model's chain of thought, but may contain content not deemed suitable to include in final response.",
)

if __name__ == "__main__":
    chatbot_ui.queue(max_size=QUEUE_SIZE,
                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()