import streamlit as st
import json
import logging
import os
from preprocess.html_to_documents import extract_documents_from_html
from preprocess.embed_documents import create_or_load_vectorstore
from graph.build_graph import build_sdg_graph
from graph.types import SDGState
from langchain_openai import ChatOpenAI

# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Page config
st.set_page_config(
    page_title="SDG via LangGraph",
    page_icon="🧠",
    layout="wide"
)

# Title
st.title("🧠 Synthetic Data Generation via LangGraph")
st.markdown("This app demonstrates the RAGAS Synthetic Data Generation steps using LangGraph.")

# Initialize the graph and documents (this would be done once at startup)
@st.cache_resource
def initialize_resources():
    st.info("Initializing resources... This may take a moment.")
    
    # Load documents
    docs = []
    for html_file, label in [
        ("data/2023_llms.html", "llm-2023"),
        ("data/2024_llms.html", "llm-2024"),
    ]:
        docs.extend(extract_documents_from_html(html_file, label))
    
    # Create vectorstore
    vectorstore_path = os.environ.get("VECTORSTORE_PATH", "/tmp/vectorstore")
    vectorstore = create_or_load_vectorstore(docs, path=vectorstore_path)
    
    # Initialize LLM client
    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=None)  # None will use env var
    
    # Build graph
    graph = build_sdg_graph(docs, vectorstore, llm)
    
    st.success("Resources initialized successfully!")
    return docs, vectorstore, graph

# Initialize resources
docs, vectorstore, graph = initialize_resources()

# Add a number input for evolution passes
num_evolve_passes = st.number_input(
    label="Number of Evolution Passes",
    min_value=1,
    max_value=10,
    value=2,
    step=1,
    help="How many times to evolve the question (alternates between challenging and creative prompts)."
)

# Generate synthetic data button
if st.button("Generate Synthetic Data"):
    with st.spinner("Generating synthetic data..."):
        # Create initial state
        state = SDGState(
            input="Generate synthetic data about LLM evolution",
            documents=[],
            evolved_questions=[],
            context=[],
            answer="",
            num_evolve_passes=num_evolve_passes
        )
        
        # Run the graph for each evolution pass
        all_results = []
        for i in range(num_evolve_passes):
            logger.debug(f"Running evolution pass {i+1}/{num_evolve_passes}")
            result = graph.invoke(state)
            if not isinstance(result, SDGState):
                result = SDGState(**dict(result))
            all_results.append(result)
            # Update state for next iteration with evolved questions
            state = SDGState(
                input=state.input,
                documents=state.documents,
                evolved_questions=result.evolved_questions,  # Pass forward all evolved questions
                context=[],  # Reset context for next iteration
                answer="",   # Reset answer for next iteration
                num_evolve_passes=num_evolve_passes
            )
        
        # Display results
        st.subheader("Generated Data")
        
        # Display evolved questions
        st.markdown("### Evolved Questions")
        evolved_questions = [
            {"id": f"q{i}", "question": result.evolved_questions[-1], "evolution_type": "simple"}
            for i, result in enumerate(all_results)
        ]
        st.json(evolved_questions)
        
        # Display answers
        st.markdown("### Answers")
        answers = [
            {"id": f"q{i}", "answer": result.answer}
            for i, result in enumerate(all_results)
        ]
        st.json(answers)
        
        # Display contexts
        st.markdown("### Contexts")
        contexts = [
            {"id": f"q{i}", "contexts": result.context}
            for i, result in enumerate(all_results)
        ]
        st.json(contexts)
        
        # Download results
        results = {
            "evolved_questions": evolved_questions,
            "answers": answers,
            "contexts": contexts
        }
        
        st.download_button(
            label="Download Results as JSON",
            data=json.dumps(results, indent=2),
            file_name="synthetic_data.json",
            mime="application/json"
        )