mwalker22 committed
Commit
3e8133b
·
1 Parent(s): 43b3bc7

Debugging, and the resulting refactorings, to correct errors occurring when the application is deployed via Docker.

- Ensured that the state object of the LangGraph is always an SDGState object, preventing attribute errors (see the sketch after this list).
- Improved debugging and logging in app.py.
- Updated Dockerfile build process to use uv for reliable builds and dependency resolution.
- Refactored graph node logic and types for clarity and maintainability.
- Enhanced vectorstore embedding logic and improved test mocking for OpenAI embeddings.
- Updated and cleaned up test suite for preprocess/embed_documents.
- General codebase cleanup and modernization.
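
A minimal sketch of the normalization guard this commit adds (the helper name ensure_sdg_state is illustrative; app.py inlines the same check):

    from graph.types import SDGState

    def ensure_sdg_state(result) -> SDGState:
        # graph.invoke() can hand back a plain dict of state values rather
        # than an SDGState instance; coerce it so attribute access such as
        # result.answer never raises AttributeError.
        if isinstance(result, SDGState):
            return result
        return SDGState(**dict(result))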

Dockerfile CHANGED
@@ -4,22 +4,40 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 
-# Copy requirements and install dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Install UV
+RUN pip install uv
 
-# Copy the rest of the application
-COPY . .
+# Copy pyproject.toml and install dependencies
+COPY pyproject.toml .
+RUN uv venv && \
+    . .venv/bin/activate && \
+    uv pip install -e .
 
-# Install the package in development mode
-RUN pip install -e .
+# Copy the application code
+COPY preprocess/ preprocess/
+COPY graph/ graph/
+COPY app.py .
+
+# Create necessary directories
+RUN mkdir -p generated data
+
+# Copy data after creating directory
+COPY data/ data/
+
+# Create a shell script to run the application
+RUN echo '#!/bin/bash\n\
+source /app/.venv/bin/activate\n\
+exec /app/.venv/bin/streamlit run app.py --server.port=8501 --server.address=0.0.0.0' > /app/run.sh && \
+    chmod +x /app/run.sh
 
 # Expose the port Streamlit runs on
 EXPOSE 8501
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
-ENV ENVIRONMENT=production
+ENV ENVIRONMENT=development
+ENV LANGCHAIN_TRACING_V2=false
+ENV PATH="/app/.venv/bin:$PATH"
 
 # Command to run the application
-CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+CMD ["/app/run.sh"]
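
For reference, a typical build-and-run sequence for this image (the tag and API-key wiring are illustrative, not part of the commit):

    docker build -t sdg-langgraph .
    docker run -p 8501:8501 -e OPENAI_API_KEY="$OPENAI_API_KEY" sdg-langgraph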
app.py CHANGED
@@ -1,10 +1,15 @@
 import streamlit as st
 import json
+import logging
 from preprocess.html_to_documents import extract_documents_from_html
 from preprocess.embed_documents import create_or_load_vectorstore
 from graph.build_graph import build_sdg_graph
 from graph.types import SDGState
 
+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 # Page config
 st.set_page_config(
     page_title="SDG via LangGraph",
@@ -45,10 +50,20 @@ docs, vectorstore, graph = initialize_resources()
 if st.button("Generate Synthetic Data"):
     with st.spinner("Generating synthetic data..."):
         # Create initial state
-        initial_state = SDGState(input="Generate synthetic data about LLM evolution")
+        initial_state = SDGState(
+            input="Generate synthetic data about LLM evolution",
+            documents=[],
+            evolved_question="",
+            context=[],
+            answer=""
+        )
+        logger.debug(f"Initial state before invoke: {initial_state}")
 
-        # Invoke the graph
+        # Invoke the graph with the SDGState object
         result = graph.invoke(initial_state)
+        logger.debug(f"Graph result: {result}")
+        if not isinstance(result, SDGState):
+            result = SDGState(**dict(result))
 
         # Display results
         st.subheader("Generated Data")
@@ -57,21 +72,21 @@ if st.button("Generate Synthetic Data"):
         st.markdown("### Evolved Questions")
         evolved_questions = [
            {"id": f"q{i}", "question": q, "evolution_type": "simple"}
-           for i, q in enumerate([result["evolved_question"]])  # Currently only one question
+           for i, q in enumerate([result.evolved_question])  # Currently only one question
         ]
         st.json(evolved_questions)
 
         # Display answers
         st.markdown("### Answers")
         answers = [
-           {"id": "q0", "answer": result["answer"]}
+           {"id": "q0", "answer": result.answer}
         ]
         st.json(answers)
 
         # Display contexts
         st.markdown("### Contexts")
         contexts = [
-           {"id": "q0", "contexts": result["context"]}
+           {"id": "q0", "contexts": result.context}
         ]
         st.json(contexts)
 
graph/build_graph.py CHANGED
@@ -6,16 +6,22 @@ from graph.nodes.answer import generate_answer
 
 
 def build_sdg_graph(docs, vectorstore) -> StateGraph:
+    # Create a new graph with our state type
     builder = StateGraph(SDGState)
 
-    # Add nodes
+    # Add nodes with explicit state handling
     builder.add_node("evolve", evolve_question)
     builder.add_node("retrieve", lambda state: retrieve_relevant_context(state, vectorstore))
     builder.add_node("generate_answer", generate_answer)
 
-    # Define flow
+    # Define the flow
     builder.set_entry_point("evolve")
     builder.add_edge("evolve", "retrieve")
     builder.add_edge("retrieve", "generate_answer")
+    builder.set_finish_point("generate_answer")
 
-    return builder.compile()
+    # Compile the graph
+    graph = builder.compile()
+
+    # Return the graph
+    return graph
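
set_finish_point("generate_answer") is LangGraph's shorthand for wiring the node to the END sentinel; the equivalent explicit spelling, shown for orientation:

    from langgraph.graph import END

    # Same effect as builder.set_finish_point("generate_answer"):
    builder.add_edge("generate_answer", END)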
graph/nodes/answer.py CHANGED
@@ -1,10 +1,26 @@
 from graph.types import SDGState
+import logging
+
+logger = logging.getLogger(__name__)
 
 def generate_answer(state: SDGState) -> SDGState:
     """
     Synthesizes an answer from the retrieved context.
     This is a placeholder and would normally call an LLM in production.
     """
+    logger.debug(f"Answer node received state: {state}")
+
+    # Generate the answer
     context_snippet = "\n".join(state.context)
-    state.answer = f"Based on the retrieved context:\n{context_snippet}"
-    return state
+
+    # Create a new state with the generated answer
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=state.evolved_question,
+        context=state.context,
+        answer=f"Based on the retrieved context:\n{context_snippet}"
+    )
+
+    logger.debug(f"Answer node returning state: {new_state}")
+    return new_state
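
Rebuilding every field by hand is explicit but verbose; Pydantic's copy-with-update expresses the same immutable update in one call (a sketch; model_copy is the Pydantic v2 spelling, .copy(update=...) on v1):

    def generate_answer(state: SDGState) -> SDGState:
        context_snippet = "\n".join(state.context)
        # Copies state, replacing only the answer field.
        return state.model_copy(update={"answer": f"Based on the retrieved context:\n{context_snippet}"})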
graph/nodes/evolve.py CHANGED
@@ -1,8 +1,20 @@
 from graph.prompts import question_evolution_prompt
 from graph.types import SDGState
+import logging
 
+logger = logging.getLogger(__name__)
 
 def evolve_question(state: SDGState) -> SDGState:
-    # Placeholder for LLM-driven evolution
-    state.evolved_question = f"Evolved version of: {state.input}"
-    return state
+    logger.debug(f"Evolve node received state: {state}")
+
+    # Create a new state with the evolved question
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=f"Evolved version of: {state.input}",
+        context=state.context,
+        answer=state.answer
+    )
+
+    logger.debug(f"Evolve node returning state: {new_state}")
+    return new_state
graph/nodes/retrieve.py CHANGED
@@ -1,6 +1,23 @@
 from graph.types import SDGState
+import logging
+
+logger = logging.getLogger(__name__)
 
 def retrieve_relevant_context(state: SDGState, vectorstore) -> SDGState:
+    logger.debug(f"Retrieve node received state: {state}")
+
+    # Perform retrieval
     retrieved_docs = vectorstore.similarity_search(state.evolved_question, k=5)
+    logger.debug(f"Retrieved {len(retrieved_docs)} documents")
+
+    # Create a new state with the retrieved context
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=state.evolved_question,
+        context=[doc.page_content for doc in retrieved_docs],
+        answer=state.answer
+    )
+
+    logger.debug(f"Retrieve node returning state: {new_state}")
+    return new_state
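
LangGraph nodes may also return a partial dict of just the keys they change, which the framework merges into the state; a leaner equivalent sketch of this node under that convention:

    def retrieve_relevant_context(state: SDGState, vectorstore) -> dict:
        retrieved_docs = vectorstore.similarity_search(state.evolved_question, k=5)
        # LangGraph merges this single-key update into the SDGState.
        return {"context": [doc.page_content for doc in retrieved_docs]}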
graph/types.py CHANGED
@@ -1,10 +1,10 @@
 from typing import List
 from langchain.schema import Document
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 class SDGState(BaseModel):
-    input: str
-    documents: List[Document] = []
-    evolved_question: str = ""
-    context: List[str] = []
-    answer: str = ""
+    input: str = Field(default="")
+    documents: List[Document] = Field(default_factory=list)
+    evolved_question: str = Field(default="")
+    context: List[str] = Field(default_factory=list)
+    answer: str = Field(default="")
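
With every field now defaulted, partial construction no longer trips validation; a quick check of the new behavior:

    from graph.types import SDGState

    # Both construct cleanly; SDGState() previously failed on the
    # required `input` field.
    empty = SDGState()
    partial = SDGState(input="How did LLMs evolve in 2023?")
    assert empty.context == [] and partial.answer == ""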
main.py CHANGED
@@ -72,7 +72,17 @@ def main():
 
         graph = build_sdg_graph(docs, vectorstore)
         initial_state = SDGState(input="How did LLMs evolve in 2023?")
-        result = graph.invoke(initial_state)
+
+        # Convert the state to a dictionary with the expected keys
+        state_dict = {
+            "input": initial_state.input,
+            "documents": initial_state.documents,
+            "evolved_question": initial_state.evolved_question,
+            "context": initial_state.context,
+            "answer": initial_state.answer
+        }
+
+        result = graph.invoke(state_dict)
         print("🧠 Agent Output:")
         print(json.dumps(result, indent=2, ensure_ascii=False, cls=DocumentEncoder))
     else:
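
The hand-built state_dict matches what iterating the Pydantic model yields, so a shallow one-liner would do the same (dict() over a BaseModel returns raw field values without nested conversion, the mirror image of the SDGState(**dict(result)) guard in app.py):

    # Equivalent to the hand-built state_dict above.
    result = graph.invoke(dict(initial_state))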
preprocess/embed_documents.py CHANGED
@@ -10,6 +10,8 @@ import os
 
 def create_or_load_vectorstore(docs: list[Document], path: str = "generated/vectorstore") -> FAISS:
     path = Path(path)
+
+    # Initialize embeddings with minimal configuration
     embeddings = OpenAIEmbeddings(
         model="text-embedding-3-small",
         openai_api_key=os.getenv("OPENAI_API_KEY")
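
The hunk only shows the top of the function; for orientation, a minimal sketch of the create-or-load caching pattern the name implies (FAISS.load_local / from_documents / save_local are standard LangChain calls, but this body is illustrative, not the repo's code):

    from pathlib import Path
    from langchain_community.vectorstores import FAISS

    def create_or_load_sketch(docs, embeddings, path: str) -> FAISS:
        path = Path(path)
        if path.exists():
            # Reuse the cached index instead of re-embedding everything.
            return FAISS.load_local(str(path), embeddings,
                                    allow_dangerous_deserialization=True)
        vectorstore = FAISS.from_documents(docs, embeddings)
        vectorstore.save_local(str(path))
        return vectorstore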
pyproject.toml CHANGED
@@ -15,7 +15,8 @@ dependencies = [
     "openai",
     "tiktoken",
     "langchain-openai",
-    "faiss-cpu"
+    "faiss-cpu",
+    "streamlit"
 ]
 
 [project.optional-dependencies]
tests/preprocess/test_embed_documents.py CHANGED
@@ -25,11 +25,15 @@ def test_create_vectorstore_when_not_cached(mock_open_file, mock_exists, mock_em
 @patch("preprocess.embed_documents.FAISS.load_local")
 @patch("preprocess.embed_documents.Path.exists", return_value=True)
 @patch("preprocess.embed_documents.open", new_callable=mock_open)
-def test_load_existing_vectorstore(mock_open_file, mock_exists, mock_load_local):
+@patch("preprocess.embed_documents.OpenAIEmbeddings")
+def test_load_existing_vectorstore(mock_embeddings, mock_open_file, mock_exists, mock_load_local):
     mock_vectorstore = MagicMock()
     mock_load_local.return_value = mock_vectorstore
-
+    mock_embeddings_instance = MagicMock()
+    mock_embeddings.return_value = mock_embeddings_instance
+
     result = create_or_load_vectorstore([], path="tests/tmp/vectorstore.pkl")
 
     assert result == mock_vectorstore
-    mock_load_local.assert_called_once()
+    mock_load_local.assert_called_once()
+    mock_embeddings.assert_called_once()
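
One detail in the new signature that is easy to trip over: stacked @patch decorators apply bottom-up, so mock arguments arrive in reverse decorator order, which is why the newly added OpenAIEmbeddings patch (closest to the function) binds to the first parameter. A reduced illustration:

    from unittest.mock import patch

    @patch("os.getcwd")   # outermost decorator -> last mock argument
    @patch("os.getenv")   # innermost decorator -> first mock argument
    def check(mock_getenv, mock_getcwd):
        # Parameters mirror the decorator stack bottom-up.
        assert mock_getenv is not mock_getcwd

    check()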