mwalker22 committed
Commit
3e8133b
·
1 Parent(s): 43b3bc7

Debugging, and the resulting refactorings, to correct errors occurring when the application is deployed via Docker.

- Ensured that the state object of the LangGraph is always an SDGState object, preventing attribute errors (see the sketch after this list).
- Improved debugging and logging in app.py.
- Updated Dockerfile build process to use uv for reliable builds and dependency resolution.
- Refactored graph node logic and types for clarity and maintainability.
- Enhanced vectorstore embedding logic and improved test mocking for OpenAI embeddings.
- Updated and cleaned up test suite for preprocess/embed_documents.
- General codebase cleanup and modernization.
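
A minimal sketch of the normalization guard this commit adds (the helper name ensure_sdg_state is illustrative; app.py inlines the same check):

    from graph.types import SDGState

    def ensure_sdg_state(result) -> SDGState:
        # graph.invoke() can hand back a plain dict of state values rather
        # than an SDGState instance; coerce it so attribute access such as
        # result.answer never raises AttributeError.
        if isinstance(result, SDGState):
            return result
        return SDGState(**dict(result))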

Dockerfile CHANGED
@@ -4,22 +4,40 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 
-# Copy requirements and install dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Install UV
+RUN pip install uv
 
-# Copy the rest of the application
-COPY . .
+# Copy pyproject.toml and install dependencies
+COPY pyproject.toml .
+RUN uv venv && \
+    . .venv/bin/activate && \
+    uv pip install -e .
 
-# Install the package in development mode
-RUN pip install -e .
+# Copy the application code
+COPY preprocess/ preprocess/
+COPY graph/ graph/
+COPY app.py .
+
+# Create necessary directories
+RUN mkdir -p generated data
+
+# Copy data after creating directory
+COPY data/ data/
+
+# Create a shell script to run the application
+RUN echo '#!/bin/bash\n\
+source /app/.venv/bin/activate\n\
+exec /app/.venv/bin/streamlit run app.py --server.port=8501 --server.address=0.0.0.0' > /app/run.sh && \
+    chmod +x /app/run.sh
 
 # Expose the port Streamlit runs on
 EXPOSE 8501
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
-ENV ENVIRONMENT=production
+ENV ENVIRONMENT=development
+ENV LANGCHAIN_TRACING_V2=false
+ENV PATH="/app/.venv/bin:$PATH"
 
 # Command to run the application
-CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+CMD ["/app/run.sh"]
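
For reference, a typical build-and-run sequence for this image (the tag and API-key wiring are illustrative, not part of the commit):

    docker build -t sdg-langgraph .
    docker run -p 8501:8501 -e OPENAI_API_KEY="$OPENAI_API_KEY" sdg-langgraph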
app.py CHANGED
@@ -1,10 +1,15 @@
 import streamlit as st
 import json
+import logging
 from preprocess.html_to_documents import extract_documents_from_html
 from preprocess.embed_documents import create_or_load_vectorstore
 from graph.build_graph import build_sdg_graph
 from graph.types import SDGState
 
+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 # Page config
 st.set_page_config(
     page_title="SDG via LangGraph",
@@ -45,10 +50,20 @@ docs, vectorstore, graph = initialize_resources()
 if st.button("Generate Synthetic Data"):
     with st.spinner("Generating synthetic data..."):
         # Create initial state
-        initial_state = SDGState(input="Generate synthetic data about LLM evolution")
+        initial_state = SDGState(
+            input="Generate synthetic data about LLM evolution",
+            documents=[],
+            evolved_question="",
+            context=[],
+            answer=""
+        )
+        logger.debug(f"Initial state before invoke: {initial_state}")
 
-        # Invoke the graph
+        # Invoke the graph with the SDGState object
         result = graph.invoke(initial_state)
+        logger.debug(f"Graph result: {result}")
+        if not isinstance(result, SDGState):
+            result = SDGState(**dict(result))
 
         # Display results
         st.subheader("Generated Data")
@@ -57,21 +72,21 @@ if st.button("Generate Synthetic Data"):
         st.markdown("### Evolved Questions")
         evolved_questions = [
            {"id": f"q{i}", "question": q, "evolution_type": "simple"}
-           for i, q in enumerate([result["evolved_question"]])  # Currently only one question
+           for i, q in enumerate([result.evolved_question])  # Currently only one question
         ]
         st.json(evolved_questions)
 
         # Display answers
         st.markdown("### Answers")
         answers = [
-           {"id": "q0", "answer": result["answer"]}
+           {"id": "q0", "answer": result.answer}
         ]
         st.json(answers)
 
         # Display contexts
         st.markdown("### Contexts")
         contexts = [
-           {"id": "q0", "contexts": result["context"]}
+           {"id": "q0", "contexts": result.context}
         ]
         st.json(contexts)
 
graph/build_graph.py CHANGED
@@ -6,16 +6,22 @@ from graph.nodes.answer import generate_answer
 
 
 def build_sdg_graph(docs, vectorstore) -> StateGraph:
+    # Create a new graph with our state type
     builder = StateGraph(SDGState)
 
-    # Add nodes
+    # Add nodes with explicit state handling
     builder.add_node("evolve", evolve_question)
     builder.add_node("retrieve", lambda state: retrieve_relevant_context(state, vectorstore))
     builder.add_node("generate_answer", generate_answer)
 
-    # Define flow
+    # Define the flow
     builder.set_entry_point("evolve")
     builder.add_edge("evolve", "retrieve")
     builder.add_edge("retrieve", "generate_answer")
+    builder.set_finish_point("generate_answer")
 
-    return builder.compile()
+    # Compile the graph
+    graph = builder.compile()
+
+    # Return the graph
+    return graph
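
set_finish_point("generate_answer") is LangGraph's shorthand for wiring the node to the END sentinel; the equivalent explicit spelling, shown for orientation:

    from langgraph.graph import END

    # Same effect as builder.set_finish_point("generate_answer"):
    builder.add_edge("generate_answer", END)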
graph/nodes/answer.py CHANGED
@@ -1,10 +1,26 @@
 from graph.types import SDGState
+import logging
+
+logger = logging.getLogger(__name__)
 
 def generate_answer(state: SDGState) -> SDGState:
     """
     Synthesizes an answer from the retrieved context.
     This is a placeholder and would normally call an LLM in production.
     """
+    logger.debug(f"Answer node received state: {state}")
+
+    # Generate the answer
     context_snippet = "\n".join(state.context)
-    state.answer = f"Based on the retrieved context:\n{context_snippet}"
-    return state
+
+    # Create a new state with the generated answer
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=state.evolved_question,
+        context=state.context,
+        answer=f"Based on the retrieved context:\n{context_snippet}"
+    )
+
+    logger.debug(f"Answer node returning state: {new_state}")
+    return new_state
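
Rebuilding every field by hand is explicit but verbose; Pydantic's copy-with-update expresses the same immutable update in one call (a sketch; model_copy is the Pydantic v2 spelling, .copy(update=...) on v1):

    def generate_answer(state: SDGState) -> SDGState:
        context_snippet = "\n".join(state.context)
        # Copies state, replacing only the answer field.
        return state.model_copy(update={"answer": f"Based on the retrieved context:\n{context_snippet}"})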
graph/nodes/evolve.py CHANGED
@@ -1,8 +1,20 @@
 from graph.prompts import question_evolution_prompt
 from graph.types import SDGState
+import logging
 
+logger = logging.getLogger(__name__)
 
 def evolve_question(state: SDGState) -> SDGState:
-    # Placeholder for LLM-driven evolution
-    state.evolved_question = f"Evolved version of: {state.input}"
-    return state
+    logger.debug(f"Evolve node received state: {state}")
+
+    # Create a new state with the evolved question
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=f"Evolved version of: {state.input}",
+        context=state.context,
+        answer=state.answer
+    )
+
+    logger.debug(f"Evolve node returning state: {new_state}")
+    return new_state
graph/nodes/retrieve.py CHANGED
@@ -1,6 +1,23 @@
 from graph.types import SDGState
+import logging
+
+logger = logging.getLogger(__name__)
 
 def retrieve_relevant_context(state: SDGState, vectorstore) -> SDGState:
+    logger.debug(f"Retrieve node received state: {state}")
+
+    # Perform retrieval
     retrieved_docs = vectorstore.similarity_search(state.evolved_question, k=5)
+    logger.debug(f"Retrieved {len(retrieved_docs)} documents")
+
+    # Create a new state with the retrieved context
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=state.evolved_question,
+        context=[doc.page_content for doc in retrieved_docs],
+        answer=state.answer
+    )
+
+    logger.debug(f"Retrieve node returning state: {new_state}")
+    return new_state
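
LangGraph nodes may also return a partial dict of just the keys they change, which the framework merges into the state; a leaner equivalent sketch of this node under that convention:

    def retrieve_relevant_context(state: SDGState, vectorstore) -> dict:
        retrieved_docs = vectorstore.similarity_search(state.evolved_question, k=5)
        # LangGraph merges this single-key update into the SDGState.
        return {"context": [doc.page_content for doc in retrieved_docs]}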
graph/types.py CHANGED
@@ -1,10 +1,10 @@
 from typing import List
 from langchain.schema import Document
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 class SDGState(BaseModel):
-    input: str
-    documents: List[Document] = []
-    evolved_question: str = ""
-    context: List[str] = []
-    answer: str = ""
+    input: str = Field(default="")
+    documents: List[Document] = Field(default_factory=list)
+    evolved_question: str = Field(default="")
+    context: List[str] = Field(default_factory=list)
+    answer: str = Field(default="")
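
With every field now defaulted, partial construction no longer trips validation; a quick check of the new behavior:

    from graph.types import SDGState

    # Both construct cleanly; SDGState() previously failed on the
    # required `input` field.
    empty = SDGState()
    partial = SDGState(input="How did LLMs evolve in 2023?")
    assert empty.context == [] and partial.answer == ""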
main.py CHANGED
@@ -72,7 +72,17 @@ def main():
 
         graph = build_sdg_graph(docs, vectorstore)
         initial_state = SDGState(input="How did LLMs evolve in 2023?")
-        result = graph.invoke(initial_state)
+
+        # Convert the state to a dictionary with the expected keys
+        state_dict = {
+            "input": initial_state.input,
+            "documents": initial_state.documents,
+            "evolved_question": initial_state.evolved_question,
+            "context": initial_state.context,
+            "answer": initial_state.answer
+        }
+
+        result = graph.invoke(state_dict)
         print("🧠 Agent Output:")
         print(json.dumps(result, indent=2, ensure_ascii=False, cls=DocumentEncoder))
     else:
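
The hand-built state_dict matches what iterating the Pydantic model yields, so a shallow one-liner would do the same (dict() over a BaseModel returns raw field values without nested conversion, the mirror image of the SDGState(**dict(result)) guard in app.py):

    # Equivalent to the hand-built state_dict above.
    result = graph.invoke(dict(initial_state))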
preprocess/embed_documents.py CHANGED
@@ -10,6 +10,8 @@ import os
 
 def create_or_load_vectorstore(docs: list[Document], path: str = "generated/vectorstore") -> FAISS:
     path = Path(path)
+
+    # Initialize embeddings with minimal configuration
     embeddings = OpenAIEmbeddings(
         model="text-embedding-3-small",
         openai_api_key=os.getenv("OPENAI_API_KEY")
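
The hunk only shows the top of the function; for orientation, a minimal sketch of the create-or-load caching pattern the name implies (FAISS.load_local / from_documents / save_local are standard LangChain calls, but this body is illustrative, not the repo's code):

    from pathlib import Path
    from langchain_community.vectorstores import FAISS

    def create_or_load_sketch(docs, embeddings, path: str) -> FAISS:
        path = Path(path)
        if path.exists():
            # Reuse the cached index instead of re-embedding everything.
            return FAISS.load_local(str(path), embeddings,
                                    allow_dangerous_deserialization=True)
        vectorstore = FAISS.from_documents(docs, embeddings)
        vectorstore.save_local(str(path))
        return vectorstore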
pyproject.toml CHANGED
@@ -15,7 +15,8 @@ dependencies = [
     "openai",
     "tiktoken",
     "langchain-openai",
-    "faiss-cpu"
+    "faiss-cpu",
+    "streamlit"
 ]
 
 [project.optional-dependencies]
tests/preprocess/test_embed_documents.py CHANGED
@@ -25,11 +25,15 @@ def test_create_vectorstore_when_not_cached(mock_open_file, mock_exists, mock_em
 @patch("preprocess.embed_documents.FAISS.load_local")
 @patch("preprocess.embed_documents.Path.exists", return_value=True)
 @patch("preprocess.embed_documents.open", new_callable=mock_open)
-def test_load_existing_vectorstore(mock_open_file, mock_exists, mock_load_local):
+@patch("preprocess.embed_documents.OpenAIEmbeddings")
+def test_load_existing_vectorstore(mock_embeddings, mock_open_file, mock_exists, mock_load_local):
     mock_vectorstore = MagicMock()
     mock_load_local.return_value = mock_vectorstore
-
+    mock_embeddings_instance = MagicMock()
+    mock_embeddings.return_value = mock_embeddings_instance
+
     result = create_or_load_vectorstore([], path="tests/tmp/vectorstore.pkl")
 
     assert result == mock_vectorstore
-    mock_load_local.assert_called_once()
+    mock_load_local.assert_called_once()
+    mock_embeddings.assert_called_once()
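
One detail in the new signature that is easy to trip over: stacked @patch decorators apply bottom-up, so mock arguments arrive in reverse decorator order, which is why the newly added OpenAIEmbeddings patch (closest to the function) binds to the first parameter. A reduced illustration:

    from unittest.mock import patch

    @patch("os.getcwd")   # outermost decorator -> last mock argument
    @patch("os.getenv")   # innermost decorator -> first mock argument
    def check(mock_getenv, mock_getcwd):
        # Parameters mirror the decorator stack bottom-up.
        assert mock_getenv is not mock_getcwd

    check()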