Merge remote-tracking branch 'origin/main'
Files changed:
- Dockerfile +27 -9
- app.py +20 -5
- graph/build_graph.py +9 -3
- graph/nodes/answer.py +18 -2
- graph/nodes/evolve.py +15 -3
- graph/nodes/retrieve.py +19 -2
- graph/types.py +6 -6
- main.py +1 -0
- preprocess/embed_documents.py +7 -1
- pyproject.toml +2 -1
- requirements.txt +0 -11
- tests/preprocess/test_embed_documents.py +7 -3
Dockerfile
CHANGED
@@ -4,22 +4,40 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 
-#
-
-RUN pip install --no-cache-dir -r requirements.txt
+# Install UV
+RUN pip install uv
 
-# Copy
-COPY . .
+# Copy pyproject.toml and install dependencies
+COPY pyproject.toml .
+RUN uv venv && \
+    . .venv/bin/activate && \
+    uv pip install -e .
 
-#
-
+# Copy the application code
+COPY preprocess/ preprocess/
+COPY graph/ graph/
+COPY app.py .
+
+# Create necessary directories
+RUN mkdir -p generated data
+
+# Copy data after creating directory
+COPY data/ data/
+
+# Create a shell script to run the application
+RUN echo '#!/bin/bash\n\
+source /app/.venv/bin/activate\n\
+exec /app/.venv/bin/streamlit run app.py --server.port=8501 --server.address=0.0.0.0' > /app/run.sh && \
+    chmod +x /app/run.sh
 
 # Expose the port Streamlit runs on
 EXPOSE 8501
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
-ENV ENVIRONMENT=
+ENV ENVIRONMENT=development
+ENV LANGCHAIN_TRACING_V2=false
+ENV PATH="/app/.venv/bin:$PATH"
 
 # Command to run the application
+CMD ["/app/run.sh"]
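A note on the new install flow: `ENV PATH="/app/.venv/bin:$PATH"` already puts the venv's executables first on PATH, so the explicit `source /app/.venv/bin/activate` inside run.sh is belt-and-braces rather than strictly required. The `RUN echo '...\n...'` trick also relies on the base image's `/bin/sh` (dash on Debian slim) interpreting `\n` escapes; `printf`, or COPYing a script file into the image, would be the more portable way to produce run.sh.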
app.py
CHANGED
@@ -1,10 +1,15 @@
 import streamlit as st
 import json
+import logging
 from preprocess.html_to_documents import extract_documents_from_html
 from preprocess.embed_documents import create_or_load_vectorstore
 from graph.build_graph import build_sdg_graph
 from graph.types import SDGState
 
+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 # Page config
 st.set_page_config(
     page_title="SDG via LangGraph",
@@ -45,10 +50,20 @@ docs, vectorstore, graph = initialize_resources()
 if st.button("Generate Synthetic Data"):
     with st.spinner("Generating synthetic data..."):
         # Create initial state
-        initial_state = SDGState(
+        initial_state = SDGState(
+            input="Generate synthetic data about LLM evolution",
+            documents=[],
+            evolved_question="",
+            context=[],
+            answer=""
+        )
+        logger.debug(f"Initial state before invoke: {initial_state}")
 
-        # Invoke the graph
+        # Invoke the graph with the SDGState object
         result = graph.invoke(initial_state)
+        logger.debug(f"Graph result: {result}")
+        if not isinstance(result, SDGState):
+            result = SDGState(**dict(result))
 
         # Display results
         st.subheader("Generated Data")
@@ -57,21 +72,21 @@ if st.button("Generate Synthetic Data"):
         st.markdown("### Evolved Questions")
         evolved_questions = [
             {"id": f"q{i}", "question": q, "evolution_type": "simple"}
-            for i, q in enumerate([result
+            for i, q in enumerate([result.evolved_question])  # Currently only one question
         ]
         st.json(evolved_questions)
 
         # Display answers
         st.markdown("### Answers")
         answers = [
-            {"id": "q0", "answer": result
+            {"id": "q0", "answer": result.answer}
         ]
         st.json(answers)
 
         # Display contexts
         st.markdown("### Contexts")
         contexts = [
-            {"id": "q0", "contexts": result
+            {"id": "q0", "contexts": result.context}
         ]
         st.json(contexts)
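The new `isinstance` guard reflects how LangGraph behaves at runtime: a compiled graph usually returns a plain dict of state values rather than the Pydantic model, so the app re-wraps it before using attribute access. A minimal sketch of that round-trip, assuming the repo's modules are importable:

from graph.types import SDGState

# graph.invoke(...) typically yields a mapping of state fields, e.g.:
raw = {"input": "q", "documents": [], "evolved_question": "", "context": [], "answer": "done"}

# Re-wrap so the display code above can use result.answer / result.context:
state = SDGState(**raw)
assert state.answer == "done"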
graph/build_graph.py
CHANGED
@@ -6,16 +6,22 @@ from graph.nodes.answer import generate_answer
 
 
 def build_sdg_graph(docs, vectorstore) -> StateGraph:
+    # Create a new graph with our state type
     builder = StateGraph(SDGState)
 
-    # Add nodes
+    # Add nodes with explicit state handling
    builder.add_node("evolve", evolve_question)
     builder.add_node("retrieve", lambda state: retrieve_relevant_context(state, vectorstore))
     builder.add_node("generate_answer", generate_answer)
 
-    # Define flow
+    # Define the flow
     builder.set_entry_point("evolve")
     builder.add_edge("evolve", "retrieve")
     builder.add_edge("retrieve", "generate_answer")
+    builder.set_finish_point("generate_answer")
 
-
+    # Compile the graph
+    graph = builder.compile()
+
+    # Return the graph
+    return graph
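`set_finish_point("generate_answer")` is what lets `compile()` produce a terminating graph: LangGraph expects every node to lead somewhere, and this marks the node as terminal. A sketch of the equivalent wiring using LangGraph's `END` sentinel (same effect, with a stand-in node so the snippet compiles on its own):

from langgraph.graph import StateGraph, END
from graph.types import SDGState

builder = StateGraph(SDGState)
builder.add_node("generate_answer", lambda state: state)  # stand-in node
builder.set_entry_point("generate_answer")
builder.add_edge("generate_answer", END)  # equivalent to set_finish_point("generate_answer")
graph = builder.compile()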
graph/nodes/answer.py
CHANGED
@@ -1,10 +1,26 @@
 from graph.types import SDGState
+import logging
+
+logger = logging.getLogger(__name__)
 
 def generate_answer(state: SDGState) -> SDGState:
     """
     Synthesizes an answer from the retrieved context.
     This is a placeholder and would normally call an LLM in production.
     """
+    logger.debug(f"Answer node received state: {state}")
+
+    # Generate the answer
     context_snippet = "\n".join(state.context)
-
-
+
+    # Create a new state with the generated answer
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=state.evolved_question,
+        context=state.context,
+        answer=f"Based on the retrieved context:\n{context_snippet}"
+    )
+
+    logger.debug(f"Answer node returning state: {new_state}")
+    return new_state
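Each node here rebuilds the entire `SDGState`. LangGraph also accepts partial updates: a node may return a dict containing only the fields it changed, and the framework merges it into the state. A hedged alternative sketch of the same node in that style:

from graph.types import SDGState

def generate_answer(state: SDGState) -> dict:
    # Return only the changed field; LangGraph merges it into the state.
    context_snippet = "\n".join(state.context)
    return {"answer": f"Based on the retrieved context:\n{context_snippet}"}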
graph/nodes/evolve.py
CHANGED
@@ -1,8 +1,20 @@
 from graph.prompts import question_evolution_prompt
 from graph.types import SDGState
+import logging
 
+logger = logging.getLogger(__name__)
 
 def evolve_question(state: SDGState) -> SDGState:
-
-
-
+    logger.debug(f"Evolve node received state: {state}")
+
+    # Create a new state with the evolved question
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=f"Evolved version of: {state.input}",
+        context=state.context,
+        answer=state.answer
+    )
+
+    logger.debug(f"Evolve node returning state: {new_state}")
+    return new_state
graph/nodes/retrieve.py
CHANGED
@@ -1,6 +1,23 @@
 from graph.types import SDGState
+import logging
+
+logger = logging.getLogger(__name__)
 
 def retrieve_relevant_context(state: SDGState, vectorstore) -> SDGState:
+    logger.debug(f"Retrieve node received state: {state}")
+
+    # Perform retrieval
     retrieved_docs = vectorstore.similarity_search(state.evolved_question, k=5)
-
-
+    logger.debug(f"Retrieved {len(retrieved_docs)} documents")
+
+    # Create a new state with the retrieved context
+    new_state = SDGState(
+        input=state.input,
+        documents=state.documents,
+        evolved_question=state.evolved_question,
+        context=[doc.page_content for doc in retrieved_docs],
+        answer=state.answer
+    )
+
+    logger.debug(f"Retrieve node returning state: {new_state}")
+    return new_state
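For context, `similarity_search` on a LangChain FAISS store returns `Document` objects, which is why the node flattens them to `page_content` strings before storing them in `context`. A small sketch of the shapes involved (`store` is a stand-in for the vectorstore argument):

docs = store.similarity_search("How did LLMs evolve in 2023?", k=5)  # -> list[Document]
contexts = [d.page_content for d in docs]  # -> list[str], matching SDGState.context
# store.similarity_search_with_score(query, k=5) would add distance scores if needed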
graph/types.py
CHANGED
@@ -1,10 +1,10 @@
 from typing import List
 from langchain.schema import Document
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 class SDGState(BaseModel):
-    input: str
-    documents: List[Document] =
-    evolved_question: str = ""
-    context: List[str] =
-    answer: str = ""
+    input: str = Field(default="")
+    documents: List[Document] = Field(default_factory=list)
+    evolved_question: str = Field(default="")
+    context: List[str] = Field(default_factory=list)
+    answer: str = Field(default="")
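The move to `Field(default_factory=list)` gives every `SDGState` its own fresh list and makes all fields optional at construction time, which is what allows the nodes and the Streamlit app to build states piecewise. A quick demonstration of the per-instance default:

from typing import List
from pydantic import BaseModel, Field

class Demo(BaseModel):
    context: List[str] = Field(default_factory=list)

a, b = Demo(), Demo()
a.context.append("only in a")
assert b.context == []  # each instance received its own list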
main.py
CHANGED
@@ -72,6 +72,7 @@ def main():
 
     graph = build_sdg_graph(docs, vectorstore)
     initial_state = SDGState(input="How did LLMs evolve in 2023?")
+
     result = graph.invoke(initial_state)
     print("🧠 Agent Output:")
     print(json.dumps(result, indent=2, ensure_ascii=False, cls=DocumentEncoder))
preprocess/embed_documents.py
CHANGED
@@ -5,11 +5,17 @@ from langchain_openai import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
 import json
 import numpy as np
+import os
 
 
 def create_or_load_vectorstore(docs: list[Document], path: str = "generated/vectorstore") -> FAISS:
     path = Path(path)
-
+
+    # Initialize embeddings with minimal configuration
+    embeddings = OpenAIEmbeddings(
+        model="text-embedding-3-small",
+        openai_api_key=os.getenv("OPENAI_API_KEY")
+    )
 
     if path.exists():
         print("✅ Loading FAISS VectorStore from disk...")
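`OpenAIEmbeddings` resolves the key from the `OPENAI_API_KEY` environment variable on its own; passing `os.getenv("OPENAI_API_KEY")` explicitly, as above, mainly makes the dependency visible at the call site. Both spellings below should behave the same:

import os
from langchain_openai import OpenAIEmbeddings

implicit = OpenAIEmbeddings(model="text-embedding-3-small")  # reads OPENAI_API_KEY itself
explicit = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)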
pyproject.toml
CHANGED
@@ -15,7 +15,8 @@ dependencies = [
     "openai",
     "tiktoken",
     "langchain-openai",
-    "faiss-cpu"
+    "faiss-cpu",
+    "streamlit"
 ]
 
 [project.optional-dependencies]
requirements.txt
DELETED
@@ -1,11 +0,0 @@
-streamlit==1.32.0
-langchain==0.1.9
-langchain-community==0.0.24
-langchain-openai==0.0.8
-langgraph==0.0.26
-beautifulsoup4==4.12.3
-pydantic>=2.5.3,<3.0.0
-openai==1.12.0
-tiktoken==0.6.0
-faiss-cpu==1.10.0
-langsmith>=0.1.0,<0.2.0
tests/preprocess/test_embed_documents.py
CHANGED
@@ -25,11 +25,15 @@ def test_create_vectorstore_when_not_cached(mock_open_file, mock_exists, mock_em
 @patch("preprocess.embed_documents.FAISS.load_local")
 @patch("preprocess.embed_documents.Path.exists", return_value=True)
 @patch("preprocess.embed_documents.open", new_callable=mock_open)
-
+@patch("preprocess.embed_documents.OpenAIEmbeddings")
+def test_load_existing_vectorstore(mock_embeddings, mock_open_file, mock_exists, mock_load_local):
     mock_vectorstore = MagicMock()
     mock_load_local.return_value = mock_vectorstore
-
+    mock_embeddings_instance = MagicMock()
+    mock_embeddings.return_value = mock_embeddings_instance
+
     result = create_or_load_vectorstore([], path="tests/tmp/vectorstore.pkl")
 
     assert result == mock_vectorstore
-    mock_load_local.assert_called_once()
+    mock_load_local.assert_called_once()
+    mock_embeddings.assert_called_once()