timeki committed
Commit fc994dc · 1 Parent(s): be494ba
app.py CHANGED
@@ -30,9 +30,17 @@ from climateqa.event_handler import (
     init_audience,
     handle_retrieved_documents,
     stream_answer,
-    handle_retrieved_owid_graphs
+    handle_retrieved_owid_graphs,
+    convert_to_docs_to_html
 )
 from utils import create_user_id
+from front.utils import make_html_source
+import logging
+
+logging.basicConfig(level=logging.WARNING)
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppresses INFO and WARNING logs
+logging.getLogger().setLevel(logging.WARNING)
+

 # Load environment variables in local mode
 try:
@@ -41,6 +49,7 @@ try:
 except Exception as e:
     pass

+
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
@@ -108,7 +117,7 @@ vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
 reranker = get_reranker("nano")

-agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker)
+agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0)#TODO put back default 0.2

 # Function to update modal visibility
 def update_config_modal_visibility(config_open):
@@ -170,6 +179,7 @@ async def chat(
     docs = []
     related_contents = []
     docs_html = ""
+    new_docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
@@ -183,20 +193,26 @@ async def chat(
         "categorize_intent": ("🔄️ Analyzing user message", True),
         "transform_query": ("🔄️ Thinking step by step to answer the question", True),
         "retrieve_documents": ("🔄️ Searching in the knowledge base", False),
+        "retrieve_local_data": ("🔄️ Searching in the knowledge base", False),
     }

     try:
         # Process streaming events
         async for event in result:
+
             if "langgraph_node" in event["metadata"]:
                 node = event["metadata"]["langgraph_node"]

                 # Handle document retrieval
-                if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" and event["data"]["output"] != None:
-                    docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(
+                if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
+                    history, used_documents = handle_retrieved_documents(
                         event, history, used_documents
                     )
-
+                if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
+                    docs = event["data"]["input"]["documents"]
+                    docs_html = convert_to_docs_to_html(docs)
+                    related_contents = event["data"]["input"]["related_contents"]
+
                 # Handle intent categorization
                 elif (event["event"] == "on_chain_end" and
                     node == "categorize_intent" and
@@ -231,7 +247,7 @@ async def chat(
                 # Handle query transformation
                 if event["name"] == "transform_query" and event["event"] == "on_chain_end":
                     if hasattr(history[-1], "content"):
-                        sub_questions = [q["question"] for q in event["data"]["output"]["remaining_questions"]]
+                        sub_questions = [q["question"] for q in event["data"]["output"]["questions_list"]]
                         history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)

             yield history, docs_html, output_query, output_language, related_contents, graphs_html
@@ -493,9 +509,9 @@ def create_config_modal(config_open):
                 )

                 dropdown_external_sources = gr.CheckboxGroup(
-                    choices=["Figures (IPCC/IPBES)", "Papers (OpenAlex)", "Graphs (OurWorldInData)"],
+                    choices=["Figures (IPCC/IPBES)", "Papers (OpenAlex)", "Graphs (OurWorldInData)","POC region"],
                     label="Select database to search for relevant content",
-                    value=["Figures (IPCC/IPBES)"],
+                    value=["Figures (IPCC/IPBES)","POC region"],
                     interactive=True
                 )

@@ -565,6 +581,8 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
     chat_completed_state = gr.State(0)
     current_graphs = gr.State([])
     saved_graphs = gr.State({})
+    new_sources_hmtl = gr.State([])
+
     config_open = gr.State(False)

     with gr.Tab("ClimateQ&A"):
@@ -584,6 +602,7 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
         with gr.Tab("Sources", elem_id="tab-sources", id=1) as tab_sources:
             sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")

+
         # Recommended content tab
         with gr.Tab("Recommended content", elem_id="tab-recommended_content", id=2) as tab_recommended_content:
             with gr.Tabs(elem_id="group-subtabs") as tabs_recommended_content:
@@ -641,7 +660,7 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t

     (textbox
         .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name="start_chat_textbox")
-        .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, sources_textbox, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name="chat_textbox")
+        .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name="chat_textbox")
         .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
     )

@@ -649,10 +668,16 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t

     (examples_hidden
         .change(start_chat, [examples_hidden, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name="start_chat_examples")
-        .then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, sources_textbox, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name="chat_textbox")
+        .then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name="chat_textbox")
         .then(finish_chat, None, [textbox], api_name="finish_chat_examples")
     )
+    def process_new_docs_html(new_docs, docs):
+        if new_docs:
+            return docs + new_docs
+        return docs
+        # return docs + new_docs

+    new_sources_hmtl.change(lambda x : x, inputs = [new_sources_hmtl], outputs = [sources_textbox])
     new_figures.change(process_figures, inputs=[sources_raw, new_figures], outputs=[sources_raw, figures_cards, gallery_component])

     # Update sources numbers
climateqa/engine/chains/answer_rag.py CHANGED
@@ -65,6 +65,7 @@ def make_rag_node(llm,with_docs = True):
     async def answer_rag(state,config):
         print("---- Answer RAG ----")
         start_time = time.time()
+        print("Sources used : " + "\n".join([x.metadata["short_name"] + " - page " + str(x.metadata["page_number"]) for x in state["documents"]]))

         answer = await rag_chain.ainvoke(state,config)

climateqa/engine/chains/graph_retriever.py CHANGED
@@ -50,7 +50,9 @@ def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_
         print("---- Retrieving graphs ----")

         POSSIBLE_SOURCES = ["IEA", "OWID"]
-        questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[] else [state["query"]]
+        # questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[] else [state["query"]]
+        questions = state["questions_list"] if state["questions_list"] is not None and state["questions_list"]!=[] else [state["query"]]
+
         # sources_input = state["sources_input"]
         sources_input = ["auto"]

climateqa/engine/chains/prompts.py CHANGED
@@ -37,7 +37,7 @@ You are given a question and extracted passages of the IPCC and/or IPBES reports


 answer_prompt_template = """
-You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
+You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.

 Guidelines:
 - If the passages have useful facts or numbers, use them in your answer.
climateqa/engine/chains/query_transformation.py CHANGED
@@ -7,6 +7,57 @@ from langchain.prompts import ChatPromptTemplate
 from langchain_core.utils.function_calling import convert_to_openai_function
 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

+# OLD QUERY ANALYSIS
+# keywords: List[str] = Field(
+#     description="""
+#     Extract the keywords from the user query to feed a search engine as a list
+#     Maximum 3 keywords
+
+#     Examples:
+#     - "What is the impact of deep sea mining ?" -> deep sea mining
+#     - "How will El Nino be impacted by climate change" -> el nino;climate change
+#     - "Is climate change a hoax" -> climate change;hoax
+#     """
+# )
+
+# alternative_queries: List[str] = Field(
+#     description="""
+#     Generate alternative search questions from the user query to feed a search engine
+#     """
+# )
+
+# step_back_question: str = Field(
+#     description="""
+#     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
+#     This questions should help you get more context and information about the user query
+#     """
+# )
+# - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
+#
+
+
+# topics: List[Literal[
+#     "Climate change",
+#     "Biodiversity",
+#     "Energy",
+#     "Decarbonization",
+#     "Climate science",
+#     "Nature",
+#     "Climate policy and justice",
+#     "Oceans",
+#     "Deep sea mining",
+#     "ESG and regulations",
+#     "CSRD",
+# ]] = Field(
+#     ...,
+#     description = """
+#     Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
+#     """,
+# )
+# date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+# location:Location
+
+

 ROUTING_INDEX = {
     "Vector":["IPCC","IPBES","IPOS", "AcclimaTerra"],
@@ -25,7 +76,7 @@ class QueryDecomposition(BaseModel):

     questions: List[str] = Field(
         description="""
-        Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
+        Think step by step to answer this question, and provide one or several search engine questions in the provided language for knowledge that you need.
         Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
         - If it's already a standalone and explicit question, just return the reformulated question for the search engine
         - If you need to decompose the question, output a list of maximum 2 to 3 questions
@@ -39,36 +90,14 @@ class Location(BaseModel):

 class QueryAnalysis(BaseModel):
     """
+    Analyze the user query to extract the relevant sources
+
+    Deprecated:
     Analyzing the user query to extract topics, sources and date
     Also do query expansion to get alternative search queries
     Also provide simple keywords to feed a search engine
     """

-    # keywords: List[str] = Field(
-    #     description="""
-    #     Extract the keywords from the user query to feed a search engine as a list
-    #     Maximum 3 keywords
-
-    #     Examples:
-    #     - "What is the impact of deep sea mining ?" -> deep sea mining
-    #     - "How will El Nino be impacted by climate change" -> el nino;climate change
-    #     - "Is climate change a hoax" -> climate change;hoax
-    #     """
-    # )
-
-    # alternative_queries: List[str] = Field(
-    #     description="""
-    #     Generate alternative search questions from the user query to feed a search engine
-    #     """
-    # )
-
-    # step_back_question: str = Field(
-    #     description="""
-    #     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
-    #     This questions should help you get more context and information about the user query
-    #     """
-    # )
-
     sources: List[Literal["IPCC", "IPBES", "IPOS", "AcclimaTerra"]] = Field( #,"OpenAlex"]] = Field(
         ...,
         description="""
@@ -78,31 +107,19 @@ class QueryAnalysis(BaseModel):
         - IPOS is for questions about the ocean and deep sea mining
         - AcclimaTerra is for questions about any specific place in, or close to, the french region "Nouvelle-Aquitaine"
         """,
-        # - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
     )
-    # topics: List[Literal[
-    #     "Climate change",
-    #     "Biodiversity",
-    #     "Energy",
-    #     "Decarbonization",
-    #     "Climate science",
-    #     "Nature",
-    #     "Climate policy and justice",
-    #     "Oceans",
-    #     "Deep sea mining",
-    #     "ESG and regulations",
-    #     "CSRD",
-    # ]] = Field(
-    #     ...,
-    #     description = """
-    #     Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
-    #     """,
-    # )
-    # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
-    # location:Location
+


 def make_query_decomposition_chain(llm):
+    """Chain to decompose a query into smaller parts to think step by step to answer this question
+
+    Args:
+        llm (_type_): _description_
+
+    Returns:
+        _type_: _description_
+    """

     openai_functions = [convert_to_openai_function(QueryDecomposition)]
     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
@@ -116,7 +133,8 @@ def make_query_decomposition_chain(llm):
     return chain


-def make_query_rewriter_chain(llm):
+def make_query_analysis_chain(llm):
+    """Analyze the user query to extract the relevant sources"""

     openai_functions = [convert_to_openai_function(QueryAnalysis)]
     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
@@ -153,7 +171,7 @@ def make_query_transform_node(llm,k_final=15):


     decomposition_chain = make_query_decomposition_chain(llm)
-    rewriter_chain = make_query_rewriter_chain(llm)
+    query_analysis_chain = make_query_analysis_chain(llm)

     def transform_query(state):
         print("---- Transform query ----")
@@ -172,14 +190,14 @@ def make_query_transform_node(llm,k_final=15):
         questions = []
         for question in new_state["questions"]:
             question_state = {"question":question}
-            analysis_output = rewriter_chain.invoke({"input":question})
+            query_analysis_output = query_analysis_chain.invoke({"input":question})

             # TODO WARNING llm should always return smthg
-            # The case when the llm does not return any sources
-            if not analysis_output["sources"] or not all(source in ["IPCC", "IPBS", "IPOS"] for source in analysis_output["sources"]):
-                analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]
+            # The case when the llm does not return any sources or wrong ouput
+            if not query_analysis_output["sources"] or not all(source in ["IPCC", "IPBS", "IPOS"] for source in query_analysis_output["sources"]):
+                query_analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]

-            question_state.update(analysis_output)
+            question_state.update(query_analysis_output)
             questions.append(question_state)

         # Explode the questions into multiple questions with different sources
@@ -206,8 +224,9 @@ def make_query_transform_node(llm,k_final=15):


         new_state = {
-            "remaining_questions":new_questions,
+            "questions_list":new_questions,
             "n_questions":len(new_questions),
+            "handled_questions_index":[],
         }
         return new_state

climateqa/engine/chains/retrieve_documents.py CHANGED
@@ -106,6 +106,17 @@ def _add_metadata_and_score(docs: List) -> Document:
         docs_with_metadata.append(doc)
     return docs_with_metadata

+def remove_duplicates_chunks(docs):
+    # Remove duplicates or almost duplicates
+    docs = sorted(docs,key=lambda x: x[1],reverse=True)
+    seen = set()
+    result = []
+    for doc in docs:
+        if doc[0].page_content not in seen:
+            seen.add(doc[0].page_content)
+            result.append(doc)
+    return result
+
 async def get_POC_relevant_documents(
     query: str,
     vectorstore:VectorStore,
@@ -116,14 +127,18 @@ async def get_POC_relevant_documents(
     threshold:float = 0.6,
     k_images: int = 5,
     reports:list = [],
+    min_size:int = 200,
 ) :
     # Prepare base search kwargs
     filters = {}
+    docs_question = []
+    docs_images = []

-    if len(reports) > 0:
-        filters["short_name"] = {"$in":reports}
-    else:
-        filters["source"] = { "$in": sources}
+    # TODO add source selection
+    # if len(reports) > 0:
+    #     filters["short_name"] = {"$in":reports}
+    # else:
+    #     filters["source"] = { "$in": sources}

     filters_text = {
         **filters,
@@ -132,6 +147,8 @@ async def get_POC_relevant_documents(
     }

     docs_question = vectorstore.similarity_search_with_score(query=query,filter = filters_text,k = k_documents)
+    # remove duplicates or almost duplicates
+    docs_question = remove_duplicates_chunks(docs_question)
     docs_question = [x for x in docs_question if x[1] > threshold]

     if search_figures:
@@ -141,6 +158,10 @@ async def get_POC_relevant_documents(
             "chunk_type":"image"
         }
         docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
+
+    docs_question, docs_images = _add_metadata_and_score(docs_question), _add_metadata_and_score(docs_images)
+
+    docs_question = [x for x in docs_question if len(x.page_content) > min_size]

     return {
         "docs_question" : docs_question,
@@ -236,12 +257,13 @@

 def concatenate_documents(index, source_type, docs_question_dict, k_by_question, k_summary_by_question, k_images_by_question):
     # Keep the right number of documents - The k_summary documents from SPM are placed in front
-    if source_type == "Vector" :
-        docs_question = docs_question_dict["docs_summaries"][:k_summary_by_question] + docs_question_dict["docs_full"][:k_by_question - k_summary_by_question]
+    if source_type == "IPx":
+        docs_question = docs_question_dict["docs_summaries"][:k_summary_by_question] + docs_question_dict["docs_full"][:(k_by_question - k_summary_by_question)]
     elif source_type == "POC" :
         docs_question = docs_question_dict["docs_question"][:k_by_question]
     else :
-        docs_question = [doc for key in docs_question_dict.keys() for doc in docs_question_dict[key]]
+        raise ValueError("source_type should be either Vector or POC")
+        # docs_question = [doc for key in docs_question_dict.keys() for doc in docs_question_dict[key]][:(k_by_question)]

     images_question = docs_question_dict["docs_images"][:k_images_by_question]

@@ -278,8 +300,18 @@ async def retrieve_documents(state,config, source_type, vectorstore,reranker,llm
     reports = state["reports"]

     # Get the current question
-    current_question = state["remaining_questions"][0]
-    remaining_questions = state["remaining_questions"][1:]
+    # current_question = state["questions_list"][0]
+    # remaining_questions = state["remaining_questions"][1:]
+
+    current_question_id = None
+    print("Here", range(len(state["questions_list"])),state["handled_questions_index"])
+
+    for i in range(len(state["questions_list"])):
+        if i not in state["handled_questions_index"]:
+            current_question_id = i
+            break
+    current_question = state["questions_list"][current_question_id]
+    # TODO filter on source_type

     k_by_question = k_final // state["n_questions"]
     k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
@@ -318,6 +350,9 @@ async def retrieve_documents(state,config, source_type, vectorstore,reranker,llm
                 threshold = 0.5,
                 search_only = search_only,
                 reports = reports,
+                min_size= 200,
+                k_documents= k_before_reranking,
+                k_images= k_by_question
             )


@@ -343,9 +378,12 @@ async def retrieve_documents(state,config, source_type, vectorstore,reranker,llm
         images_question = _add_sources_used_in_metadata(images_question,sources,question,index)

         # Add to the list of docs
-        docs.extend(docs_question)
-        related_content.extend(images_question)
-        new_state = {"documents":docs, "related_contents": related_content,"remaining_questions":remaining_questions}
+        # docs.extend(docs_question)
+        # related_content.extend(images_question)
+        docs = docs_question
+        related_content = images_question
+        new_state = {"documents":docs, "related_contents": related_content, "handled_questions_index": [current_question_id]}
+        print("Updated state with question ", current_question_id, " added ", len(docs), " documents")
     return new_state


@@ -355,7 +393,20 @@ def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_
     @chain
     async def retrieve_IPx_docs(state, config):
         source_type = "IPx"
-        state = await retrieve_documents(state,config, source_type, vectorstore,reranker,llm,rerank_by_question, k_final, k_before_reranking, k_summary)
+        return {"documents":[], "related_contents": [], "handled_questions_index": list(range(len(state["questions_list"])))} # TODO Remove
+
+        state = await retrieve_documents(
+            state = state,
+            config= config,
+            source_type=source_type,
+            vectorstore=vectorstore,
+            reranker= reranker,
+            llm=llm,
+            rerank_by_question=rerank_by_question,
+            k_final=k_final,
+            k_before_reranking=k_before_reranking,
+            k_summary=k_summary
+        )
         return state

     return retrieve_IPx_docs
@@ -364,12 +415,23 @@ def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_
 def make_POC_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):

     @chain
-    async def retrieve_IPx_docs(state, config):
+    async def retrieve_POC_docs_node(state, config):
         source_type = "POC"
-        state = await retrieve_documents(state,config, source_type, vectorstore,reranker,llm,rerank_by_question, k_final, k_before_reranking, k_summary)
+        state = await retrieve_documents(
+            state = state,
+            config= config,
+            source_type=source_type,
+            vectorstore=vectorstore,
+            reranker= reranker,
+            llm=llm,
+            rerank_by_question=rerank_by_question,
+            k_final=k_final,
+            k_before_reranking=k_before_reranking,
+            k_summary=k_summary
+        )
         return state

-    return retrieve_IPx_docs
+    return retrieve_POC_docs_node


climateqa/engine/graph.py CHANGED
@@ -9,6 +9,9 @@ from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
 from typing_extensions import TypedDict
 from typing import List, Dict

+import operator
+from typing import Annotated
+
 from IPython.display import display, HTML, Image

 from .chains.answer_chitchat import make_chitchat_node
@@ -31,7 +34,8 @@ class GraphState(TypedDict):
     intent : str
     search_graphs_chitchat : bool
     query: str
-    remaining_questions : List[dict]
+    questions_list : List[dict]
+    handled_questions_index : Annotated[list[int], operator.add]
     n_questions : int
     answer: str
     audience: str = "experts"
@@ -40,20 +44,20 @@ class GraphState(TypedDict):
     sources_auto: bool = True
     min_year: int = 1960
     max_year: int = None
-    documents: List[Document]
-    related_contents : List[Document]
+    documents: Annotated[List[Document], operator.add]
+    related_contents : Annotated[List[Document], operator.add]
     recommended_content : List[Document]
     search_only : bool = False
     reports : List[str] = []

 def dummy(state):
-    return state
+    return

 def search(state): #TODO
-    return state
+    return

 def answer_search(state):#TODO
-    return state
+    return

 def route_intent(state):
     intent = state["intent"]
@@ -76,22 +80,40 @@ def route_translation(state):
     if state["language"].lower() == "english":
         return "transform_query"
     else:
-        return "translate_query"
+        return "transform_query"
+        # return "translate_query" #TODO : add translation
+

 def route_based_on_relevant_docs(state,threshold_docs=0.2):
     docs = [x for x in state["documents"] if x.metadata["reranking_score"] > threshold_docs]
+    print("Route : ", ["answer_rag" if len(docs) > 0 else "answer_rag_no_docs"])
     if len(docs) > 0:
         return "answer_rag"
     else:
         return "answer_rag_no_docs"

 def route_continue_retrieve_documents(state):
-    if len(state["remaining_questions"]) == 0 and state["search_only"] :
+    if len(state["questions_list"]) == len(state["handled_questions_index"]) and state["search_only"] :
         return END
-    elif len(state["remaining_questions"]) > 0:
+    elif len(state["questions_list"]) == len(state["handled_questions_index"]):
+        return "answer_search"
+    else :
         return "retrieve_documents"
-    else:
+
+def route_continue_retrieve_local_documents(state):
+    if len(state["questions_list"]) == len(state["handled_questions_index"]) and state["search_only"] :
+        return END
+    elif len(state["questions_list"]) == len(state["handled_questions_index"]):
         return "answer_search"
+    else :
+        return "retrieve_local_data"
+
+    # if len(state["remaining_questions"]) == 0 and state["search_only"] :
+    #     return END
+    # elif len(state["remaining_questions"]) > 0:
+    #     return "retrieve_documents"
+    # else:
+    #     return "answer_search"

 def route_retrieve_documents(state):
     sources_to_retrieve = []
@@ -167,6 +189,12 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_regi
         route_continue_retrieve_documents,
         make_id_dict([END,"retrieve_documents","answer_search"])
     )
+    workflow.add_conditional_edges(
+        "retrieve_local_data",
+        # lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
+        route_continue_retrieve_local_documents,
+        make_id_dict([END,"retrieve_local_data","answer_search"])
+    )

     workflow.add_conditional_edges(
         "answer_search",
@@ -188,14 +216,15 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_regi

     # Define the edges
     workflow.add_edge("translate_query", "transform_query")
-    workflow.add_edge("transform_query", "retrieve_documents")
+    # workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
+    workflow.add_edge("transform_query", END) # TODO remove

     workflow.add_edge("retrieve_graphs", END)
     workflow.add_edge("answer_rag", END)
     workflow.add_edge("answer_rag_no_docs", END)
     workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
     workflow.add_edge("retrieve_graphs_chitchat", END)
-    workflow.add_edge("retrieve_local_data", "answer_search")
+    # workflow.add_edge("retrieve_local_data", "answer_search")

     # Compile
     app = workflow.compile()
climateqa/event_handler.py CHANGED
@@ -15,6 +15,13 @@ def init_audience(audience :str) -> str:
         audience_prompt = audience_prompts["experts"]
     return audience_prompt

+def convert_to_docs_to_html(docs: list[dict]) -> str:
+    docs_html = []
+    for i, d in enumerate(docs, 1):
+        if d.metadata["chunk_type"] == "text":
+            docs_html.append(make_html_source(d, i))
+    return "".join(docs_html)
+
 def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage], used_documents : list[str]) -> tuple[str, list[ChatMessage], list[str]]:
     """
     Handles the retrieved documents and returns the HTML representation of the documents
@@ -27,26 +34,22 @@ def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage],
     Returns:
         tuple[str, list[ChatMessage], list[str]]: The updated HTML representation of the documents, the updated message history and the updated list of used documents
     """
+    if "documents" not in event["data"]["output"] or event["data"]["output"]["documents"] == []:
+        return history, used_documents, []
+
     try:
-        docs = event["data"]["output"]["documents"]
-        docs_html = []
-        textual_docs = [d for d in docs if d.metadata["chunk_type"] == "text"]
-        for i, d in enumerate(textual_docs, 1):
-            if d.metadata["chunk_type"] == "text":
-                docs_html.append(make_html_source(d, i))
+        docs = event["data"]["output"]["documents"]

         used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
         if used_documents!=[]:
             history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
-
-        docs_html = "".join(docs_html)
+
+        #TODO do the same for related contents

-        related_contents = event["data"]["output"]["related_contents"]
-
     except Exception as e:
         print(f"Error getting documents: {e}")
         print(event)
-    return docs, docs_html, history, used_documents, related_contents
+    return history, used_documents

 def stream_answer(history: list[ChatMessage], event : StreamEvent, start_streaming : bool, answer_message_content : str)-> tuple[list[ChatMessage], bool, str]:
     """
front/utils.py CHANGED
@@ -39,7 +39,11 @@ def parse_output_llm_with_sources(output:str)->str:
     content_parts = "".join(parts)
     return content_parts

+
+
 def process_figures(docs:list, new_figures:list)->tuple:
+    if new_figures == []:
+        return docs, "", []
     docs = docs + new_figures

     figures = '<div class="figures-container"><p></p> </div>'