timeki committed
Commit 6093b14 · 1 Parent(s): 3240e5c

Merge branch 'main' into feature/clean_code

app.py CHANGED
@@ -104,7 +104,7 @@ embeddings_function = get_embeddings_function()
 vectorstore = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX"))
 vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
 
-llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
+llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
 reranker = get_reranker("nano")
 
 agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
@@ -159,7 +159,8 @@ async def chat(
 "audience": audience_prompt,
 "sources_input": sources,
 "relevant_content_sources": relevant_content_sources,
-"search_only": search_only
+"search_only": search_only,
+"reports": reports
 }
 
 # Get streaming events from agent
@@ -193,7 +194,7 @@ async def chat(
 node = event["metadata"]["langgraph_node"]
 
 # Handle document retrieval
-if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents":
+if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" and event["data"]["output"] != None:
 docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(
 event, history, used_documents
 )
@@ -220,7 +221,7 @@ async def chat(
 # Handle answer streaming
 elif (event["name"] != "transform_query" and
 event["event"] == "on_chat_model_stream" and
-node in ["answer_rag", "answer_search", "answer_chitchat"]):
+node in ["answer_rag","answer_rag_no_docs", "answer_search", "answer_chitchat"]):
 history, start_streaming, answer_message_content = stream_answer(
 history, event, start_streaming, answer_message_content
 )
@@ -348,9 +349,9 @@ def change_sample_questions(key):
 def start_chat(query, history, search_only):
 history = history + [ChatMessage(role="user", content=query)]
 if not search_only:
-return (gr.update(interactive=False), gr.update(selected=1), history)
+return (gr.update(interactive=False), gr.update(selected=1), history, [])
 else:
-return (gr.update(interactive=False), gr.update(selected=2), history)
+return (gr.update(interactive=False), gr.update(selected=2), history, [])
 
 def finish_chat():
 return gr.update(interactive=True, value="")
@@ -378,7 +379,7 @@ def create_chat_interface():
 textbox = gr.Textbox(
 placeholder="Ask me anything here!",
 show_label=False,
-scale=7,
+scale=12,
 lines=1,
 interactive=True,
 elem_id="input-textbox"
@@ -417,6 +418,8 @@ def create_examples_tab():
 
 def create_figures_tab():
 sources_raw = gr.State()
+new_figures = gr.State([])
+used_figures = gr.State([])
 
 with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
 gallery_component = gr.Gallery(
@@ -438,7 +441,7 @@ def create_figures_tab():
 
 figures_cards = gr.HTML(show_label=False, elem_id="sources-figures")
 
-return sources_raw, gallery_component, figures_cards, figure_modal
+return sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal
 
 def create_papers_tab():
 with gr.Accordion(
@@ -492,9 +495,9 @@ def create_config_modal(config_open):
 )
 
 dropdown_external_sources = gr.CheckboxGroup(
-choices=["IPCC figures", "OpenAlex", "OurWorldInData"],
+choices=["Figures (IPCC/IPBES)", "Papers (OpenAlex)", "Graphs (OurWorldInData)"],
 label="Select database to search for relevant content",
-value=["IPCC figures"],
+value=["Figures (IPCC/IPBES)"],
 interactive=True
 )
 
@@ -543,7 +546,7 @@ def create_config_modal(config_open):
 )
 
 dropdown_external_sources.change(
-lambda x: gr.update(visible="OpenAlex" in x),
+lambda x: gr.update(visible="Papers (OpenAlex)" in x),
 inputs=[dropdown_external_sources],
 outputs=[after]
 )
@@ -588,7 +591,7 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
 with gr.Tabs(elem_id="group-subtabs") as tabs_recommended_content:
 # Figures subtab
 with gr.Tab("Figures", elem_id="tab-figures", id=3) as tab_figures:
-sources_raw, gallery_component, figures_cards, figure_modal = create_figures_tab()
+sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal = create_figures_tab()
 
 # Papers subtab
 with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
@@ -641,18 +644,20 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
 
 
 (textbox
-.submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
-.then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, sources_textbox, output_query, output_language, sources_raw, current_graphs], concurrency_limit=8, api_name="chat_textbox")
+.submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name="start_chat_textbox")
+.then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, sources_textbox, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name="chat_textbox")
 .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
 )
 
+
+
 (examples_hidden
-.change(start_chat, [examples_hidden, chatbot, search_only], [textbox, tabs, chatbot], queue=False, api_name="start_chat_examples")
-.then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, sources_textbox, output_query, output_language, sources_raw, current_graphs], concurrency_limit=8, api_name="chat_textbox")
+.change(start_chat, [examples_hidden, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name="start_chat_examples")
+.then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, sources_textbox, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name="chat_textbox")
 .then(finish_chat, None, [textbox], api_name="finish_chat_examples")
 )
 
-sources_raw.change(process_figures, inputs=[sources_raw], outputs=[figures_cards, gallery_component])
+new_figures.change(process_figures, inputs=[sources_raw, new_figures], outputs=[sources_raw, figures_cards, gallery_component])
 
 # Update sources numbers
 sources_textbox.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs, papers_html], [tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
climateqa/constants.py CHANGED
@@ -1,4 +1,6 @@
 POSSIBLE_REPORTS = [
+"IPBES IABWFH SPM",
+"IPBES CBL SPM",
 "IPCC AR6 WGI SPM",
 "IPCC AR6 WGI FR",
 "IPCC AR6 WGI TS",
climateqa/engine/chains/retrieve_documents.py CHANGED
@@ -87,7 +87,7 @@ def _get_k_images_by_question(n_questions):
 elif n_questions == 2:
 return 5
 elif n_questions == 3:
-return 2
+return 3
 else:
 return 1
 
@@ -98,7 +98,10 @@ def _add_metadata_and_score(docs: List) -> Document:
 doc.page_content = doc.page_content.replace("\r\n"," ")
 doc.metadata["similarity_score"] = score
 doc.metadata["content"] = doc.page_content
-doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+if doc.metadata["page_number"] != "N/A":
+doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+else:
+doc.metadata["page_number"] = 1
 # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
 docs_with_metadata.append(doc)
 return docs_with_metadata
@@ -216,14 +219,17 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
 docs = state["documents"]
 else:
 docs = []
+
 # Get the related_content from the state
 if "related_content" in state and state["related_content"] is not None:
 related_content = state["related_content"]
 else:
 related_content = []
 
-search_figures = "IPCC figures" in state["relevant_content_sources"]
+search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources"]
 search_only = state["search_only"]
+
+reports = state["reports"]
 
 # Get the current question
 current_question = state["remaining_questions"][0]
@@ -253,6 +259,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
 k_images = k_images_by_question,
 threshold = 0.5,
 search_only = search_only,
+reports = reports,
 )
 
 
climateqa/engine/chains/retrieve_papers.py CHANGED
@@ -33,7 +33,7 @@ def generate_keywords(query):
 
 
 async def find_papers(query,after, relevant_content_sources, reranker= reranker):
-if "OpenAlex" in relevant_content_sources:
+if "Papers (OpenAlex)" in relevant_content_sources:
 summary = ""
 keywords = generate_keywords(query)
 df_works = oa.search(keywords,after = after)
climateqa/engine/graph.py CHANGED
@@ -36,7 +36,7 @@ class GraphState(TypedDict):
 answer: str
 audience: str = "experts"
 sources_input: List[str] = ["IPCC","IPBES"]
-relevant_content_sources: List[str] = ["IPCC figures"]
+relevant_content_sources: List[str] = ["Figures (IPCC/IPBES)"]
 sources_auto: bool = True
 min_year: int = 1960
 max_year: int = None
@@ -44,6 +44,7 @@ class GraphState(TypedDict):
 related_contents : Dict[str,Document]
 recommended_content : List[Document]
 search_only : bool = False
+reports : List[str] = []
 
 def search(state): #TODO
 return state
@@ -82,7 +83,7 @@ def route_based_on_relevant_docs(state,threshold_docs=0.2):
 return "answer_rag_no_docs"
 
 def route_retrieve_documents(state):
-if state["search_only"] :
+if len(state["remaining_questions"]) == 0 and state["search_only"] :
 return END
 elif len(state["remaining_questions"]) > 0:
 return "retrieve_documents"
@@ -158,7 +159,7 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
 )
 workflow.add_conditional_edges(
 "transform_query",
-lambda state : "retrieve_graphs" if "OurWorldInData" in state["relevant_content_sources"] else END,
+lambda state : "retrieve_graphs" if "Graphs (OurWorldInData)" in state["relevant_content_sources"] else END,
 make_id_dict(["retrieve_graphs", END])
 )
 
front/utils.py CHANGED
@@ -39,23 +39,29 @@ def parse_output_llm_with_sources(output:str)->str:
 content_parts = "".join(parts)
 return content_parts
 
-def process_figures(docs:list)->tuple:
-gallery=[]
-used_figures =[]
+def process_figures(docs:list, new_figures:list)->tuple:
+docs = docs + new_figures
+
 figures = '<div class="figures-container"><p></p> </div>'
+gallery = []
+used_figures = []
+
+if docs == []:
+return docs, figures, gallery
+
+
 docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
-for i, doc in enumerate(docs_figures):
-if doc.metadata["chunk_type"] == "image":
-if doc.metadata["figure_code"] != "N/A":
-title = f"{doc.metadata['figure_code']} - {doc.metadata['short_name']}"
-else:
-title = f"{doc.metadata['short_name']}"
+for i_doc, doc in enumerate(docs_figures):
+if doc.metadata["chunk_type"] == "image":
+path = doc.metadata["image_path"]
 
 
-if title not in used_figures:
-used_figures.append(title)
+if path not in used_figures:
+used_figures.append(path)
+figure_number = len(used_figures)
+
 try:
-key = f"Image {i+1}"
+key = f"Image {figure_number}"
 
 image_path = doc.metadata["image_path"].split("documents/")[1]
 img = get_image_from_azure_blob_storage(image_path)
@@ -68,12 +74,12 @@ def process_figures(docs:list)->tuple:
 
 img_str = base64.b64encode(buffered.getvalue()).decode()
 
-figures = figures + make_html_figure_sources(doc, i, img_str)
+figures = figures + make_html_figure_sources(doc, figure_number, img_str)
 gallery.append(img)
 except Exception as e:
-print(f"Skipped adding image {i} because of {e}")
+print(f"Skipped adding image {figure_number} because of {e}")
 
-return figures, gallery
+return docs, figures, gallery
 
 
 def generate_html_graphs(graphs:list)->str:
style.css CHANGED
@@ -24,18 +24,11 @@ main.flex.flex-1.flex-col {
 }
 
 #group-subtabs {
-width: 100%;
-position: sticky;
+/* display: block; */
+position : sticky;
 }
 
-#group-subtabs .tab-container {
-display: flex;
-text-align: center;
-width: 100%;
-}
 
-#group-subtabs .tab-container button {
-flex: 1;
 }
 
 .tab-nav {