This view is limited to 50 files because the diff contains too many changes.
Files changed (50)
  1. .gitignore +0 -8
  2. README.md +1 -1
  3. app.py +607 -224
  4. climateqa/chat.py +0 -198
  5. climateqa/constants.py +1 -24
  6. climateqa/engine/chains/__init__.py +0 -0
  7. climateqa/engine/chains/answer_ai_impact.py +0 -46
  8. climateqa/engine/chains/answer_chitchat.py +0 -56
  9. climateqa/engine/chains/chitchat_categorization.py +0 -43
  10. climateqa/engine/chains/graph_retriever.py +0 -130
  11. climateqa/engine/chains/intent_categorization.py +0 -90
  12. climateqa/engine/chains/keywords_extraction.py +0 -40
  13. climateqa/engine/chains/query_transformation.py +0 -298
  14. climateqa/engine/chains/retrieve_documents.py +0 -465
  15. climateqa/engine/chains/retrieve_papers.py +0 -95
  16. climateqa/engine/chains/retriever.py +0 -126
  17. climateqa/engine/chains/sample_router.py +0 -66
  18. climateqa/engine/chains/set_defaults.py +0 -13
  19. climateqa/engine/chains/translation.py +0 -42
  20. climateqa/engine/embeddings.py +3 -6
  21. climateqa/engine/graph.py +0 -333
  22. climateqa/engine/graph_retriever.py +0 -88
  23. climateqa/engine/keywords.py +1 -3
  24. climateqa/engine/llm/__init__.py +0 -3
  25. climateqa/engine/llm/ollama.py +0 -6
  26. climateqa/engine/llm/openai.py +1 -1
  27. climateqa/engine/{chains/prompts.py → prompts.py} +6 -56
  28. climateqa/engine/{chains/answer_rag.py → rag.py} +60 -39
  29. climateqa/engine/{chains/reformulation.py → reformulation.py} +1 -1
  30. climateqa/engine/reranker.py +0 -55
  31. climateqa/engine/retriever.py +163 -0
  32. climateqa/engine/utils.py +0 -17
  33. climateqa/engine/vectorstore.py +2 -4
  34. climateqa/handle_stream_events.py +0 -126
  35. climateqa/knowledge/__init__.py +0 -0
  36. climateqa/knowledge/retriever.py +0 -102
  37. climateqa/papers/__init__.py +43 -0
  38. climateqa/{knowledge → papers}/openalex.py +15 -68
  39. climateqa/utils.py +0 -13
  40. front/__init__.py +0 -0
  41. front/callbacks.py +0 -0
  42. front/deprecated.py +0 -46
  43. front/event_listeners.py +0 -0
  44. front/tabs/__init__.py +0 -6
  45. front/tabs/chat_interface.py +0 -55
  46. front/tabs/main_tab.py +0 -69
  47. front/tabs/tab_about.py +0 -38
  48. front/tabs/tab_config.py +0 -123
  49. front/tabs/tab_examples.py +0 -40
  50. front/tabs/tab_figures.py +0 -31
.gitignore CHANGED
@@ -5,11 +5,3 @@ __pycache__/utils.cpython-38.pyc
 
 notebooks/
 *.pyc
-
-**/.ipynb_checkpoints/
-**/.flashrank_cache/
-
-data/
-sandbox/
-
-*.db
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌍
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 5.0.2
+sdk_version: 4.19.1
 app_file: app.py
 fullWidth: true
 pinned: false
app.py CHANGED
@@ -1,30 +1,44 @@
-# Import necessary libraries
-import os
-import gradio as gr
 
-from azure.storage.fileshare import ShareServiceClient
 
-# Import custom modules
-from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.llm import get_llm
-from climateqa.engine.vectorstore import get_pinecone_vectorstore
-from climateqa.engine.reranker import get_reranker
-from climateqa.engine.graph import make_graph_agent,make_graph_agent_poc
-from climateqa.engine.chains.retrieve_papers import find_papers
-from climateqa.chat import start_chat, chat_stream, finish_chat
 
-from front.tabs import (create_config_modal, create_examples_tab, create_papers_tab, create_figures_tab, create_chat_interface, create_about_tab)
-from front.utils import process_figures
 
 
 from utils import create_user_id
-import logging
 
-logging.basicConfig(level=logging.WARNING)
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppresses INFO and WARNING logs
-logging.getLogger().setLevel(logging.WARNING)
 
 
 
 # Load environment variables in local mode
 try:
@@ -33,7 +47,6 @@ try:
 except Exception as e:
     pass
 
-
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
@@ -41,7 +54,15 @@ theme = gr.themes.Base(
     font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
 )
 
-# Azure Blob Storage credentials
 account_key = os.environ["BLOB_ACCOUNT_KEY"]
 if len(account_key) == 86:
     account_key += "=="
@@ -60,235 +81,597 @@ user_id = create_user_id()
 
 
 
-# Create vectorstore and retriever
-embeddings_function = get_embeddings_function()
-vectorstore = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX"))
-vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
-vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_REGION"))
 
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
-if os.getenv("ENV")=="GRADIO_ENV":
-    reranker = get_reranker("nano")
-else:
-    reranker = get_reranker("large")
 
-agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0.2)
-agent_poc = make_graph_agent_poc(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0)#TODO put back default 0.2
 
 
-async def chat(query, history, audience, sources, reports, relevant_content_sources_selection, search_only):
-    print("chat cqa - message received")
-    async for event in chat_stream(agent, query, history, audience, sources, reports, relevant_content_sources_selection, search_only, share_client, user_id):
-        yield event
 
-async def chat_poc(query, history, audience, sources, reports, relevant_content_sources_selection, search_only):
-    print("chat poc - message received")
-    async for event in chat_stream(agent_poc, query, history, audience, sources, reports, relevant_content_sources_selection, search_only, share_client, user_id):
-        yield event
 
 
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------
 
-# Function to update modal visibility
-def update_config_modal_visibility(config_open):
-    new_config_visibility_status = not config_open
-    return gr.update(visible=new_config_visibility_status), new_config_visibility_status
-
 
-def update_sources_number_display(sources_textbox, figures_cards, current_graphs, papers_html):
-    sources_number = sources_textbox.count("<h2>")
-    figures_number = figures_cards.count("<h2>")
-    graphs_number = current_graphs.count("<iframe")
-    papers_number = papers_html.count("<h2>")
-    sources_notif_label = f"Sources ({sources_number})"
-    figures_notif_label = f"Figures ({figures_number})"
-    graphs_notif_label = f"Graphs ({graphs_number})"
-    papers_notif_label = f"Papers ({papers_number})"
-    recommended_content_notif_label = f"Recommended content ({figures_number + graphs_number + papers_number})"
-
-    return gr.update(label=recommended_content_notif_label), gr.update(label=sources_notif_label), gr.update(label=figures_notif_label), gr.update(label=graphs_notif_label), gr.update(label=papers_notif_label)
-
-
-# # UI Layout Components
-def cqa_tab(tab_name):
-    # State variables
-    current_graphs = gr.State([])
-    with gr.Tab(tab_name):
         with gr.Row(elem_id="chatbot-row"):
-            # Left column - Chat interface
             with gr.Column(scale=2):
-                chatbot, textbox, config_button = create_chat_interface()
 
-            # Right column - Content panels
-            with gr.Column(scale=2, variant="panel", elem_id="right-panel"):
-                with gr.Tabs(elem_id="right_panel_tab") as tabs:
-                    # Examples tab
-                    with gr.TabItem("Examples", elem_id="tab-examples", id=0):
-                        examples_hidden = create_examples_tab()
 
-                    # Sources tab
-                    with gr.Tab("Sources", elem_id="tab-sources", id=1) as tab_sources:
-                        sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
 
-                    # Recommended content tab
-                    with gr.Tab("Recommended content", elem_id="tab-recommended_content", id=2) as tab_recommended_content:
-                        with gr.Tabs(elem_id="group-subtabs") as tabs_recommended_content:
-                            # Figures subtab
-                            with gr.Tab("Figures", elem_id="tab-figures", id=3) as tab_figures:
-                                sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal = create_figures_tab()
 
-                            # Papers subtab
-                            with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
-                                papers_summary, papers_html, citations_network, papers_modal = create_papers_tab()
 
-                            # Graphs subtab
-                            with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
-                                graphs_container = gr.HTML(
-                                    "<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
-                                    elem_id="graphs-container"
                                 )
-    return {
-        "chatbot": chatbot,
-        "textbox": textbox,
-        "tabs": tabs,
-        "sources_raw": sources_raw,
-        "new_figures": new_figures,
-        "current_graphs": current_graphs,
-        "examples_hidden": examples_hidden,
-        "sources_textbox": sources_textbox,
-        "figures_cards": figures_cards,
-        "gallery_component": gallery_component,
-        "config_button": config_button,
-        "papers_html": papers_html,
-        "citations_network": citations_network,
-        "papers_summary": papers_summary,
-        "tab_recommended_content": tab_recommended_content,
-        "tab_sources": tab_sources,
-        "tab_figures": tab_figures,
-        "tab_graphs": tab_graphs,
-        "tab_papers": tab_papers,
-        "graph_container": graphs_container
-    }
-
-
-
-def event_handling(
-    main_tab_components,
-    config_components,
-    tab_name="ClimateQ&A"
-):
-    chatbot = main_tab_components["chatbot"]
-    textbox = main_tab_components["textbox"]
-    tabs = main_tab_components["tabs"]
-    sources_raw = main_tab_components["sources_raw"]
-    new_figures = main_tab_components["new_figures"]
-    current_graphs = main_tab_components["current_graphs"]
-    examples_hidden = main_tab_components["examples_hidden"]
-    sources_textbox = main_tab_components["sources_textbox"]
-    figures_cards = main_tab_components["figures_cards"]
-    gallery_component = main_tab_components["gallery_component"]
-    config_button = main_tab_components["config_button"]
-    papers_html = main_tab_components["papers_html"]
-    citations_network = main_tab_components["citations_network"]
-    papers_summary = main_tab_components["papers_summary"]
-    tab_recommended_content = main_tab_components["tab_recommended_content"]
-    tab_sources = main_tab_components["tab_sources"]
-    tab_figures = main_tab_components["tab_figures"]
-    tab_graphs = main_tab_components["tab_graphs"]
-    tab_papers = main_tab_components["tab_papers"]
-    graphs_container = main_tab_components["graph_container"]
-
-    config_open = config_components["config_open"]
-    config_modal = config_components["config_modal"]
-    dropdown_sources = config_components["dropdown_sources"]
-    dropdown_reports = config_components["dropdown_reports"]
-    dropdown_external_sources = config_components["dropdown_external_sources"]
-    search_only = config_components["search_only"]
-    dropdown_audience = config_components["dropdown_audience"]
-    after = config_components["after"]
-    output_query = config_components["output_query"]
-    output_language = config_components["output_language"]
-    close_config_modal = config_components["close_config_modal_button"]
-
-    new_sources_hmtl = gr.State([])
-
-    print("textbox id : ", textbox.elem_id)
-
-    for button in [config_button, close_config_modal]:
-        button.click(
-            fn=update_config_modal_visibility,
-            inputs=[config_open],
-            outputs=[config_modal, config_open]
-        )
-
-    if tab_name == "ClimateQ&A":
-        print("chat cqa - message sent")
-
-        # Event for textbox
-        (textbox
-            .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{textbox.elem_id}")
-            .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{textbox.elem_id}")
-            .then(finish_chat, None, [textbox], api_name=f"finish_chat_{textbox.elem_id}")
-        )
-        # Event for examples_hidden
-        (examples_hidden
-            .change(start_chat, [examples_hidden, chatbot, search_only], [examples_hidden, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{examples_hidden.elem_id}")
-            .then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{examples_hidden.elem_id}")
-            .then(finish_chat, None, [textbox], api_name=f"finish_chat_{examples_hidden.elem_id}")
-        )
-
-    elif tab_name == "Beta - POC Adapt'Action":
-        print("chat poc - message sent")
-        # Event for textbox
-        (textbox
-            .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{textbox.elem_id}")
-            .then(chat_poc, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{textbox.elem_id}")
-            .then(finish_chat, None, [textbox], api_name=f"finish_chat_{textbox.elem_id}")
-        )
-        # Event for examples_hidden
-        (examples_hidden
-            .change(start_chat, [examples_hidden, chatbot, search_only], [examples_hidden, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{examples_hidden.elem_id}")
-            .then(chat_poc, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{examples_hidden.elem_id}")
-            .then(finish_chat, None, [textbox], api_name=f"finish_chat_{examples_hidden.elem_id}")
-        )
-
-
-    new_sources_hmtl.change(lambda x : x, inputs = [new_sources_hmtl], outputs = [sources_textbox])
-    current_graphs.change(lambda x: x, inputs=[current_graphs], outputs=[graphs_container])
-    new_figures.change(process_figures, inputs=[sources_raw, new_figures], outputs=[sources_raw, figures_cards, gallery_component])
 
-    # Update sources numbers
-    for component in [sources_textbox, figures_cards, current_graphs, papers_html]:
-        component.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs, papers_html], [tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
 
-    # Search for papers
-    for component in [textbox, examples_hidden]:
-        component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
-
-
 
-def main_ui():
-    # config_open = gr.State(True)
-    with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme, elem_id="main-component") as demo:
-        config_components = create_config_modal()
-
-        with gr.Tabs():
-            cqa_components = cqa_tab(tab_name = "ClimateQ&A")
-            local_cqa_components = cqa_tab(tab_name = "Beta - POC Adapt'Action")
-
-        create_about_tab()
-
-        event_handling(cqa_components, config_components, tab_name = 'ClimateQ&A')
-        event_handling(local_cqa_components, config_components, tab_name = 'Beta - POC Adapt\'Action')
-
-        demo.queue()
-
-    return demo
 
 
-demo = main_ui()
-demo.launch(ssr_mode=False)
+from climateqa.engine.embeddings import get_embeddings_function
+embeddings_function = get_embeddings_function()
 
+from climateqa.papers.openalex import OpenAlex
+from sentence_transformers import CrossEncoder
 
+reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
+oa = OpenAlex()
+
+import gradio as gr
+import pandas as pd
+import numpy as np
+import os
+import time
+import re
+import json
 
+# from gradio_modal import Modal
 
+from io import BytesIO
+import base64
+
+from datetime import datetime
+from azure.storage.fileshare import ShareServiceClient
 
 from utils import create_user_id
 
 
 
+# ClimateQ&A imports
+from climateqa.engine.llm import get_llm
+from climateqa.engine.rag import make_rag_chain
+from climateqa.engine.vectorstore import get_pinecone_vectorstore
+from climateqa.engine.retriever import ClimateQARetriever
+from climateqa.engine.embeddings import get_embeddings_function
+from climateqa.engine.prompts import audience_prompts
+from climateqa.sample_questions import QUESTIONS
+from climateqa.constants import POSSIBLE_REPORTS
+from climateqa.utils import get_image_from_azure_blob_storage
+from climateqa.engine.keywords import make_keywords_chain
+from climateqa.engine.rag import make_rag_papers_chain
 
 # Load environment variables in local mode
 try:
⋮
 except Exception as e:
     pass
 
 # Set up Gradio Theme
 theme = gr.themes.Base(
     primary_hue="blue",
⋮
     font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
 )
 
+
+
+init_prompt = ""
+
+system_template = {
+    "role": "system",
+    "content": init_prompt,
+}
+
 account_key = os.environ["BLOB_ACCOUNT_KEY"]
 if len(account_key) == 86:
     account_key += "=="
⋮
 
 
 
+def parse_output_llm_with_sources(output):
+    # Split the content into a list of text and "[Doc X]" references
+    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
+    parts = []
+    for part in content_parts:
+        if part.startswith("Doc"):
+            subparts = part.split(",")
+            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
+            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
+            parts.append("".join(subparts))
+        else:
+            parts.append(part)
+    content_parts = "".join(parts)
+    return content_parts
+
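For reference, a small sketch of what `parse_output_llm_with_sources` does to a typical model answer (the sample string is made up):

```python
text = "Warming is unequivocal [Doc 1, Doc 2] and accelerating [Doc 3]."
html = parse_output_llm_with_sources(text)

# Each "[Doc N, Doc M]" group becomes one <a href="#docN"> superscript link per
# document number, pointing at the matching source card (id="docN") produced by
# make_html_source further down.
assert '<a href="#doc1"' in html and '<a href="#doc3"' in html
assert "[Doc" not in html
```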
 
+# Create vectorstore and retriever
+vectorstore = get_pinecone_vectorstore(embeddings_function)
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
 
+
+def make_pairs(lst):
+    """from a list of even length, make tuple pairs"""
+    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
+
+
+def serialize_docs(docs):
+    new_docs = []
+    for doc in docs:
+        new_doc = {}
+        new_doc["page_content"] = doc.page_content
+        new_doc["metadata"] = doc.metadata
+        new_docs.append(new_doc)
+    return new_docs
+
+
+
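A quick sanity check of the two helpers above (the `Document` import path is an assumption about the LangChain version in use):

```python
# make_pairs: flat even-length list -> (user, assistant) tuples, the history
# format the tuple-style gr.Chatbot consumes.
assert make_pairs(["q1", "a1", "q2", "a2"]) == [("q1", "a1"), ("q2", "a2")]

# serialize_docs: LangChain Documents -> plain dicts, so the Azure JSON logs
# stay serializable.
from langchain.schema import Document  # assumption: classic import path

doc = Document(page_content="IPCC excerpt", metadata={"page_number": 12})
assert serialize_docs([doc]) == [
    {"page_content": "IPCC excerpt", "metadata": {"page_number": 12}}
]
```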
+async def chat(query,history,audience,sources,reports):
+    """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
+    (messages in gradio format, messages in langchain format, source documents)"""
+
+    print(f">> NEW QUESTION : {query}")
+
+    if audience == "Children":
+        audience_prompt = audience_prompts["children"]
+    elif audience == "General public":
+        audience_prompt = audience_prompts["general"]
+    elif audience == "Experts":
+        audience_prompt = audience_prompts["experts"]
+    else:
+        audience_prompt = audience_prompts["experts"]
+
+    # Prepare default values
+    if len(sources) == 0:
+        sources = ["IPCC"]
+
+    if len(reports) == 0:
+        reports = []
+
+    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
+    rag_chain = make_rag_chain(retriever,llm)
+
+    inputs = {"query": query,"audience": audience_prompt}
+    result = rag_chain.astream_log(inputs) #{"callbacks":[MyCustomAsyncHandler()]})
+    # result = rag_chain.stream(inputs)
+
+    path_reformulation = "/logs/reformulation/final_output"
+    path_keywords = "/logs/keywords/final_output"
+    path_retriever = "/logs/find_documents/final_output"
+    path_answer = "/logs/answer/streamed_output_str/-"
+
+    docs_html = ""
+    output_query = ""
+    output_language = ""
+    output_keywords = ""
+    gallery = []
+
+    try:
+        async for op in result:
+
+            op = op.ops[0]
+
+            if op['path'] == path_reformulation:  # reformulated question
+                try:
+                    output_language = op['value']["language"]  # str
+                    output_query = op["value"]["question"]
+                except Exception as e:
+                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
+
+            if op["path"] == path_keywords:
+                try:
+                    output_keywords = op['value']["keywords"]  # str
+                    output_keywords = " AND ".join(output_keywords)
+                except Exception as e:
+                    pass
+
+            elif op['path'] == path_retriever:  # documents
+                try:
+                    docs = op['value']['docs']  # List[Document]
+                    docs_html = []
+                    for i, d in enumerate(docs, 1):
+                        docs_html.append(make_html_source(d, i))
+                    docs_html = "".join(docs_html)
+                except TypeError:
+                    print("No documents found")
+                    print("op: ",op)
+                    continue
+
+            elif op['path'] == path_answer:  # final answer
+                new_token = op['value']  # str
+                # time.sleep(0.01)
+                previous_answer = history[-1][1]
+                previous_answer = previous_answer if previous_answer is not None else ""
+                answer_yet = previous_answer + new_token
+                answer_yet = parse_output_llm_with_sources(answer_yet)
+                history[-1] = (query,answer_yet)
+
+            else:
+                continue
+
+            history = [tuple(x) for x in history]
+            yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
+
+    except Exception as e:
+        raise gr.Error(f"{e}")
+
+
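The branching above dispatches on JSON Patch operations streamed by LangChain's `astream_log`: each `RunLogPatch` carries an `ops` list, and `op['path']` tells you which named run produced `op['value']`. Roughly (values illustrative):

```python
# A streamed answer token, matched by path_answer:
op = {
    "op": "add",
    "path": "/logs/answer/streamed_output_str/-",  # trailing "-" = list append
    "value": "warming",                            # one new token
}

# The reformulation result arrives once as a final_output dict,
# matched by path_reformulation:
op = {
    "op": "add",
    "path": "/logs/reformulation/final_output",
    "value": {"question": "What is climate change?", "language": "English"},
}
```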
214
+ try:
215
+ # Log answer on Azure Blob Storage
216
+ if os.getenv("GRADIO_ENV") != "local":
217
+ timestamp = str(datetime.now().timestamp())
218
+ file = timestamp + ".json"
219
+ prompt = history[-1][0]
220
+ logs = {
221
+ "user_id": str(user_id),
222
+ "prompt": prompt,
223
+ "query": prompt,
224
+ "question":output_query,
225
+ "sources":sources,
226
+ "docs":serialize_docs(docs),
227
+ "answer": history[-1][1],
228
+ "time": timestamp,
229
+ }
230
+ log_on_azure(file, logs, share_client)
231
+ except Exception as e:
232
+ print(f"Error logging on Azure Blob Storage: {e}")
233
+ raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)")
234
+
235
+ image_dict = {}
236
+ for i,doc in enumerate(docs):
237
 
238
+ if doc.metadata["chunk_type"] == "image":
239
+ try:
240
+ key = f"Image {i+1}"
241
+ image_path = doc.metadata["image_path"].split("documents/")[1]
242
+ img = get_image_from_azure_blob_storage(image_path)
243
+
244
+ # Convert the image to a byte buffer
245
+ buffered = BytesIO()
246
+ img.save(buffered, format="PNG")
247
+ img_str = base64.b64encode(buffered.getvalue()).decode()
248
+
249
+ # Embedding the base64 string in Markdown
250
+ markdown_image = f"![Alt text](data:image/png;base64,{img_str})"
251
+ image_dict[key] = {"img":img,"md":markdown_image,"caption":doc.page_content,"key":key,"figure_code":doc.metadata["figure_code"]}
252
+ except Exception as e:
253
+ print(f"Skipped adding image {i} because of {e}")
254
+
255
+ if len(image_dict) > 0:
256
+
257
+ gallery = [x["img"] for x in list(image_dict.values())]
258
+ img = list(image_dict.values())[0]
259
+ img_md = img["md"]
260
+ img_caption = img["caption"]
261
+ img_code = img["figure_code"]
262
+ if img_code != "N/A":
263
+ img_name = f"{img['key']} - {img['figure_code']}"
264
+ else:
265
+ img_name = f"{img['key']}"
266
+
267
+ answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
268
+ history[-1] = (history[-1][0],answer_yet)
269
+ history = [tuple(x) for x in history]
270
+
271
+ # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
272
+ # if len(gallery) > 0:
273
+ # gallery = list(set("|".join(gallery).split("|")))
274
+ # gallery = [get_image_from_azure_blob_storage(x) for x in gallery]
275
+
276
+ yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
277
+
278
+
+def make_html_source(source,i):
+    meta = source.metadata
+    # content = source.page_content.split(":",1)[1].strip()
+    content = source.page_content.strip()
+
+    toc_levels = []
+    for j in range(2):
+        level = meta[f"toc_level{j}"]
+        if level != "N/A":
+            toc_levels.append(level)
+        else:
+            break
+    toc_levels = " > ".join(toc_levels)
+
+    if len(toc_levels) > 0:
+        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+    else:
+        name = meta['name']
+
+    if meta["chunk_type"] == "text":
+
+        card = f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+
+    else:
+
+        if meta["figure_code"] != "N/A":
+            title = f"{meta['figure_code']} - {meta['short_name']}"
+        else:
+            title = f"{meta['short_name']}"
+
+        card = f"""
+    <div class="card card-image">
+        <div class="card-content">
+            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+            <p class='ai-generated'>AI-generated description</p>
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+
+    return card
+
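`make_html_source` assumes each retrieved Document carries the metadata keys read above; a hypothetical text chunk would look like this (all values, including the URL, are placeholders):

```python
from langchain.schema import Document  # assumption: classic import path

doc = Document(
    page_content="Global surface temperature has increased faster since 1970...",
    metadata={
        "chunk_type": "text",        # "image" selects the second card template
        "short_name": "IPCC AR6 WGI SPM",
        "name": "Summary for Policymakers",
        "page_number": 5.0,          # cast to int before display
        "url": "https://example.org/report.pdf",
        "toc_level0": "A. The Current State of the Climate",
        "toc_level1": "N/A",         # "N/A" ends the breadcrumb early
        "figure_code": "N/A",
    },
)
card = make_html_source(doc, 1)  # -> '<div class="card" id="doc1">...' HTML
```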
+
+
+# else:
+#     docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
+#     complete_response = "**No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
+#     messages.append({"role": "assistant", "content": complete_response})
+#     gradio_format = make_pairs([a["content"] for a in messages[1:]])
+#     yield gradio_format, messages, docs_string
+
+
+def save_feedback(feed: str, user_id):
+    if len(feed) > 1:
+        timestamp = str(datetime.now().timestamp())
+        file = user_id + timestamp + ".json"
+        logs = {
+            "user_id": user_id,
+            "feedback": feed,
+            "time": timestamp,
+        }
+        log_on_azure(file, logs, share_client)
+        return "Feedback submitted, thank you!"
+
+
+
+
+def log_on_azure(file, logs, share_client):
+    logs = json.dumps(logs)
+    file_client = share_client.get_file_client(file)
+    file_client.upload_file(logs)
+
+
+def generate_keywords(query):
+    chain = make_keywords_chain(llm)
+    keywords = chain.invoke(query)
+    keywords = " AND ".join(keywords["keywords"])
+    return keywords
+
+
+
+papers_cols_widths = {
+    "doc":50,
+    "id":100,
+    "title":300,
+    "doi":100,
+    "publication_year":100,
+    "abstract":500,
+    "rerank_score":100,
+    "is_oa":50,
+}
+
+papers_cols = list(papers_cols_widths.keys())
+papers_cols_widths = list(papers_cols_widths.values())
+
+async def find_papers(query, keywords,after):
+
+    summary = ""
+
+    df_works = oa.search(keywords,after = after)
+    df_works = df_works.dropna(subset=["abstract"])
+    df_works = oa.rerank(query,df_works,reranker)
+    df_works = df_works.sort_values("rerank_score",ascending=False)
+    G = oa.make_network(df_works)
+
+    height = "750px"
+    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
+    network_html = network.generate_html()
+
+    network_html = network_html.replace("'", "\"")
+    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
+    network_html = network_html + css_to_inject
+
+    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
+    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+    allow-scripts allow-same-origin allow-popups
+    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
+
+    docs = df_works["content"].head(15).tolist()
+
+    df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
+    df_works["doc"] = df_works["doc"] + 1
+    df_works = df_works[papers_cols]
+
+    yield df_works,network_html,summary
+
+    chain = make_rag_papers_chain(llm)
+    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
+    path_answer = "/logs/StrOutputParser/streamed_output/-"
+
+    async for op in result:
+
+        op = op.ops[0]
+
+        if op['path'] == path_answer:  # streamed summary token
+            new_token = op['value']  # str
+            summary += new_token
+        else:
+            continue
+        yield df_works,network_html,summary
+
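Since `find_papers` is an async generator, Gradio streams its yields: the first yield publishes the reranked table and citation network as soon as OpenAlex responds, and each later yield grows the summary as tokens arrive. A minimal standalone driver, with made-up query values:

```python
import asyncio

async def drive_find_papers():
    async for df_works, network_html, summary in find_papers(
        query="How does deforestation affect rainfall?",  # hypothetical
        keywords="deforestation AND rainfall",
        after=2015,
    ):
        print(f"{len(df_works)} papers, {len(summary)} summary chars so far")

asyncio.run(drive_find_papers())
```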
 
 
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------
 
 
+init_prompt = """
+Hello, I am ClimateQ&A, a conversational assistant designed to help you understand climate change and biodiversity loss. I will answer your questions by **sifting through the IPCC and IPBES scientific reports**.
+
+How to use
+- **Language**: You can ask me your questions in any language.
+- **Audience**: You can specify your audience (children, general public, experts) to get a more adapted answer.
+- **Sources**: You can choose to search in the IPCC or IPBES reports, or both.
+
+⚠️ Limitations
+*Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
+
+What do you want to learn?
+"""
+
+
+def vote(data: gr.LikeData):
+    if data.liked:
+        print(data.value)
+    else:
+        print(data)
+
+
+
+with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main-component") as demo:
+    # user_id_state = gr.State([user_id])
+
+    with gr.Tab("ClimateQ&A"):
+
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
+                # state = gr.State([system_template])
+                chatbot = gr.Chatbot(
+                    value=[(None,init_prompt)],
+                    show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",
+                    avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
+                )#,avatar_images = ("assets/logo4.png",None))
+
+                # bot.like(vote,None,None)
 
 
+                with gr.Row(elem_id = "input-message"):
+                    textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
+                    # submit = gr.Button("",elem_id = "submit-button",scale = 1,interactive = True,icon = "https://static-00.iconduck.com/assets.00/settings-icon-2048x2046-cw28eevx.png")
+
+
+            with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
+
+
+                with gr.Tabs() as tabs:
+                    with gr.TabItem("Examples",elem_id = "tab-examples",id = 0):
+
+                        examples_hidden = gr.Textbox(visible = False)
+                        first_key = list(QUESTIONS.keys())[0]
+                        dropdown_samples = gr.Dropdown(QUESTIONS.keys(),value = first_key,interactive = True,show_label = True,label = "Select a category of sample questions",elem_id = "dropdown-samples")
 
+                        samples = []
+                        for i,key in enumerate(QUESTIONS.keys()):
+
+                            examples_visible = True if i == 0 else False
+
+                            with gr.Row(visible = examples_visible) as group_examples:
+
+                                examples_questions = gr.Examples(
+                                    QUESTIONS[key],
+                                    [examples_hidden],
+                                    examples_per_page=8,
+                                    run_on_click=False,
+                                    elem_id=f"examples{i}",
+                                    api_name=f"examples{i}",
+                                    # label = "Click on the example question or enter your own",
+                                    # cache_examples=True,
                                 )
+
+                            samples.append(group_examples)
+
+
+                    with gr.Tab("Sources",elem_id = "tab-citations",id = 1):
+                        sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
+                        docs_textbox = gr.State("")
+
+                    # with Modal(visible = False) as config_modal:
+                    with gr.Tab("Configuration",elem_id = "tab-config",id = 2):
+
+                        gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
+
+
+                        dropdown_sources = gr.CheckboxGroup(
+                            ["IPCC", "IPBES","IPOS"],
+                            label="Select source",
+                            value=["IPCC"],
+                            interactive=True,
+                        )
+
+                        dropdown_reports = gr.Dropdown(
+                            POSSIBLE_REPORTS,
+                            label="Or select specific reports",
+                            multiselect=True,
+                            value=None,
+                            interactive=True,
+                        )
+
+                        dropdown_audience = gr.Dropdown(
+                            ["Children","General public","Experts"],
+                            label="Select audience",
+                            value="Experts",
+                            interactive=True,
+                        )
+
+                        output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
+                        output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
+
+
+
 
+
+
+    #---------------------------------------------------------------------------------------
+    # OTHER TABS
+    #---------------------------------------------------------------------------------------
+
+
+    with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
+        gallery_component = gr.Gallery()
+
+    with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
+                keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
+                after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
+                search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
+
+            with gr.Column(scale=7):
+
+                with gr.Tab("Summary",elem_id="papers-summary-tab"):
+                    papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
+
+                with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
+                    papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
+
+                with gr.Tab("Citations network",elem_id="papers-network-tab"):
+                    citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
+
+
+
+    with gr.Tab("About",elem_classes = "max-height other-tabs"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)")
+
+
+    def start_chat(query,history):
+        history = history + [(query,None)]
+        history = [tuple(x) for x in history]
+        return (gr.update(interactive = False),gr.update(selected=1),history)
 
+    def finish_chat():
+        return (gr.update(interactive = True,value = ""))
 
+    (textbox
+        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
+    )
 
+    (examples_hidden
+        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
+    )
+
+
+    def change_sample_questions(key):
+        index = list(QUESTIONS.keys()).index(key)
+        visible_bools = [False] * len(samples)
+        visible_bools[index] = True
+        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
+
+
+
+    dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
+
+    query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
+    search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
+
+    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
+    # (textbox
+    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
 
+    # (examples_hidden
+    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
+    #     answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
+    # )
+
+
+    # with Modal(visible=True) as first_modal:
+    #     gr.Markdown("# Welcome to ClimateQ&A !")
+
+    #     gr.Markdown("### Examples")
+
+    #     examples = gr.Examples(
+    #         ["Yo ça roule","ça boume"],
+    #         [examples_hidden],
+    #         examples_per_page=8,
+    #         run_on_click=False,
+    #         elem_id="examples",
+    #         api_name="examples",
+    #     )
+
+
+    # submit.click(lambda: Modal(visible=True), None, config_modal)
 
+
+    demo.queue()
+
+demo.launch()
climateqa/chat.py DELETED
@@ -1,198 +0,0 @@
-import os
-from datetime import datetime
-import gradio as gr
-# from .agent import agent
-from gradio import ChatMessage
-from langgraph.graph.state import CompiledStateGraph
-import json
-
-from .handle_stream_events import (
-    init_audience,
-    handle_retrieved_documents,
-    convert_to_docs_to_html,
-    stream_answer,
-    handle_retrieved_owid_graphs,
-    serialize_docs,
-)
-
-# Function to log data on Azure
-def log_on_azure(file, logs, share_client):
-    logs = json.dumps(logs)
-    file_client = share_client.get_file_client(file)
-    file_client.upload_file(logs)
-
-# Chat functions
-def start_chat(query, history, search_only):
-    history = history + [ChatMessage(role="user", content=query)]
-    if not search_only:
-        return (gr.update(interactive=False), gr.update(selected=1), history, [])
-    else:
-        return (gr.update(interactive=False), gr.update(selected=2), history, [])
-
-def finish_chat():
-    return gr.update(interactive=True, value="")
-
-def log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id):
-    try:
-        # Log interaction to Azure if not in local environment
-        if os.getenv("GRADIO_ENV") != "local":
-            timestamp = str(datetime.now().timestamp())
-            prompt = history[1]["content"]
-            logs = {
-                "user_id": str(user_id),
-                "prompt": prompt,
-                "query": prompt,
-                "question": output_query,
-                "sources": sources,
-                "docs": serialize_docs(docs),
-                "answer": history[-1].content,
-                "time": timestamp,
-            }
-            log_on_azure(f"{timestamp}.json", logs, share_client)
-    except Exception as e:
-        print(f"Error logging on Azure Blob Storage: {e}")
-        error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
-        raise gr.Error(error_msg)
-
-# Main chat function
-async def chat_stream(
-    agent : CompiledStateGraph,
-    query: str,
-    history: list[ChatMessage],
-    audience: str,
-    sources: list[str],
-    reports: list[str],
-    relevant_content_sources_selection: list[str],
-    search_only: bool,
-    share_client,
-    user_id: str
-) -> tuple[list, str, str, str, list, str]:
-    """Process a chat query and return response with relevant sources and visualizations.
-
-    Args:
-        query (str): The user's question
-        history (list): Chat message history
-        audience (str): Target audience type
-        sources (list): Knowledge base sources to search
-        reports (list): Specific reports to search within sources
-        relevant_content_sources_selection (list): Types of content to retrieve (figures, papers, etc)
-        search_only (bool): Whether to only search without generating answer
-
-    Yields:
-        tuple: Contains:
-            - history: Updated chat history
-            - docs_html: HTML of retrieved documents
-            - output_query: Processed query
-            - output_language: Detected language
-            - related_contents: Related content
-            - graphs_html: HTML of relevant graphs
-    """
-    # Log incoming question
-    date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f">> NEW QUESTION ({date_now}) : {query}")
-
-    audience_prompt = init_audience(audience)
-    sources = sources or ["IPCC", "IPBES"]
-    reports = reports or []
-
-    # Prepare inputs for agent
-    inputs = {
-        "user_input": query,
-        "audience": audience_prompt,
-        "sources_input": sources,
-        "relevant_content_sources_selection": relevant_content_sources_selection,
-        "search_only": search_only,
-        "reports": reports
-    }
-
-    # Get streaming events from agent
-    result = agent.astream_events(inputs, version="v1")
-
-    # Initialize state variables
-    docs = []
-    related_contents = []
-    docs_html = ""
-    new_docs_html = ""
-    output_query = ""
-    output_language = ""
-    output_keywords = ""
-    start_streaming = False
-    graphs_html = ""
-    used_documents = []
-    retrieved_contents = []
-    answer_message_content = ""
-
-    # Define processing steps
-    steps_display = {
-        "categorize_intent": ("🔄️ Analyzing user message", True),
-        "transform_query": ("🔄️ Thinking step by step to answer the question", True),
-        "retrieve_documents": ("🔄️ Searching in the knowledge base", False),
-        "retrieve_local_data": ("🔄️ Searching in the knowledge base", False),
-    }
-
-    try:
-        # Process streaming events
-        async for event in result:
-
-            if "langgraph_node" in event["metadata"]:
-                node = event["metadata"]["langgraph_node"]
-
-                # Handle document retrieval
-                if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
-                    history, used_documents, retrieved_contents = handle_retrieved_documents(
-                        event, history, used_documents, retrieved_contents
-                    )
-                if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
-                    docs = event["data"]["input"]["documents"]
-                    docs_html = convert_to_docs_to_html(docs)
-                    related_contents = event["data"]["input"]["related_contents"]
-
-                # Handle intent categorization
-                elif (event["event"] == "on_chain_end" and
-                    node == "categorize_intent" and
-                    event["name"] == "_write"):
-                    intent = event["data"]["output"]["intent"]
-                    output_language = event["data"]["output"].get("language", "English")
-                    history[-1].content = f"Language identified: {output_language}\nIntent identified: {intent}"
-
-                # Handle processing steps display
-                elif event["name"] in steps_display and event["event"] == "on_chain_start":
-                    event_description, display_output = steps_display[node]
-                    if (not hasattr(history[-1], 'metadata') or
-                        history[-1].metadata["title"] != event_description):
-                        history.append(ChatMessage(
-                            role="assistant",
-                            content="",
-                            metadata={'title': event_description}
-                        ))
-
-                # Handle answer streaming
-                elif (event["name"] != "transform_query" and
-                    event["event"] == "on_chat_model_stream" and
-                    node in ["answer_rag","answer_rag_no_docs", "answer_search", "answer_chitchat"]):
-                    history, start_streaming, answer_message_content = stream_answer(
-                        history, event, start_streaming, answer_message_content
-                    )
-
-                # Handle graph retrieval
-                elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
-                    graphs_html = handle_retrieved_owid_graphs(event, graphs_html)
-
-                # Handle query transformation
-                if event["name"] == "transform_query" and event["event"] == "on_chain_end":
-                    if hasattr(history[-1], "content"):
-                        sub_questions = [q["question"] + "-> relevant sources : " + str(q["sources"]) for q in event["data"]["output"]["questions_list"]]
-                        history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)
-
-            yield history, docs_html, output_query, output_language, related_contents, graphs_html
-
-    except Exception as e:
-        print(f"Event {event} has failed")
-        raise gr.Error(str(e))
-
-
-
-    # Call the function to log interaction
-    log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id)
-
-    yield history, docs_html, output_query, output_language, related_contents, graphs_html
climateqa/constants.py CHANGED
@@ -1,6 +1,4 @@
 POSSIBLE_REPORTS = [
-    "IPBES IABWFH SPM",
-    "IPBES CBL SPM",
     "IPCC AR6 WGI SPM",
     "IPCC AR6 WGI FR",
     "IPCC AR6 WGI TS",
@@ -44,25 +42,4 @@ POSSIBLE_REPORTS = [
     "IPBES IAS A C5",
     "IPBES IAS A C6",
     "IPBES IAS A SPM"
-    ]
-
-OWID_CATEGORIES = ['Access to Energy', 'Agricultural Production',
-    'Agricultural Regulation & Policy', 'Air Pollution',
-    'Animal Welfare', 'Antibiotics', 'Biodiversity', 'Biofuels',
-    'Biological & Chemical Weapons', 'CO2 & Greenhouse Gas Emissions',
-    'COVID-19', 'Clean Water', 'Clean Water & Sanitation',
-    'Climate Change', 'Crop Yields', 'Diet Compositions',
-    'Electricity', 'Electricity Mix', 'Energy', 'Energy Efficiency',
-    'Energy Prices', 'Environmental Impacts of Food Production',
-    'Environmental Protection & Regulation', 'Famines', 'Farm Size',
-    'Fertilizers', 'Fish & Overfishing', 'Food Supply', 'Food Trade',
-    'Food Waste', 'Food and Agriculture', 'Forests & Deforestation',
-    'Fossil Fuels', 'Future Population Growth',
-    'Hunger & Undernourishment', 'Indoor Air Pollution', 'Land Use',
-    'Land Use & Yields in Agriculture', 'Lead Pollution',
-    'Meat & Dairy Production', 'Metals & Minerals',
-    'Natural Disasters', 'Nuclear Energy', 'Nuclear Weapons',
-    'Oil Spills', 'Outdoor Air Pollution', 'Ozone Layer', 'Pandemics',
-    'Pesticides', 'Plastic Pollution', 'Renewable Energy', 'Soil',
-    'Transport', 'Urbanization', 'Waste Management', 'Water Pollution',
-    'Water Use & Stress', 'Wildfires']
+]
climateqa/engine/chains/__init__.py DELETED
File without changes
climateqa/engine/chains/answer_ai_impact.py DELETED
@@ -1,46 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-
-
-prompt_template = """
-You are ClimateQ&A, a helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
-Always stay true to climate and nature science and do not make up information.
-If you do not know the answer, just say you do not know.
-
-## Guidelines
-- Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
-- Answer the question in the original language of the question
-
-## Sources
-- You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
-- You can recommend looking at the work of the AI & climate expert scientist Sasha Luccioni, in particular these papers
-    - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
-    - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
-    - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
-- You can also recommend the following tools to calculate the carbon footprint of AI models
-    - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
-    - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLM APIs
-"""
-
-
-def make_ai_impact_chain(llm):
-
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", prompt_template),
-        ("user", "{question}")
-    ])
-
-    chain = prompt | llm | StrOutputParser()
-    chain = chain.with_config({"run_name":"ai_impact_chain"})
-
-    return chain
-
-def make_ai_impact_node(llm):
-
-    ai_impact_chain = make_ai_impact_chain(llm)
-
-    async def answer_ai_impact(state,config):
-        answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
-        return {"answer":answer}
-
-    return answer_ai_impact
climateqa/engine/chains/answer_chitchat.py DELETED
@@ -1,56 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-
-
-chitchat_prompt_template = """
-You are ClimateQ&A, a helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
-Always stay true to climate and nature science and do not make up information.
-If you do not know the answer, just say you do not know.
-
-## Guidelines
-- If it's a conversational question, you can normally chat with the user
-- If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
-- If the user asks if you speak any language, you can say you speak all languages :)
-- If the user asks about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
-- If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
-- State clearly that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific literature
-- If relevant you can propose up to 3 examples of questions they could ask from the IPCC or IPBES reports from the examples below
-- Always answer in the original language of the question
-
-## Examples of questions you can suggest (in the original language of the question)
-"What evidence do we have of climate change?",
-"Are human activities causing global warming?",
-"What are the impacts of climate change?",
-"Can climate change be reversed?",
-"What is the difference between climate change and global warming?",
-"""
-
-
-def make_chitchat_chain(llm):
-
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", chitchat_prompt_template),
-        ("user", "{question}")
-    ])
-
-    chain = prompt | llm | StrOutputParser()
-    chain = chain.with_config({"run_name":"chitchat_chain"})
-
-    return chain
-
-
-
-def make_chitchat_node(llm):
-
-    chitchat_chain = make_chitchat_chain(llm)
-
-    async def answer_chitchat(state,config):
-        print("---- Answer chitchat ----")
-
-        answer = await chitchat_chain.ainvoke({"question":state["user_input"]},config)
-        state["answer"] = answer
-        return state
-        # return {"answer":answer}
-
-    return answer_chitchat
-
climateqa/engine/chains/chitchat_categorization.py DELETED
@@ -1,43 +0,0 @@
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- class IntentCategorizer(BaseModel):
-     """Analyzing the user message input"""
-
-     environment: bool = Field(
-         description="Return 'True' if the question relates to climate change, the environment, nature, etc. (Example: should I eat fish?). Return 'False' if the question is just chit chat or not related to the environment or climate change.",
-     )
-
-
- def make_chitchat_intent_categorization_chain(llm):
-
-     openai_functions = [convert_to_openai_function(IntentCategorizer)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"IntentCategorizer"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_chitchat_intent_categorization_node(llm):
-
-     categorization_chain = make_chitchat_intent_categorization_chain(llm)
-
-     def categorize_message(state):
-         output = categorization_chain.invoke({"input": state["user_input"]})
-         print(f"\n\nChit chat output intent categorization: {output}\n")
-         state["search_graphs_chitchat"] = output["environment"]
-         print(f"\n\nChit chat output intent categorization: {state}\n")
-         return state
-
-     return categorize_message
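The boolean flag set here is what downstream routing keys on. A hypothetical illustration of that gate (the edge names below are illustrative only, not the app's actual graph wiring):

    def route_after_chitchat(state: dict) -> str:
        # If the small talk still touched the environment, also fetch recommended graphs
        return "graph_retriever" if state.get("search_graphs_chitchat") else "__end__"

    assert route_after_chitchat({"search_graphs_chitchat": True}) == "graph_retriever"
    assert route_after_chitchat({}) == "__end__"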
climateqa/engine/chains/graph_retriever.py DELETED
@@ -1,130 +0,0 @@
- import sys
- import os
- from contextlib import contextmanager
-
- from ..reranker import rerank_docs
- from ..graph_retriever import retrieve_graphs # GraphRetriever
- from ...utils import remove_duplicates_keep_highest_score
-
-
- def divide_into_parts(target, parts):
-     # Base value for each part
-     base = target // parts
-     # Remainder to distribute
-     remainder = target % parts
-     # List to hold the result
-     result = []
-
-     for i in range(parts):
-         if i < remainder:
-             # These parts get base value + 1
-             result.append(base + 1)
-         else:
-             # The rest get the base value
-             result.append(base)
-
-     return result
-
-
- @contextmanager
- def suppress_output():
-     # Open a null device
-     with open(os.devnull, 'w') as devnull:
-         # Store the original stdout and stderr
-         old_stdout = sys.stdout
-         old_stderr = sys.stderr
-         # Redirect stdout and stderr to the null device
-         sys.stdout = devnull
-         sys.stderr = devnull
-         try:
-             yield
-         finally:
-             # Restore stdout and stderr
-             sys.stdout = old_stdout
-             sys.stderr = old_stderr
-
-
- def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
-
-     async def node_retrieve_graphs(state):
-         print("---- Retrieving graphs ----")
-
-         POSSIBLE_SOURCES = ["IEA", "OWID"]
-         # questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[] else [state["query"]]
-         questions = state["questions_list"] if state["questions_list"] is not None and state["questions_list"]!=[] else [state["query"]]
-
-         # sources_input = state["sources_input"]
-         sources_input = ["auto"]
-
-         auto_mode = "auto" in sources_input
-
-         # There are several options to get the final top k
-         # Option 1 - Get 100 documents by question and rerank by question
-         # Option 2 - Get 100/n documents by question and rerank the total
-         if rerank_by_question:
-             k_by_question = divide_into_parts(k_final,len(questions))
-
-         docs = []
-
-         for i,q in enumerate(questions):
-
-             question = q["question"] if isinstance(q, dict) else q
-
-             print(f"Subquestion {i}: {question}")
-
-             # If auto mode, we use all sources
-             if auto_mode:
-                 sources = POSSIBLE_SOURCES
-             # Otherwise, we use the config
-             else:
-                 sources = sources_input
-
-             if any([x in POSSIBLE_SOURCES for x in sources]):
-
-                 sources = [x for x in sources if x in POSSIBLE_SOURCES]
-
-                 # Search the document store using the retriever
-                 docs_question = await retrieve_graphs(
-                     query = question,
-                     vectorstore = vectorstore,
-                     sources = sources,
-                     k_total = k_before_reranking,
-                     threshold = 0.5,
-                 )
-                 # docs_question = retriever.get_relevant_documents(question)
-
-                 # Rerank
-                 if reranker is not None and docs_question!=[]:
-                     with suppress_output():
-                         docs_question = rerank_docs(reranker,docs_question,question)
-                 else:
-                     # Add a default reranking score
-                     for doc in docs_question:
-                         doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-
-                 # If rerank by question we select the top documents for each question
-                 if rerank_by_question:
-                     docs_question = docs_question[:k_by_question[i]]
-
-                 # Add sources used in the metadata
-                 for doc in docs_question:
-                     doc.metadata["sources_used"] = sources
-
-                 print(f"{len(docs_question)} graphs retrieved for subquestion {i + 1}: {docs_question}")
-
-                 docs.extend(docs_question)
-
-             else:
-                 print(f"There are no graphs which match the sources filtered on. Sources filtered on: {sources}. Sources available: {POSSIBLE_SOURCES}.")
-
-         # Remove duplicates and keep the duplicate document with the highest reranking score
-         docs = remove_duplicates_keep_highest_score(docs)
-
-         # Sorting the list in descending order by rerank_score
-         # Then select the top k
-         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
-         docs = docs[:k_final]
-
-         return {"recommended_content": docs}
-
-     return node_retrieve_graphs
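divide_into_parts is the budget-splitting helper used throughout the retriever nodes: it spreads a top-k budget as evenly as possible across sub-questions, handing the remainder to the first ones. A quick check of its behavior:

    assert divide_into_parts(15, 4) == [4, 4, 4, 3]   # 15 // 4 = 3, remainder 3
    assert divide_into_parts(10, 2) == [5, 5]
    assert sum(divide_into_parts(15, 4)) == 15        # the budget is always fully allocated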
climateqa/engine/chains/intent_categorization.py DELETED
@@ -1,90 +0,0 @@
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- class IntentCategorizer(BaseModel):
-     """Analyzing the user message input"""
-
-     language: str = Field(
-         description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
-         default="English",
-     )
-     intent: str = Field(
-         enum=[
-             "ai_impact",
-             # "geo_info",
-             # "esg",
-             "search",
-             "chitchat",
-         ],
-         description="""
-         Categorize the user input in one of the following categories
-
-         Examples:
-         - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
-         - search = Searching for any question about climate change, energy, biodiversity, nature, and everything we can find in the IPCC or IPBES reports or scientific papers
-         - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
-         """,
-         # - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
-         # - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
-     )
-
-
- def make_intent_categorization_chain(llm):
-
-     openai_functions = [convert_to_openai_function(IntentCategorizer)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"IntentCategorizer"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze, translate and categorize the user input message using the function provided. Categorize the user input as ai_impact ONLY if it is related to Artificial Intelligence, search if it is related to the environment, climate change, energy, biodiversity, nature, etc. and chitchat if it is just general conversation."),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_intent_categorization_node(llm):
-
-     categorization_chain = make_intent_categorization_chain(llm)
-
-     def categorize_message(state):
-         print("---- Categorize_message ----")
-
-         output = categorization_chain.invoke({"input": state["user_input"]})
-         print(f"\n\nOutput intent categorization: {output}\n")
-         if "language" not in output: output["language"] = "English"
-         output["query"] = state["user_input"]
-         return output
-
-     return categorize_message
-
-
- # SAMPLE_QUESTIONS = [
- #     "Est-ce que l'IA a un impact sur l'environnement ?",
- #     "Que dit le GIEC sur l'impact de l'IA",
- #     "Qui sont les membres du GIEC",
- #     "What is the impact of El Nino ?",
- #     "Yo",
- #     "Hello ça va bien ?",
- #     "Par qui as tu été créé ?",
- #     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
- #     "Which industries have the highest GHG emissions?",
- #     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
- #     "Are human activities causing global warming?",
- #     "What is the motivation behind mining the deep seabed?",
- #     "Tu peux m'écrire un poème sur le changement climatique ?",
- #     "Tu peux m'écrire un poème sur les bonbons ?",
- #     "What will be the temperature in 2100 in Strasbourg?",
- #     "C'est quoi le lien entre biodiversity and changement climatique ?",
- # ]
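A minimal sketch of invoking this chain, assuming a function-calling OpenAI chat model and a configured OPENAI_API_KEY (the exact output values depend on the model):

    from langchain_openai import ChatOpenAI  # assumes the langchain-openai package is installed

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    chain = make_intent_categorization_chain(llm)
    output = chain.invoke({"input": "Comment l'IA affecte-t-elle l'environnement ?"})
    # Expected shape: {"language": "French", "intent": "ai_impact"}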
climateqa/engine/chains/keywords_extraction.py DELETED
@@ -1,40 +0,0 @@
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- class KeywordExtraction(BaseModel):
-     """
-     Analyzing the user query to extract keywords to feed a search engine
-     """
-
-     keywords: List[str] = Field(
-         description="""
-         Extract the keywords from the user query to feed a search engine as a list
-         Prefer general keywords; avoid overly specific ones
-         Maximum 3 keywords
-
-         Examples:
-         - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
-         - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
-         - "Is climate change a hoax" -> ["climate change","hoax"]
-         """
-     )
-
-
- def make_keywords_extraction_chain(llm):
-
-     openai_functions = [convert_to_openai_function(KeywordExtraction)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"KeywordExtraction"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
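The parsed output is a plain dict, which retrieve_papers.py joins into a boolean search string. A usage sketch, reusing the same kind of function-calling model as in the sketch above:

    chain = make_keywords_extraction_chain(llm)  # llm as in the earlier sketch
    output = chain.invoke({"input": "How will El Nino be impacted by climate change?"})
    # e.g. {"keywords": ["el nino", "climate change"]}
    search_query = " AND ".join(output["keywords"])  # -> "el nino AND climate change"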
climateqa/engine/chains/query_transformation.py DELETED
@@ -1,298 +0,0 @@
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
- # OLD QUERY ANALYSIS
- # keywords: List[str] = Field(
- #     description="""
- #     Extract the keywords from the user query to feed a search engine as a list
- #     Maximum 3 keywords
-
- #     Examples:
- #     - "What is the impact of deep sea mining ?" -> deep sea mining
- #     - "How will El Nino be impacted by climate change" -> el nino;climate change
- #     - "Is climate change a hoax" -> climate change;hoax
- #     """
- # )
-
- # alternative_queries: List[str] = Field(
- #     description="""
- #     Generate alternative search questions from the user query to feed a search engine
- #     """
- # )
-
- # step_back_question: str = Field(
- #     description="""
- #     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
- #     This question should help you get more context and information about the user query
- #     """
- # )
- # - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific literature
-
- # topics: List[Literal[
- #     "Climate change",
- #     "Biodiversity",
- #     "Energy",
- #     "Decarbonization",
- #     "Climate science",
- #     "Nature",
- #     "Climate policy and justice",
- #     "Oceans",
- #     "Deep sea mining",
- #     "ESG and regulations",
- #     "CSRD",
- # ]] = Field(
- #     ...,
- #     description = """
- #     Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
- #     """,
- # )
- # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
- # location:Location
-
-
- ROUTING_INDEX = {
-     "IPx":["IPCC", "IPBES", "IPOS"],
-     "POC": ["AcclimaTerra", "PCAET","Biodiv"],
-     "OpenAlex":["OpenAlex"],
- }
-
- POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
-
- # Prompt from the original paper https://arxiv.org/pdf/2305.14283
- # Query Rewriting for Retrieval-Augmented Large Language Models
- class QueryDecomposition(BaseModel):
-     """
-     Decompose the user query into smaller parts to think step by step to answer this question
-     Act as a simple planning agent
-     """
-
-     questions: List[str] = Field(
-         description="""
-         Think step by step to answer this question, and provide one or several search engine questions in the provided language for knowledge that you need.
-         Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find in the IPCC reports and scientific literature
-         - If it's already a standalone and explicit question, just return the reformulated question for the search engine
-         - If you need to decompose the question, output a list of maximum 2 to 3 questions
-         """
-     )
-
-
- class Location(BaseModel):
-     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
-     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
-
- class QueryTranslation(BaseModel):
-     """Translate the query into a given language"""
-
-     question : str = Field(
-         description="""
-         Translate the questions into the given language
-         If the question is already in the given language, just return the same question
-         """,
-     )
-
-
- class QueryAnalysis(BaseModel):
-     """
-     Analyze the user query to extract the relevant sources
-
-     Deprecated:
-     Analyzing the user query to extract topics, sources and date
-     Also do query expansion to get alternative search queries
-     Also provide simple keywords to feed a search engine
-     """
-
-     sources: List[Literal["IPCC", "IPBES", "IPOS", "AcclimaTerra", "PCAET","Biodiv"]] = Field( #,"OpenAlex"]] = Field(
-         ...,
-         description="""
-         Given a user question choose which documents would be most relevant for answering their question,
-         - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
-         - IPBES is for questions about biodiversity and nature
-         - IPOS is for questions about the ocean and deep sea mining
-         - AcclimaTerra is for questions about any specific place in, or close to, the French region "Nouvelle-Aquitaine"
-         - PCAET is the Plan Climat Energie Territorial for the city of Paris
-         - Biodiv is the Biodiversity plan for the city of Paris
-         """,
-     )
-
-
- def make_query_decomposition_chain(llm):
-     """Chain to decompose a query into smaller parts in order to think step by step before answering it
-
-     Args:
-         llm: The language model used for the decomposition.
-
-     Returns:
-         A runnable chain mapping {"input": ...} to {"questions": [...]}.
-     """
-
-     openai_functions = [convert_to_openai_function(QueryDecomposition)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_query_analysis_chain(llm):
-     """Analyze the user query to extract the relevant sources"""
-
-     openai_functions = [convert_to_openai_function(QueryAnalysis)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze the user input message using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_query_translation_chain(llm):
-     """Translate the user query into the given language"""
-
-     openai_functions = [convert_to_openai_function(QueryTranslation)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryTranslation"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, translate the question into {language}"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
- def group_by_sources_types(sources):
-     sources_types = {}
-     IPx_sources = ["IPCC", "IPBES", "IPOS"]
-     local_sources = ["AcclimaTerra", "PCAET","Biodiv"]
-     if any(source in IPx_sources for source in sources):
-         sources_types["IPx"] = list(set(sources).intersection(IPx_sources))
-     if any(source in local_sources for source in sources):
-         sources_types["POC"] = list(set(sources).intersection(local_sources))
-     return sources_types
-
-
- def make_query_transform_node(llm,k_final=15):
-     """
-     Creates a query transformation node that processes and transforms a given query state.
-     Args:
-         llm: The language model to be used for query decomposition and rewriting.
-         k_final (int, optional): The final number of questions to be generated. Defaults to 15.
-     Returns:
-         function: A function that takes a query state and returns a transformed state.
-     The returned function performs the following steps:
-         1. Checks if the query should be processed in auto mode based on the state.
-         2. Retrieves the input sources from the state or defaults to a predefined routing index.
-         3. Decomposes the query using the decomposition chain.
-         4. Analyzes each decomposed question using the rewriter chain.
-         5. Ensures that the sources returned by the language model are valid.
-         6. Explodes the questions into multiple questions with different sources based on the mode.
-         7. Constructs a new state with the transformed questions and their respective sources.
-     """
-
-     decomposition_chain = make_query_decomposition_chain(llm)
-     query_analysis_chain = make_query_analysis_chain(llm)
-     query_translation_chain = make_query_translation_chain(llm)
-
-     def transform_query(state):
-         print("---- Transform query ----")
-
-         auto_mode = state.get("sources_auto", True)
-         sources_input = state.get("sources_input", ROUTING_INDEX["IPx"])
-
-         new_state = {}
-
-         # Decomposition
-         decomposition_output = decomposition_chain.invoke({"input":state["query"]})
-         new_state.update(decomposition_output)
-
-         # Query Analysis
-         questions = []
-         for question in new_state["questions"]:
-             question_state = {"question":question}
-             query_analysis_output = query_analysis_chain.invoke({"input":question})
-
-             # WARNING: the LLM should always return something; guard against missing or invalid sources
-             if not query_analysis_output["sources"] or not all(source in ["IPCC", "IPBES", "IPOS","AcclimaTerra", "PCAET","Biodiv"] for source in query_analysis_output["sources"]):
-                 query_analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]
-
-             sources_types = group_by_sources_types(query_analysis_output["sources"])
-             for source_type,sources in sources_types.items():
-                 question_state = {
-                     "question":question,
-                     "sources":sources,
-                     "source_type":source_type
-                 }
-
-                 questions.append(question_state)
-
-         # Translate each question into the language of its documents
-         for q in questions:
-             if q["source_type"]=="IPx":
-                 translation_output = query_translation_chain.invoke({"input":q["question"],"language":"English"})
-                 q["question"] = translation_output["question"]
-             elif q["source_type"]=="POC":
-                 translation_output = query_translation_chain.invoke({"input":q["question"],"language":"French"})
-                 q["question"] = translation_output["question"]
-
-         # Explode the questions into multiple questions with different sources
-         new_questions = []
-         for q in questions:
-             question,sources,source_type = q["question"],q["sources"], q["source_type"]
-
-             # If not auto mode we take the configuration
-             if not auto_mode:
-                 sources = sources_input
-
-             for index,index_sources in ROUTING_INDEX.items():
-                 selected_sources = list(set(sources).intersection(index_sources))
-                 if len(selected_sources) > 0:
-                     new_questions.append({"question":question,"sources":selected_sources,"index":index, "source_type":source_type})
-
-         # # Add the number of questions to search
-         # k_by_question = k_final // len(new_questions)
-         # for q in new_questions:
-         #     q["k"] = k_by_question
-
-         # new_state["questions"] = new_questions
-         # new_state["remaining_questions"] = new_questions
-
-         n_questions = {
-             "total":len(new_questions),
-             "IPx":len([q for q in new_questions if q["index"] == "IPx"]),
-             "POC":len([q for q in new_questions if q["index"] == "POC"]),
-         }
-
-         new_state = {
-             "questions_list":new_questions,
-             "n_questions":n_questions,
-             "handled_questions_index":[],
-         }
-         return new_state
-
-     return transform_query
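group_by_sources_types is pure Python and easy to sanity-check: it buckets the detected sources into the two routing indexes defined in ROUTING_INDEX (order within each bucket is unspecified because sets are used, so the checks below use single-element buckets):

    grouped = group_by_sources_types(["IPCC", "PCAET"])
    assert grouped == {"IPx": ["IPCC"], "POC": ["PCAET"]}
    assert group_by_sources_types(["IPBES"]) == {"IPx": ["IPBES"]}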
climateqa/engine/chains/retrieve_documents.py DELETED
@@ -1,465 +0,0 @@
- import sys
- import os
- from contextlib import contextmanager
-
- from langchain_core.tools import tool
- from langchain_core.runnables import chain
- from langchain_core.runnables import RunnableParallel, RunnablePassthrough
- from langchain_core.runnables import RunnableLambda
-
- from ..reranker import rerank_docs, rerank_and_sort_docs
- # from ...knowledge.retriever import ClimateQARetriever
- from ...knowledge.openalex import OpenAlexRetriever
- from .keywords_extraction import make_keywords_extraction_chain
- from ..utils import log_event
- from langchain_core.vectorstores import VectorStore
- from langchain_core.documents.base import Document
- import asyncio
-
- from typing import Any, Dict, List, Tuple
-
-
- def divide_into_parts(target, parts):
-     # Base value for each part
-     base = target // parts
-     # Remainder to distribute
-     remainder = target % parts
-     # List to hold the result
-     result = []
-
-     for i in range(parts):
-         if i < remainder:
-             # These parts get base value + 1
-             result.append(base + 1)
-         else:
-             # The rest get the base value
-             result.append(base)
-
-     return result
-
-
- @contextmanager
- def suppress_output():
-     # Open a null device
-     with open(os.devnull, 'w') as devnull:
-         # Store the original stdout and stderr
-         old_stdout = sys.stdout
-         old_stderr = sys.stderr
-         # Redirect stdout and stderr to the null device
-         sys.stdout = devnull
-         sys.stderr = devnull
-         try:
-             yield
-         finally:
-             # Restore stdout and stderr
-             sys.stdout = old_stdout
-             sys.stderr = old_stderr
-
-
- @tool
- def query_retriever(question):
-     """Just a dummy tool to simulate the retriever query"""
-     return question
-
- def _add_sources_used_in_metadata(docs,sources,question,index):
-     for doc in docs:
-         doc.metadata["sources_used"] = sources
-         doc.metadata["question_used"] = question
-         doc.metadata["index_used"] = index
-     return docs
-
- def _get_k_summary_by_question(n_questions):
-     if n_questions == 0:
-         return 0
-     elif n_questions == 1:
-         return 5
-     elif n_questions == 2:
-         return 3
-     elif n_questions == 3:
-         return 2
-     else:
-         return 1
-
- def _get_k_images_by_question(n_questions):
-     if n_questions == 0:
-         return 0
-     elif n_questions == 1:
-         return 7
-     elif n_questions == 2:
-         return 5
-     elif n_questions == 3:
-         return 3
-     else:
-         return 1
-
- def _add_metadata_and_score(docs: List) -> List[Document]:
-     # Add the similarity score to each document's metadata
-     docs_with_metadata = []
-     for i,(doc,score) in enumerate(docs):
-         doc.page_content = doc.page_content.replace("\r\n"," ")
-         doc.metadata["similarity_score"] = score
-         doc.metadata["content"] = doc.page_content
-         if doc.metadata["page_number"] != "N/A":
-             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
-         else:
-             doc.metadata["page_number"] = 1
-         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
-         docs_with_metadata.append(doc)
-     return docs_with_metadata
-
- def remove_duplicates_chunks(docs):
-     # Remove duplicates or almost duplicates
-     docs = sorted(docs,key=lambda x: x[1],reverse=True)
-     seen = set()
-     result = []
-     for doc in docs:
-         if doc[0].page_content not in seen:
-             seen.add(doc[0].page_content)
-             result.append(doc)
-     return result
-
- async def get_POC_relevant_documents(
-     query: str,
-     vectorstore:VectorStore,
-     sources:list = ["Acclimaterra","PCAET","Plan Biodiversite"],
-     search_figures:bool = False,
-     search_only:bool = False,
-     k_documents:int = 10,
-     threshold:float = 0.6,
-     k_images: int = 5,
-     reports:list = [],
-     min_size:int = 200,
- ) :
-     # Prepare base search kwargs
-     filters = {}
-     docs_question = []
-     docs_images = []
-
-     # TODO add source selection
-     # if len(reports) > 0:
-     #     filters["short_name"] = {"$in":reports}
-     # else:
-     #     filters["source"] = { "$in": sources}
-
-     filters_text = {
-         **filters,
-         "chunk_type":"text",
-         # "report_type": {}, # TODO to be completed to choose the right documents / chapters according to the analysis of the question
-     }
-
-     docs_question = vectorstore.similarity_search_with_score(query=query,filter = filters_text,k = k_documents)
-     # remove duplicates or almost duplicates
-     docs_question = remove_duplicates_chunks(docs_question)
-     docs_question = [x for x in docs_question if x[1] > threshold]
-
-     if search_figures:
-         # Images
-         filters_image = {
-             **filters,
-             "chunk_type":"image"
-         }
-         docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
-
-     docs_question, docs_images = _add_metadata_and_score(docs_question), _add_metadata_and_score(docs_images)
-
-     docs_question = [x for x in docs_question if len(x.page_content) > min_size]
-
-     return {
-         "docs_question" : docs_question,
-         "docs_images" : docs_images
-     }
-
-
- async def get_IPCC_relevant_documents(
-     query: str,
-     vectorstore:VectorStore,
-     sources:list = ["IPCC","IPBES","IPOS"],
-     search_figures:bool = False,
-     reports:list = [],
-     threshold:float = 0.6,
-     k_summary:int = 3,
-     k_total:int = 10,
-     k_images: int = 5,
-     namespace:str = "vectors",
-     min_size:int = 200,
-     search_only:bool = False,
- ) :
-
-     # Check if all elements in the list are either IPCC or IPBES
-     assert isinstance(sources,list)
-     assert sources
-     assert all([x in ["IPCC","IPBES","IPOS"] for x in sources])
-     assert k_total > k_summary, "k_total should be greater than k_summary"
-
-     # Prepare base search kwargs
-     filters = {}
-
-     if len(reports) > 0:
-         filters["short_name"] = {"$in":reports}
-     else:
-         filters["source"] = { "$in": sources}
-
-     # INIT
-     docs_summaries = []
-     docs_full = []
-     docs_images = []
-
-     if search_only:
-         # Only search for images if search_only is True
-         if search_figures:
-             filters_image = {
-                 **filters,
-                 "chunk_type":"image"
-             }
-             docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
-             docs_images = _add_metadata_and_score(docs_images)
-     else:
-         # Regular search flow for text and optionally images
-         # Search for k_summary documents in the summaries dataset
-         filters_summaries = {
-             **filters,
-             "chunk_type":"text",
-             "report_type": { "$in":["SPM"]},
-         }
-
-         docs_summaries = vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = k_summary)
-         docs_summaries = [x for x in docs_summaries if x[1] > threshold]
-
-         # Search for k_total documents in the full reports dataset
-         filters_full = {
-             **filters,
-             "chunk_type":"text",
-             "report_type": { "$nin":["SPM"]},
-         }
-         docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_total)
-
-         if search_figures:
-             # Images
-             filters_image = {
-                 **filters,
-                 "chunk_type":"image"
-             }
-             docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
-
-         docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
-
-         # Filter out chunks whose length is below the threshold
-         docs_summaries = [x for x in docs_summaries if len(x.page_content) > min_size]
-         docs_full = [x for x in docs_full if len(x.page_content) > min_size]
-
-     return {
-         "docs_summaries" : docs_summaries,
-         "docs_full" : docs_full,
-         "docs_images" : docs_images,
-     }
-
-
- def concatenate_documents(index, source_type, docs_question_dict, k_by_question, k_summary_by_question, k_images_by_question):
-     # Keep the right number of documents - The k_summary documents from SPM are placed in front
-     if source_type == "IPx":
-         docs_question = docs_question_dict["docs_summaries"][:k_summary_by_question] + docs_question_dict["docs_full"][:(k_by_question - k_summary_by_question)]
-     elif source_type == "POC":
-         docs_question = docs_question_dict["docs_question"][:k_by_question]
-     else:
-         raise ValueError("source_type should be either IPx or POC")
-     # docs_question = [doc for key in docs_question_dict.keys() for doc in docs_question_dict[key]][:(k_by_question)]
-
-     images_question = docs_question_dict["docs_images"][:k_images_by_question]
-
-     return docs_question, images_question
-
-
- # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
- # @chain
- async def retrieve_documents(
-     current_question: Dict[str, Any],
-     config: Dict[str, Any],
-     source_type: str,
-     vectorstore: VectorStore,
-     reranker: Any,
-     search_figures: bool = False,
-     search_only: bool = False,
-     reports: list = [],
-     rerank_by_question: bool = True,
-     k_images_by_question: int = 5,
-     k_before_reranking: int = 100,
-     k_by_question: int = 5,
-     k_summary_by_question: int = 3
- ) -> Tuple[List[Document], List[Document]]:
-     """
-     Retrieve and rerank the documents for one question, based on the question text and its selected sources.
-
-     Args:
-         current_question (dict): The question being processed, with its sources, index and source_type.
-         config (dict): Configuration settings for logging and callbacks.
-         source_type (str): Either "IPx" or "POC", selecting the retrieval flow.
-         vectorstore (VectorStore): The vector store used to retrieve relevant documents.
-         reranker: The reranker used to rerank the retrieved documents.
-         search_figures (bool, optional): Whether to also retrieve image chunks. Defaults to False.
-         search_only (bool, optional): If True, only search for images. Defaults to False.
-         reports (list, optional): Optional list of report short names to filter on. Defaults to [].
-         rerank_by_question (bool, optional): Whether to rerank documents per question. Defaults to True.
-         k_images_by_question (int, optional): The number of image documents to retrieve. Defaults to 5.
-         k_before_reranking (int, optional): The number of documents to retrieve before reranking. Defaults to 100.
-         k_by_question (int, optional): The number of documents to keep per question. Defaults to 5.
-         k_summary_by_question (int, optional): The number of summary (SPM) documents to keep per question. Defaults to 3.
-     Returns:
-         Tuple[List[Document], List[Document]]: The reranked text documents and image documents for this question.
-     """
-     sources = current_question["sources"]
-     question = current_question["question"]
-     index = current_question["index"]
-     source_type = current_question["source_type"]
-
-     print(f"Retrieve documents for question: {question}")
-     await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
-
-     print(f"""---- Retrieve documents from {current_question["source_type"]} ----""")
-
-     if source_type == "IPx":
-         docs_question_dict = await get_IPCC_relevant_documents(
-             query = question,
-             vectorstore=vectorstore,
-             search_figures = search_figures,
-             sources = sources,
-             min_size = 200,
-             k_summary = k_before_reranking-1,
-             k_total = k_before_reranking,
-             k_images = k_images_by_question,
-             threshold = 0.5,
-             search_only = search_only,
-             reports = reports,
-         )
-
-     elif source_type == "POC":
-         docs_question_dict = await get_POC_relevant_documents(
-             query = question,
-             vectorstore=vectorstore,
-             search_figures = search_figures,
-             sources = sources,
-             threshold = 0.5,
-             search_only = search_only,
-             reports = reports,
-             min_size= 200,
-             k_documents= k_before_reranking,
-             k_images= k_by_question
-         )
-
-     # Rerank
-     if reranker is not None and rerank_by_question:
-         with suppress_output():
-             for key in docs_question_dict.keys():
-                 docs_question_dict[key] = rerank_and_sort_docs(reranker,docs_question_dict[key],question)
-     else:
-         # Add a default reranking score to every retrieved document
-         for key in docs_question_dict.keys():
-             for doc in docs_question_dict[key]:
-                 doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-
-     # Keep the right number of documents
-     docs_question, images_question = concatenate_documents(index, source_type, docs_question_dict, k_by_question, k_summary_by_question, k_images_by_question)
-
-     # Rerank the documents to put the most relevant in front
-     if reranker is not None and rerank_by_question:
-         docs_question = rerank_and_sort_docs(reranker, docs_question, question)
-
-     # Add sources used in the metadata
-     docs_question = _add_sources_used_in_metadata(docs_question,sources,question,index)
-     images_question = _add_sources_used_in_metadata(images_question,sources,question,index)
-
-     return docs_question, images_question
-
-
- async def retrieve_documents_for_all_questions(state, config, source_type, to_handle_questions_index, vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
-     """
-     Retrieve documents in parallel for all questions.
-     """
-     # to_handle_questions_index = [x for x in state["questions_list"] if x["source_type"] == "IPx"]
-
-     # TODO: split the questions by source type in the state, and add conditions on the number of questions handled per source type
-     docs = state.get("documents", [])
-     related_content = state.get("related_content", [])
-     search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
-     search_only = state["search_only"]
-     reports = state["reports"]
-
-     k_by_question = k_final // state["n_questions"]["total"]
-     k_summary_by_question = _get_k_summary_by_question(state["n_questions"]["total"])
-     k_images_by_question = _get_k_images_by_question(state["n_questions"]["total"])
-     k_before_reranking=100
-
-     tasks = [
-         retrieve_documents(
-             current_question=question,
-             config=config,
-             source_type=source_type,
-             vectorstore=vectorstore,
-             reranker=reranker,
-             search_figures=search_figures,
-             search_only=search_only,
-             reports=reports,
-             rerank_by_question=rerank_by_question,
-             k_images_by_question=k_images_by_question,
-             k_before_reranking=k_before_reranking,
-             k_by_question=k_by_question,
-             k_summary_by_question=k_summary_by_question
-         )
-         for i, question in enumerate(state["questions_list"]) if i in to_handle_questions_index
-     ]
-     results = await asyncio.gather(*tasks)
-     # Combine results
-     new_state = {"documents": [], "related_contents": [], "handled_questions_index": to_handle_questions_index}
-     for docs_question, images_question in results:
-         new_state["documents"].extend(docs_question)
-         new_state["related_contents"].extend(images_question)
-     return new_state
-
- def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-
-     async def retrieve_IPx_docs(state, config):
-         source_type = "IPx"
-         IPx_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
-
-         # return {"documents":[], "related_contents": [], "handled_questions_index": list(range(len(state["questions_list"])))} # TODO Remove
-
-         state = await retrieve_documents_for_all_questions(
-             state=state,
-             config=config,
-             source_type=source_type,
-             to_handle_questions_index=IPx_questions_index,
-             vectorstore=vectorstore,
-             reranker=reranker,
-             rerank_by_question=rerank_by_question,
-             k_final=k_final,
-             k_before_reranking=k_before_reranking,
-         )
-         return state
-
-     return retrieve_IPx_docs
-
-
- def make_POC_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-
-     async def retrieve_POC_docs_node(state, config):
-         if "POC region" not in state["relevant_content_sources_selection"]:
-             return {}
-
-         source_type = "POC"
-         POC_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
-
-         state = await retrieve_documents_for_all_questions(
-             state=state,
-             config=config,
-             source_type=source_type,
-             to_handle_questions_index=POC_questions_index,
-             vectorstore=vectorstore,
-             reranker=reranker,
-             rerank_by_question=rerank_by_question,
-             k_final=k_final,
-             k_before_reranking=k_before_reranking,
-         )
-         return state
-
-     return retrieve_POC_docs_node
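The per-question budgets shrink as the number of sub-questions grows, which is easy to verify directly from the helpers above:

    assert [_get_k_summary_by_question(n) for n in range(5)] == [0, 5, 3, 2, 1]
    assert [_get_k_images_by_question(n) for n in range(5)] == [0, 7, 5, 3, 1]
    # With k_final=15 and 3 sub-questions, each question gets 15 // 3 = 5 documents,
    # of which 2 come from the summary (SPM) dataset.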
climateqa/engine/chains/retrieve_papers.py DELETED
@@ -1,95 +0,0 @@
- from climateqa.engine.keywords import make_keywords_chain
- from climateqa.engine.llm import get_llm
- from climateqa.knowledge.openalex import OpenAlex
- from climateqa.engine.chains.answer_rag import make_rag_papers_chain
- from front.utils import make_html_papers
- from climateqa.engine.reranker import get_reranker
-
- oa = OpenAlex()
-
- llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
- reranker = get_reranker("nano")
-
-
- papers_cols_widths = {
-     "id":100,
-     "title":300,
-     "doi":100,
-     "publication_year":100,
-     "abstract":500,
-     "is_oa":50,
- }
-
- papers_cols = list(papers_cols_widths.keys())
- papers_cols_widths = list(papers_cols_widths.values())
-
-
- def generate_keywords(query):
-     chain = make_keywords_chain(llm)
-     keywords = chain.invoke(query)
-     keywords = " AND ".join(keywords["keywords"])
-     return keywords
-
-
- async def find_papers(query,after, relevant_content_sources_selection, reranker= reranker):
-     if "Papers (OpenAlex)" in relevant_content_sources_selection:
-         summary = ""
-         keywords = generate_keywords(query)
-         df_works = oa.search(keywords,after = after)
-
-         print(f"Found {len(df_works)} papers")
-
-         if not df_works.empty:
-             df_works = df_works.dropna(subset=["abstract"])
-             df_works = df_works[df_works["abstract"] != ""].reset_index(drop = True)
-             df_works = oa.rerank(query,df_works,reranker)
-             df_works = df_works.sort_values("rerank_score",ascending=False)
-             docs_html = []
-             for i in range(min(10, len(df_works))):
-                 docs_html.append(make_html_papers(df_works, i))
-             docs_html = "".join(docs_html)
-             G = oa.make_network(df_works)
-
-             height = "750px"
-             network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
-             network_html = network.generate_html()
-
-             network_html = network_html.replace("'", "\"")
-             css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-             network_html = network_html + css_to_inject
-
-             network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-             display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-             allow-scripts allow-same-origin allow-popups
-             allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-             allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-
-             docs = df_works["content"].head(10).tolist()
-
-             df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
-             df_works["doc"] = df_works["doc"] + 1
-             df_works = df_works[papers_cols]
-
-             yield docs_html, network_html, summary
-
-             chain = make_rag_papers_chain(llm)
-             result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
-             path_answer = "/logs/StrOutputParser/streamed_output/-"
-
-             async for op in result:
-
-                 op = op.ops[0]
-
-                 if op['path'] == path_answer: # streamed answer token
-                     new_token = op['value'] # str
-                     summary += new_token
-                 else:
-                     continue
-                 yield docs_html, network_html, summary
-         else:
-             print("No papers found")
-     else:
-         yield "", "", ""
climateqa/engine/chains/retriever.py DELETED
@@ -1,126 +0,0 @@
- # import sys
- # import os
- # from contextlib import contextmanager
-
- # from ..reranker import rerank_docs
- # from ...knowledge.retriever import ClimateQARetriever
-
-
- # def divide_into_parts(target, parts):
- #     # Base value for each part
- #     base = target // parts
- #     # Remainder to distribute
- #     remainder = target % parts
- #     # List to hold the result
- #     result = []
-
- #     for i in range(parts):
- #         if i < remainder:
- #             # These parts get base value + 1
- #             result.append(base + 1)
- #         else:
- #             # The rest get the base value
- #             result.append(base)
-
- #     return result
-
-
- # @contextmanager
- # def suppress_output():
- #     # Open a null device
- #     with open(os.devnull, 'w') as devnull:
- #         # Store the original stdout and stderr
- #         old_stdout = sys.stdout
- #         old_stderr = sys.stderr
- #         # Redirect stdout and stderr to the null device
- #         sys.stdout = devnull
- #         sys.stderr = devnull
- #         try:
- #             yield
- #         finally:
- #             # Restore stdout and stderr
- #             sys.stdout = old_stdout
- #             sys.stderr = old_stderr
-
-
- # def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-
- #     def retrieve_documents(state):
-
- #         POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS"] # ,"OpenAlex"]
- #         questions = state["questions"]
-
- #         # Use sources from the user input or from the LLM detection
- #         if "sources_input" not in state or state["sources_input"] is None:
- #             sources_input = ["auto"]
- #         else:
- #             sources_input = state["sources_input"]
- #         auto_mode = "auto" in sources_input
-
- #         # There are several options to get the final top k
- #         # Option 1 - Get 100 documents by question and rerank by question
- #         # Option 2 - Get 100/n documents by question and rerank the total
- #         if rerank_by_question:
- #             k_by_question = divide_into_parts(k_final,len(questions))
-
- #         docs = []
-
- #         for i,q in enumerate(questions):
-
- #             sources = q["sources"]
- #             question = q["question"]
-
- #             # If auto mode, we use the sources detected by the LLM
- #             if auto_mode:
- #                 sources = [x for x in sources if x in POSSIBLE_SOURCES]
-
- #             # Otherwise, we use the config
- #             else:
- #                 sources = sources_input
-
- #             # Search the document store using the retriever
- #             # Configure high top k for further reranking step
- #             retriever = ClimateQARetriever(
- #                 vectorstore=vectorstore,
- #                 sources = sources,
- #                 # reports = ias_reports,
- #                 min_size = 200,
- #                 k_summary = k_summary,
- #                 k_total = k_before_reranking,
- #                 threshold = 0.5,
- #             )
- #             docs_question = retriever.get_relevant_documents(question)
-
- #             # Rerank
- #             if reranker is not None:
- #                 with suppress_output():
- #                     docs_question = rerank_docs(reranker,docs_question,question)
- #             else:
- #                 # Add a default reranking score
- #                 for doc in docs_question:
- #                     doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-
- #             # If rerank by question we select the top documents for each question
- #             if rerank_by_question:
- #                 docs_question = docs_question[:k_by_question[i]]
-
- #             # Add sources used in the metadata
- #             for doc in docs_question:
- #                 doc.metadata["sources_used"] = sources
-
- #             # Add to the list of docs
- #             docs.extend(docs_question)
-
- #         # Sorting the list in descending order by rerank_score
- #         # Then select the top k
- #         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
- #         docs = docs[:k_final]
-
- #         new_state = {"documents":docs}
- #         return new_state
-
- #     return retrieve_documents
climateqa/engine/chains/sample_router.py DELETED
@@ -1,66 +0,0 @@
- # from typing import List
- # from typing import Literal
- # from langchain.prompts import ChatPromptTemplate
- # from langchain_core.utils.function_calling import convert_to_openai_function
- # from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
- # # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
-
- # class Location(BaseModel):
- #     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
- #     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
-
- # class QueryAnalysis(BaseModel):
- #     """Analyzing the user query"""
-
- #     language: str = Field(
- #         description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
- #     )
- #     intent: str = Field(
- #         enum=[
- #             "Environmental impacts of AI",
- #             "Geolocated info about climate change",
- #             "Climate change",
- #             "Biodiversity",
- #             "Deep sea mining",
- #             "Chitchat",
- #         ],
- #         description="""
- #         Categorize the user query in one of the following categories,
-
- #         Examples:
- #         - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
- #         - Climate change: "What is radiative forcing", "How much will
- #         """,
- #     )
- #     sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
- #         ...,
- #         description="""
- #         Given a user question choose which documents would be most relevant for answering their question,
- #         - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
- #         - IPBES is for questions about biodiversity and nature
- #         - IPOS is for questions about the ocean and deep sea mining
- #         """,
- #     )
- #     date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
- #     location:Location
- #     # query: str = Field(
- #     #     description = """
- #     #     Translate to English and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
- #     #     The reformulated question will be used in a search engine
- #     #     By default, assume that the user is asking information about the last century,
- #     #     Use the following examples
-
- #     #     ### Examples:
- #     #     La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
- #     #     what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
- #     #     what are the main causes of climate change? -> What are the main causes of climate change in the last century?
-
- #     #     Question in English:
- #     #     """
- #     # )
-
- # openai_functions = [convert_to_openai_function(QueryAnalysis)]
- # llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
climateqa/engine/chains/set_defaults.py DELETED
@@ -1,13 +0,0 @@
- def set_defaults(state):
-     print("---- Setting defaults ----")
-
-     if not state["audience"] or state["audience"] is None:
-         state.update({"audience": "experts"})
-
-     sources_input = state["sources_input"] if "sources_input" in state else ["auto"]
-     state.update({"sources_input": sources_input})
-
-     # if not state["sources_input"] or state["sources_input"] is None:
-     #     state.update({"sources_input": ["auto"]})
-
-     return state
climateqa/engine/chains/translation.py DELETED
@@ -1,42 +0,0 @@
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- class Translation(BaseModel):
-     """Analyzing the user message input"""
-
-     translation: str = Field(
-         description="Translate the message input to English",
-     )
-
-
- def make_translation_chain(llm):
-
-     openai_functions = [convert_to_openai_function(Translation)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"Translation"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_translation_node(llm):
-     translation_chain = make_translation_chain(llm)
-
-     def translate_query(state):
-         print("---- Translate query ----")
-
-         user_input = state["user_input"]
-         translation = translation_chain.invoke({"input":user_input})
-         return {"query":translation["translation"]}
-
-     return translate_query
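A usage sketch, again assuming a function-calling model such as the ChatOpenAI instance from the earlier sketch (the translated string naturally depends on the model):

    node = make_translation_node(llm)
    state = {"user_input": "Quelles sont les causes du changement climatique ?"}
    print(node(state))
    # e.g. {"query": "What are the causes of climate change?"}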
climateqa/engine/embeddings.py CHANGED
@@ -2,7 +2,7 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
- def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
+ def get_embeddings_function(version = "v1.2"):
 
     if version == "v1.2":
 
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this
         # Best embedding model at a reasonable size at the moment (2023-11-22)
 
         model_name = "BAAI/bge-base-en-v1.5"
-         encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
+         encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
-             query_instruction=query_instruction,
+             query_instruction="Represent this sentence for searching relevant passages: "
         )
 
     else:
@@ -23,6 +23,3 @@ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
     return embeddings_function
-
-
-
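A minimal usage sketch for the simplified signature (illustrative only, not part of the diff; the query instruction is now fixed inside the helper):

from climateqa.engine.embeddings import get_embeddings_function

embeddings_function = get_embeddings_function(version="v1.2")
vector = embeddings_function.embed_query("impact of sea level rise on coastal cities")
print(len(vector))  # 768 for BAAI/bge-base-en-v1.5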
 
 
 
climateqa/engine/graph.py DELETED
@@ -1,333 +0,0 @@
- import sys
- import os
- from contextlib import contextmanager
-
- from langchain.schema import Document
- from langgraph.graph import END, StateGraph
- from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
-
- from typing_extensions import TypedDict
- from typing import List, Dict
-
- import operator
- from typing import Annotated
-
- from IPython.display import display, HTML, Image
-
- from .chains.answer_chitchat import make_chitchat_node
- from .chains.answer_ai_impact import make_ai_impact_node
- from .chains.query_transformation import make_query_transform_node
- from .chains.translation import make_translation_node
- from .chains.intent_categorization import make_intent_categorization_node
- from .chains.retrieve_documents import make_IPx_retriever_node, make_POC_retriever_node
- from .chains.answer_rag import make_rag_node
- from .chains.graph_retriever import make_graph_retriever_node
- from .chains.chitchat_categorization import make_chitchat_intent_categorization_node
- # from .chains.set_defaults import set_defaults
-
- class GraphState(TypedDict):
-     """
-     Represents the state of our graph.
-     """
-     user_input : str
-     language : str
-     intent : str
-     search_graphs_chitchat : bool
-     query: str
-     questions_list : List[dict]
-     handled_questions_index : Annotated[list[int], operator.add]
-     n_questions : int
-     answer: str
-     audience: str = "experts"
-     sources_input: List[str] = ["IPCC","IPBES"]
-     relevant_content_sources_selection: List[str] = ["Figures (IPCC/IPBES)"]
-     sources_auto: bool = True
-     min_year: int = 1960
-     max_year: int = None
-     documents: Annotated[List[Document], operator.add]
-     related_contents : Annotated[List[Document], operator.add]
-     recommended_content : List[Document]
-     search_only : bool = False
-     reports : List[str] = []
-
- def dummy(state):
-     return
-
- def search(state): #TODO
-     return
-
- def answer_search(state):#TODO
-     return
-
- def route_intent(state):
-     intent = state["intent"]
-     if intent in ["chitchat","esg"]:
-         return "answer_chitchat"
-     # elif intent == "ai_impact":
-     #     return "answer_ai_impact"
-     else:
-         # Search route
-         return "answer_climate"
-
- def chitchat_route_intent(state):
-     intent = state["search_graphs_chitchat"]
-     if intent is True:
-         return "retrieve_graphs_chitchat"
-     elif intent is False:
-         return END
-
- def route_translation(state):
-     if state["language"].lower() == "english":
-         return "transform_query"
-     else:
-         return "transform_query"
-     # return "translate_query" #TODO : add translation
-
-
- def route_based_on_relevant_docs(state,threshold_docs=0.2):
-     docs = [x for x in state["documents"] if x.metadata["reranking_score"] > threshold_docs]
-     print("Route : ", ["answer_rag" if len(docs) > 0 else "answer_rag_no_docs"])
-     if len(docs) > 0:
-         return "answer_rag"
-     else:
-         return "answer_rag_no_docs"
-
- def route_continue_retrieve_documents(state):
-     index_question_ipx = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
-     questions_ipx_finished = all(elem in state["handled_questions_index"] for elem in index_question_ipx)
-     # if questions_ipx_finished and state["search_only"]:
-     #     return END
-     if questions_ipx_finished:
-         return "end_retrieve_IPx_documents"
-     else:
-         return "retrieve_documents"
-
-
-     # if state["n_questions"]["IPx"] == len(state["handled_questions_index"]) and state["search_only"] :
-     #     return END
-     # elif state["n_questions"]["IPx"] == len(state["handled_questions_index"]):
-     #     return "answer_search"
-     # else :
-     #     return "retrieve_documents"
-
- def route_continue_retrieve_local_documents(state):
-     index_question_poc = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
-     questions_poc_finished = all(elem in state["handled_questions_index"] for elem in index_question_poc)
-     # if questions_poc_finished and state["search_only"]:
-     #     return END
-     if questions_poc_finished or ("POC region" not in state["relevant_content_sources_selection"]):
-         return "end_retrieve_local_documents"
-     else:
-         return "retrieve_local_data"
-
-     # if state["n_questions"]["POC"] == len(state["handled_questions_index"]) and state["search_only"] :
-     #     return END
-     # elif state["n_questions"]["POC"] == len(state["handled_questions_index"]):
-     #     return "answer_search"
-     # else :
-     #     return "retrieve_local_data"
-
-     # if len(state["remaining_questions"]) == 0 and state["search_only"] :
-     #     return END
-     # elif len(state["remaining_questions"]) > 0:
-     #     return "retrieve_documents"
-     # else:
-     #     return "answer_search"
-
- def route_retrieve_documents(state):
-     sources_to_retrieve = []
-
-     if "Graphs (OurWorldInData)" in state["relevant_content_sources_selection"] :
-         sources_to_retrieve.append("retrieve_graphs")
-
-     if sources_to_retrieve == []:
-         return END
-     return sources_to_retrieve
-
- def make_id_dict(values):
-     return {k:k for k in values}
-
- def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker, threshold_docs=0.2):
-
-     workflow = StateGraph(GraphState)
-
-     # Define the node functions
-     categorize_intent = make_intent_categorization_node(llm)
-     transform_query = make_query_transform_node(llm)
-     translate_query = make_translation_node(llm)
-     answer_chitchat = make_chitchat_node(llm)
-     answer_ai_impact = make_ai_impact_node(llm)
-     retrieve_documents = make_IPx_retriever_node(vectorstore_ipcc, reranker, llm)
-     retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
-     # retrieve_local_data = make_POC_retriever_node(vectorstore_region, reranker, llm)
-     answer_rag = make_rag_node(llm, with_docs=True)
-     answer_rag_no_docs = make_rag_node(llm, with_docs=False)
-     chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
-
-     # Define the nodes
-     # workflow.add_node("set_defaults", set_defaults)
-     workflow.add_node("categorize_intent", categorize_intent)
-     workflow.add_node("answer_climate", dummy)
-     workflow.add_node("answer_search", answer_search)
-     workflow.add_node("transform_query", transform_query)
-     workflow.add_node("translate_query", translate_query)
-     workflow.add_node("answer_chitchat", answer_chitchat)
-     workflow.add_node("chitchat_categorize_intent", chitchat_categorize_intent)
-     workflow.add_node("retrieve_graphs", retrieve_graphs)
-     # workflow.add_node("retrieve_local_data", retrieve_local_data)
-     workflow.add_node("retrieve_graphs_chitchat", retrieve_graphs)
-     workflow.add_node("retrieve_documents", retrieve_documents)
-     workflow.add_node("answer_rag", answer_rag)
-     workflow.add_node("answer_rag_no_docs", answer_rag_no_docs)
-
-     # Entry point
-     workflow.set_entry_point("categorize_intent")
-
-     # CONDITIONAL EDGES
-     workflow.add_conditional_edges(
-         "categorize_intent",
-         route_intent,
-         make_id_dict(["answer_chitchat","answer_climate"])
-     )
-
-     workflow.add_conditional_edges(
-         "chitchat_categorize_intent",
-         chitchat_route_intent,
-         make_id_dict(["retrieve_graphs_chitchat", END])
-     )
-
-     workflow.add_conditional_edges(
-         "answer_climate",
-         route_translation,
-         make_id_dict(["translate_query","transform_query"])
-     )
-
-     workflow.add_conditional_edges(
-         "answer_search",
-         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
-         make_id_dict(["answer_rag","answer_rag_no_docs"])
-     )
-     workflow.add_conditional_edges(
-         "transform_query",
-         route_retrieve_documents,
-         make_id_dict(["retrieve_graphs", END])
-     )
-
-     # Define the edges
-     workflow.add_edge("translate_query", "transform_query")
-     workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
-     # workflow.add_edge("transform_query", "retrieve_local_data")
-     # workflow.add_edge("transform_query", END) # TODO remove
-
-     workflow.add_edge("retrieve_graphs", END)
-     workflow.add_edge("answer_rag", END)
-     workflow.add_edge("answer_rag_no_docs", END)
-     workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
-     workflow.add_edge("retrieve_graphs_chitchat", END)
-
-     # workflow.add_edge("retrieve_local_data", "answer_search")
-     workflow.add_edge("retrieve_documents", "answer_search")
-
-     # Compile
-     app = workflow.compile()
-     return app
-
- def make_graph_agent_poc(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker, threshold_docs=0.2):
-
-     workflow = StateGraph(GraphState)
-
-     # Define the node functions
-     categorize_intent = make_intent_categorization_node(llm)
-     transform_query = make_query_transform_node(llm)
-     translate_query = make_translation_node(llm)
-     answer_chitchat = make_chitchat_node(llm)
-     answer_ai_impact = make_ai_impact_node(llm)
-     retrieve_documents = make_IPx_retriever_node(vectorstore_ipcc, reranker, llm)
-     retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
-     retrieve_local_data = make_POC_retriever_node(vectorstore_region, reranker, llm)
-     answer_rag = make_rag_node(llm, with_docs=True)
-     answer_rag_no_docs = make_rag_node(llm, with_docs=False)
-     chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
-
-     # Define the nodes
-     # workflow.add_node("set_defaults", set_defaults)
-     workflow.add_node("categorize_intent", categorize_intent)
-     workflow.add_node("answer_climate", dummy)
-     workflow.add_node("answer_search", answer_search)
-     # workflow.add_node("end_retrieve_local_documents", dummy)
-     # workflow.add_node("end_retrieve_IPx_documents", dummy)
-     workflow.add_node("transform_query", transform_query)
-     workflow.add_node("translate_query", translate_query)
-     workflow.add_node("answer_chitchat", answer_chitchat)
-     workflow.add_node("chitchat_categorize_intent", chitchat_categorize_intent)
-     workflow.add_node("retrieve_graphs", retrieve_graphs)
-     workflow.add_node("retrieve_local_data", retrieve_local_data)
-     workflow.add_node("retrieve_graphs_chitchat", retrieve_graphs)
-     workflow.add_node("retrieve_documents", retrieve_documents)
-     workflow.add_node("answer_rag", answer_rag)
-     workflow.add_node("answer_rag_no_docs", answer_rag_no_docs)
-
-     # Entry point
-     workflow.set_entry_point("categorize_intent")
-
-     # CONDITIONAL EDGES
-     workflow.add_conditional_edges(
-         "categorize_intent",
-         route_intent,
-         make_id_dict(["answer_chitchat","answer_climate"])
-     )
-
-     workflow.add_conditional_edges(
-         "chitchat_categorize_intent",
-         chitchat_route_intent,
-         make_id_dict(["retrieve_graphs_chitchat", END])
-     )
-
-     workflow.add_conditional_edges(
-         "answer_climate",
-         route_translation,
-         make_id_dict(["translate_query","transform_query"])
-     )
-
-     workflow.add_conditional_edges(
-         "answer_search",
-         lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
-         make_id_dict(["answer_rag","answer_rag_no_docs"])
-     )
-     workflow.add_conditional_edges(
-         "transform_query",
-         route_retrieve_documents,
-         make_id_dict(["retrieve_graphs", END])
-     )
-
-     # Define the edges
-     workflow.add_edge("translate_query", "transform_query")
-     workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
-     workflow.add_edge("transform_query", "retrieve_local_data")
-     # workflow.add_edge("transform_query", END) # TODO remove
-
-     workflow.add_edge("retrieve_graphs", END)
-     workflow.add_edge("answer_rag", END)
-     workflow.add_edge("answer_rag_no_docs", END)
-     workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
-     workflow.add_edge("retrieve_graphs_chitchat", END)
-
-     workflow.add_edge("retrieve_local_data", "answer_search")
-     workflow.add_edge("retrieve_documents", "answer_search")
-
-     # Compile
-     app = workflow.compile()
-     return app
-
-
-
-
- def display_graph(app):
-
-     display(
-         Image(
-             app.get_graph(xray = True).draw_mermaid_png(
-                 draw_method=MermaidDrawMethod.API,
-             )
-         )
-     )
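A hedged sketch of how the deleted agent was driven (illustrative only, not part of the diff; the state keys come from GraphState above, the vectorstores and reranker are assumed to be built elsewhere, and a compiled LangGraph app exposes ainvoke):

# Inside an async function:
app = make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker)
state = await app.ainvoke({
    "user_input": "Is sea level rise accelerating?",
    "audience": "experts",
    "sources_input": ["IPCC"],
    "relevant_content_sources_selection": ["Figures (IPCC/IPBES)"],
})
print(state["answer"])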
climateqa/engine/graph_retriever.py DELETED
@@ -1,88 +0,0 @@
- from langchain_core.retrievers import BaseRetriever
- from langchain_core.documents.base import Document
- from langchain_core.vectorstores import VectorStore
- from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-
- from typing import List
-
- # class GraphRetriever(BaseRetriever):
- #     vectorstore:VectorStore
- #     sources:list = ["OWID"]  # later: add OurWorldInData; will need to be integrated with the other retriever
- #     threshold:float = 0.5
- #     k_total:int = 10
-
- #     def _get_relevant_documents(
- #         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
- #     ) -> List[Document]:
-
- #         # Check if all elements in the list are IEA or OWID
- #         assert isinstance(self.sources,list)
- #         assert self.sources
- #         assert any([x in ["OWID"] for x in self.sources])
-
- #         # Prepare base search kwargs
- #         filters = {}
-
- #         filters["source"] = {"$in": self.sources}
-
- #         docs = self.vectorstore.similarity_search_with_score(query=query, filter=filters, k=self.k_total)
-
- #         # Filter if scores are below threshold
- #         docs = [x for x in docs if x[1] > self.threshold]
-
- #         # Remove duplicate documents
- #         unique_docs = []
- #         seen_docs = []
- #         for i, doc in enumerate(docs):
- #             if doc[0].page_content not in seen_docs:
- #                 unique_docs.append(doc)
- #                 seen_docs.append(doc[0].page_content)
-
- #         # Add score to metadata
- #         results = []
- #         for i,(doc,score) in enumerate(unique_docs):
- #             doc.metadata["similarity_score"] = score
- #             doc.metadata["content"] = doc.page_content
- #             results.append(doc)
-
- #         return results
-
- async def retrieve_graphs(
-     query: str,
-     vectorstore:VectorStore,
-     sources:list = ["OWID"],  # later: add OurWorldInData; will need to be integrated with the other retriever
-     threshold:float = 0.5,
-     k_total:int = 10,
- )-> List[Document]:
-
-     # Check if all elements in the list are IEA or OWID
-     assert isinstance(sources,list)
-     assert sources
-     assert any([x in ["OWID"] for x in sources])
-
-     # Prepare base search kwargs
-     filters = {}
-
-     filters["source"] = {"$in": sources}
-
-     docs = vectorstore.similarity_search_with_score(query=query, filter=filters, k=k_total)
-
-     # Filter if scores are below threshold
-     docs = [x for x in docs if x[1] > threshold]
-
-     # Remove duplicate documents
-     unique_docs = []
-     seen_docs = []
-     for i, doc in enumerate(docs):
-         if doc[0].page_content not in seen_docs:
-             unique_docs.append(doc)
-             seen_docs.append(doc[0].page_content)
-
-     # Add score to metadata
-     results = []
-     for i,(doc,score) in enumerate(unique_docs):
-         doc.metadata["similarity_score"] = score
-         doc.metadata["content"] = doc.page_content
-         results.append(doc)
-
-     return results
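A minimal sketch of calling the deleted async helper (illustrative only, not part of the diff; vectorstore_graphs is assumed to be a vectorstore indexed with OWID graph embeddings):

# Inside an async function; retrieve_graphs is a coroutine.
docs = await retrieve_graphs(
    "global CO2 emissions over time",
    vectorstore_graphs,
    sources=["OWID"],
    threshold=0.5,
    k_total=10,
)
for doc in docs:
    print(doc.metadata["similarity_score"], doc.page_content[:80])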
climateqa/engine/keywords.py CHANGED
@@ -11,12 +11,10 @@ class KeywordsOutput(BaseModel):
 
     keywords: list = Field(
         description="""
-         Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers. Answer only with English keywords.
-         Do not use special characters or accents.
+         Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers.
 
         Example:
         - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
-         - "Quel est l'impact de l'exploitation minière en haute mer ?" -> ["deep sea mining"]
         - "How will El Nino be impacted by climate change" -> ["el nino"]
         - "Is climate change a hoax" -> [Climate change","hoax"]
         """
climateqa/engine/llm/__init__.py CHANGED
@@ -1,6 +1,5 @@
 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
- from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
 
 
 def get_llm(provider="openai",**kwargs):
@@ -9,8 +8,6 @@ def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
-     elif provider == "ollama":
-         return get_ollama_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")
 
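A minimal usage sketch after the removal (illustrative only, not part of the diff):

from climateqa.engine.llm import get_llm

llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
# get_llm(provider="ollama") now raises ValueError("Unknown provider: ollama")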
climateqa/engine/llm/ollama.py DELETED
@@ -1,6 +0,0 @@
-
-
- from langchain_community.llms import Ollama
-
- def get_llm(model="llama3", **kwargs):
-     return Ollama(model=model, **kwargs)
climateqa/engine/llm/openai.py CHANGED
@@ -7,7 +7,7 @@ try:
 except Exception:
     pass
 
- def get_llm(model="gpt-4o-mini",max_tokens=1024, temperature=0.0, streaming=True,timeout=30, **kwargs):
+ def get_llm(model="gpt-3.5-turbo-0125",max_tokens=1024, temperature=0.0, streaming=True,timeout=30, **kwargs):
 
     llm = ChatOpenAI(
         model=model,
climateqa/engine/{chains/prompts.py → prompts.py} RENAMED
@@ -36,40 +36,13 @@ You are given a question and extracted passages of the IPCC and/or IPBES reports
 """
 
 
- # answer_prompt_template_old = """
- # You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
-
- # Guidelines:
- # - If the passages have useful facts or numbers, use them in your answer.
- # - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
- # - Do not use the sentence 'Doc i says ...' to say where information came from.
- # - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
- # - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
- # - If it makes sense, use bullet points and lists to make your answers easier to understand.
- # - You do not need to use every passage. Only use the ones that help answer the question.
- # - If the documents do not have the information needed to answer the question, just say you do not have enough information.
- # - Consider by default that the question is about the past century unless it is specified otherwise.
- # - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
-
- # -----------------------
- # Passages:
- # {context}
-
- # -----------------------
- # Question: {query} - Explained to {audience}
- # Answer in {language} with the passages citations:
- # """
-
 answer_prompt_template = """
- You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
+ You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 
 Guidelines:
 - If the passages have useful facts or numbers, use them in your answer.
 - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
- - You will receive passages from different reports, eg IPCC and PPCP, make separate paragraphs and specify the source of the information in your answer, eg "According to IPCC, ...".
- - The different sources are IPCC, IPBES, PPCP (for Plan Climat Air Energie Territorial de Paris), PBDP (for Plan Biodiversité de Paris), Acclimaterra.
- - Do not mention that you are using specific extract documents, but mention only the source information. "According to IPCC, ..." rather than "According to the provided document from IPCC ..."
- - Make a clear distinction between information from IPCC, IPBES, Acclimaterra that are scientific reports and PPCP, PBDP that are strategic reports. Strategic reports should not be taken has verified facts, but as political or strategic decisions.
+ - Do not use the sentence 'Doc i says ...' to say where information came from.
 - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
 - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
 - If it makes sense, use bullet points and lists to make your answers easier to understand.
@@ -78,16 +51,16 @@ Guidelines:
 - Consider by default that the question is about the past century unless it is specified otherwise.
 - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
 
-
 -----------------------
 Passages:
 {context}
 
 -----------------------
- Question: {query} - Explained to {audience}
+ Question: {question} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 
+
 papers_prompt_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted abstracts of scientific papers. Provide a clear and structured answer based on the abstracts provided, the context and the guidelines.
 
@@ -164,7 +137,7 @@ Guidelines:
 - If the question is not related to environmental issues, never never answer it. Say it's not your role.
 - Make paragraphs by starting new lines to make your answers more readable.
 
- Question: {query}
+ Question: {question}
 Answer in {language}:
 """
 
@@ -174,27 +147,4 @@ audience_prompts = {
     "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
     "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
     "experts": "expert and climate scientists that are not afraid of technical terms",
- }
-
-
- answer_prompt_graph_template = """
- Given the user question and a list of graphs which are related to the question, rank the graphs based on relevance to the user question. ALWAYS follow the guidelines given below.
-
- ### Guidelines ###
- - Keep all the graphs that are given to you.
- - NEVER modify the graph HTML embedding, the category or the source leave them exactly as they are given.
- - Return the ranked graphs as a list of dictionaries with keys 'embedding', 'category', and 'source'.
- - Return a valid JSON output.
-
- -----------------------
- User question:
- {query}
-
- Graphs and their HTML embedding:
- {recommended_content}
-
- -----------------------
- {format_instructions}
-
- Output the result as json with a key "graphs" containing a list of dictionaries of the relevant graphs with keys 'embedding', 'category', and 'source'. Do not modify the graph HTML embedding, the category or the source. Do not put any message or text before or after the JSON output.
- """
+ }
climateqa/engine/{chains/answer_rag.py → rag.py} RENAMED
@@ -2,16 +2,17 @@ from operator import itemgetter
 
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
 
- from climateqa.engine.chains.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
- from climateqa.engine.chains.prompts import papers_prompt_template
- import time
- from ..utils import rename_chain, pass_values
+ from climateqa.engine.reformulation import make_reformulation_chain
+ from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
+ from climateqa.engine.prompts import papers_prompt_template
+ from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
+ from climateqa.engine.keywords import make_keywords_chain
 
-
- DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="Source : {source} - Content : {page_content}")
+ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
 
 def _combine_documents(
     docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
@@ -39,52 +40,72 @@ def get_text_docs(x):
 def get_image_docs(x):
     return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
 
- def make_rag_chain(llm):
+
+ def make_rag_chain(retriever,llm):
+
+     # Construct the prompt
     prompt = ChatPromptTemplate.from_template(answer_prompt_template)
-     chain = ({
-         "context":lambda x : _combine_documents(x["documents"]),
-         "context_length":lambda x : print("CONTEXT LENGTH : " , len(_combine_documents(x["documents"]))),
-         "query":itemgetter("query"),
-         "language":itemgetter("language"),
-         "audience":itemgetter("audience"),
-     } | prompt | llm | StrOutputParser())
-     return chain
+     prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
 
- def make_rag_chain_without_docs(llm):
-     prompt = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
-     chain = prompt | llm | StrOutputParser()
-     return chain
+     # ------- CHAIN 0 - Reformulation
+     reformulation = make_reformulation_chain(llm)
+     reformulation = prepare_chain(reformulation,"reformulation")
 
- def make_rag_node(llm,with_docs = True):
+     # ------- Find all keywords from the reformulated query
+     keywords = make_keywords_chain(llm)
+     keywords = {"keywords":itemgetter("question") | keywords}
+     keywords = prepare_chain(keywords,"keywords")
 
-     if with_docs:
-         rag_chain = make_rag_chain(llm)
-     else:
-         rag_chain = make_rag_chain_without_docs(llm)
-
-     async def answer_rag(state,config):
-         print("---- Answer RAG ----")
-         start_time = time.time()
-         print("Sources used : " + "\n".join([x.metadata["short_name"] + " - page " + str(x.metadata["page_number"]) for x in state["documents"]]))
+     # ------- CHAIN 1
+     # Retrieved documents
+     find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
+     find_documents = prepare_chain(find_documents,"find_documents")
 
-         answer = await rag_chain.ainvoke(state,config)
+     # ------- CHAIN 2
+     # Construct inputs for the llm
+     input_documents = {
+         "context":lambda x : _combine_documents(x["docs"]),
+         **pass_values(["question","audience","language","keywords"])
+     }
+
+     # ------- CHAIN 3
+     # Bot answer
+     llm_final = rename_chain(llm,"answer")
+
+     answer_with_docs = {
+         "answer": input_documents | prompt | llm_final | StrOutputParser(),
+         **pass_values(["question","audience","language","query","docs","keywords"]),
+     }
+
+     answer_without_docs = {
+         "answer": prompt_without_docs | llm_final | StrOutputParser(),
+         **pass_values(["question","audience","language","query","docs","keywords"]),
+     }
+
+     # def has_images(x):
+     #     image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
+     #     return len(image_docs) > 0
 
-         end_time = time.time()
-         elapsed_time = end_time - start_time
-         print("RAG elapsed time: ", elapsed_time)
-         print("Answer size : ", len(answer))
-         # print(f"\n\nAnswer:\n{answer}")
-
-         return {"answer":answer}
+     def has_docs(x):
+         return len(x["docs"]) > 0
 
-     return answer_rag
+     answer = RunnableBranch(
+         (lambda x: has_docs(x), answer_with_docs),
+         answer_without_docs,
+     )
 
 
+     # ------- FINAL CHAIN
+     # Build the final chain
+     rag_chain = reformulation | keywords | find_documents | answer
+
+     return rag_chain
 
 
 def make_rag_papers_chain(llm):
 
     prompt = ChatPromptTemplate.from_template(papers_prompt_template)
+
     input_documents = {
         "context":lambda x : _combine_documents(x["docs"]),
         **pass_values(["question","language"])
@@ -110,4 +131,4 @@ def make_illustration_chain(llm):
     }
 
     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
-     return illustration_chain
+     return illustration_chain
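A hedged sketch of invoking the restored chain (illustrative only, not part of the diff; the exact input keys are set by make_reformulation_chain, which is not shown here, so "query" and "audience" below are assumptions):

rag_chain = make_rag_chain(retriever, llm)
# Hypothetical input keys; the output dict carries "answer" and "docs" per the
# answer_with_docs / answer_without_docs branches above.
result = rag_chain.invoke({"query": "What drives ocean acidification?", "audience": "experts"})
print(result["answer"])
print([doc.metadata for doc in result["docs"]])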
climateqa/engine/{chains/reformulation.py → reformulation.py} RENAMED
@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, Response
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 
- from climateqa.engine.chains.prompts import reformulation_prompt_template
+ from climateqa.engine.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict
 
 
climateqa/engine/reranker.py DELETED
@@ -1,55 +0,0 @@
- import os
- from dotenv import load_dotenv
- from scipy.special import expit, logit
- from rerankers import Reranker
- from sentence_transformers import CrossEncoder
-
- load_dotenv()
-
- def get_reranker(model = "nano", cohere_api_key = None):
-
-     assert model in ["nano","tiny","small","large", "jina"]
-
-     if model == "nano":
-         reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
-     elif model == "tiny":
-         reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
-     elif model == "small":
-         reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
-     elif model == "large":
-         if cohere_api_key is None:
-             cohere_api_key = os.environ["COHERE_API_KEY"]
-         reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
-     elif model == "jina":
-         # Reached token quota so does not work
-         reranker = Reranker("jina-reranker-v2-base-multilingual", api_key = os.getenv("JINA_RERANKER_API_KEY"))
-         # Does not work without a GPU? And it returns another structure anyway, so the retriever node code would need to change
-         # reranker = CrossEncoder("jinaai/jina-reranker-v2-base-multilingual", automodel_args={"torch_dtype": "auto"}, trust_remote_code=True,)
-     return reranker
-
-
-
- def rerank_docs(reranker,docs,query):
-     if docs == []:
-         return []
-
-     # Get a list of texts from langchain docs
-     input_docs = [x.page_content for x in docs]
-
-     # Rerank using rerankers library
-     results = reranker.rank(query=query, docs=input_docs)
-
-     # Prepare langchain list of docs
-     docs_reranked = []
-     for result in results.results:
-         doc_id = result.document.doc_id
-         doc = docs[doc_id]
-         doc.metadata["reranking_score"] = result.score
-         doc.metadata["query_used_for_retrieval"] = query
-         docs_reranked.append(doc)
-     return docs_reranked
-
- def rerank_and_sort_docs(reranker, docs, query):
-     docs_reranked = rerank_docs(reranker,docs,query)
-     docs_reranked = sorted(docs_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
-     return docs_reranked
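A minimal sketch of how the deleted helpers fit together (illustrative only, not part of the diff; docs is assumed to be a list of LangChain Documents coming out of a retriever):

reranker = get_reranker("nano")  # flashrank TinyBERT, runs locally without an API key
docs_sorted = rerank_and_sort_docs(reranker, docs, "health impacts of heatwaves")
for doc in docs_sorted:
    print(doc.metadata["reranking_score"], doc.page_content[:80])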
climateqa/engine/retriever.py ADDED
@@ -0,0 +1,163 @@
+ # https://github.com/langchain-ai/langchain/issues/8623
+
+ import pandas as pd
+
+ from langchain_core.retrievers import BaseRetriever
+ from langchain_core.vectorstores import VectorStoreRetriever
+ from langchain_core.documents.base import Document
+ from langchain_core.vectorstores import VectorStore
+ from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+
+ from typing import List
+ from pydantic import Field
+
+ class ClimateQARetriever(BaseRetriever):
+     vectorstore:VectorStore
+     sources:list = ["IPCC","IPBES","IPOS"]
+     reports:list = []
+     threshold:float = 0.6
+     k_summary:int = 3
+     k_total:int = 10
+     namespace:str = "vectors",
+     min_size:int = 200,
+
+
+     def _get_relevant_documents(
+         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+     ) -> List[Document]:
+
+         # Check if all elements in the list are either IPCC or IPBES
+         assert isinstance(self.sources,list)
+         assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
+         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
+
+         # Prepare base search kwargs
+         filters = {}
+
+         if len(self.reports) > 0:
+             filters["short_name"] = {"$in":self.reports}
+         else:
+             filters["source"] = { "$in":self.sources}
+
+         # Search for k_summary documents in the summaries dataset
+         filters_summaries = {
+             **filters,
+             "report_type": { "$in":["SPM"]},
+         }
+
+         docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
+         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+
+         # Search for k_total - k_summary documents in the full reports dataset
+         filters_full = {
+             **filters,
+             "report_type": { "$nin":["SPM"]},
+         }
+         k_full = self.k_total - len(docs_summaries)
+         docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
+
+         # Concatenate documents
+         docs = docs_summaries + docs_full
+
+         # Filter if scores are below threshold
+         docs = [x for x in docs if len(x[0].page_content) > self.min_size]
+         # docs = [x for x in docs if x[1] > self.threshold]
+
+         # Add score to metadata
+         results = []
+         for i,(doc,score) in enumerate(docs):
+             doc.metadata["similarity_score"] = score
+             doc.metadata["content"] = doc.page_content
+             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+             # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+             results.append(doc)
+
+         # Sort by score
+         # results = sorted(results,key = lambda x : x.metadata["similarity_score"],reverse = True)
+
+         return results
+
+
+
+
+ # def filter_summaries(df,k_summary = 3,k_total = 10):
+ #     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
+
+ #     # # Filter by source
+ #     # if source == "IPCC":
+ #     #     df = df.loc[df["source"]=="IPCC"]
+ #     # elif source == "IPBES":
+ #     #     df = df.loc[df["source"]=="IPBES"]
+ #     # else:
+ #     #     pass
+
+ #     # Separate summaries and full reports
+ #     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
+ #     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
+
+ #     # Find passages from summaries dataset
+ #     passages_summaries = df_summaries.head(k_summary)
+
+ #     # Find passages from full reports dataset
+ #     passages_fullreports = df_full.head(k_total - len(passages_summaries))
+
+ #     # Concatenate passages
+ #     passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
+ #     return passages
+
+
+
+
+ # def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
+ #     assert max_k > k_total
+
+ #     validated_sources = ["IPCC","IPBES"]
+ #     sources = [x for x in sources if x in validated_sources]
+ #     filters = {
+ #         "source": { "$in": sources },
+ #     }
+ #     print(filters)
+
+ #     # Retrieve documents
+ #     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
+
+ #     # Filter by score
+ #     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
+
+ #     if len(docs) == 0:
+ #         return []
+ #     res = pd.DataFrame(docs)
+ #     passages_df = filter_summaries(res,k_summary,k_total)
+ #     if as_dict:
+ #         contents = passages_df["content"].tolist()
+ #         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
+ #         passages = []
+ #         for i in range(len(contents)):
+ #             passages.append({"content":contents[i],"meta":meta[i]})
+ #         return passages
+ #     else:
+ #         return passages_df
+
+
+
+ # def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
+
+
+ #     print("hellooooo")
+
+ #     # Reformulate queries
+ #     reformulated_query,language = reformulate(query)
+
+ #     print(reformulated_query)
+
+ #     # Retrieve documents
+ #     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
+ #     response = {
+ #         "query":query,
+ #         "reformulated_query":reformulated_query,
+ #         "language":language,
+ #         "sources":passages,
+ #         "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
+ #     }
+ #     return response
+
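A minimal sketch wiring the restored retriever to a vectorstore (illustrative only, not part of the diff; `vectorstore` is assumed to come from get_pinecone_vectorstore):

retriever = ClimateQARetriever(
    vectorstore=vectorstore,
    sources=["IPCC", "IPBES"],
    k_summary=3,
    k_total=10,
)
docs = retriever.get_relevant_documents("projected sea level rise by 2100")
print([d.metadata["similarity_score"] for d in docs])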
climateqa/engine/utils.py CHANGED
@@ -1,15 +1,8 @@
 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
- import tiktoken
 from langchain_core.runnables import RunnablePassthrough
 
 
- def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
-     encoding = tiktoken.get_encoding(encoding_name)
-     num_tokens = len(encoding.encode(string))
-     return num_tokens
-
-
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
@@ -74,13 +67,3 @@ def flatten_dict(
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict
-
-
-
- async def log_event(info,name,config):
-     """Helper function that will run a dummy chain with the given info
-     The astream_event function will catch this chain and stream the dict info to the logger
-     """
-
-     chain = RunnablePassthrough().with_config(run_name=name)
-     _ = await chain.ainvoke(info,config)
climateqa/engine/vectorstore.py CHANGED
@@ -13,9 +13,7 @@ except:
     pass
 
 
-
-
- def get_pinecone_vectorstore(embeddings,text_key = "content", index_name = os.getenv("PINECONE_API_INDEX")):
+ def get_pinecone_vectorstore(embeddings,text_key = "content"):
 
     # # initialize pinecone
     # pinecone.init(
@@ -29,7 +27,7 @@ def get_pinecone_vectorstore(embeddings,text_key = "content", index_name = os.ge
     #     return vectorstore
 
     pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-     index = pc.Index(index_name)
+     index = pc.Index(os.getenv("PINECONE_API_INDEX"))
 
     vectorstore = PineconeVectorstore(
         index, embeddings, text_key,
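A minimal sketch of the new call path (illustrative only, not part of the diff; the index name is no longer a parameter, so both environment variables are mandatory):

from climateqa.engine.embeddings import get_embeddings_function
from climateqa.engine.vectorstore import get_pinecone_vectorstore

# PINECONE_API_KEY and PINECONE_API_INDEX must both be set in the environment.
embeddings_function = get_embeddings_function()
vectorstore = get_pinecone_vectorstore(embeddings_function)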
climateqa/handle_stream_events.py DELETED
@@ -1,126 +0,0 @@
- from langchain_core.runnables.schema import StreamEvent
- from gradio import ChatMessage
- from climateqa.engine.chains.prompts import audience_prompts
- from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox,generate_html_graphs
- import numpy as np
-
- def init_audience(audience :str) -> str:
-     if audience == "Children":
-         audience_prompt = audience_prompts["children"]
-     elif audience == "General public":
-         audience_prompt = audience_prompts["general"]
-     elif audience == "Experts":
-         audience_prompt = audience_prompts["experts"]
-     else:
-         audience_prompt = audience_prompts["experts"]
-     return audience_prompt
-
- def convert_to_docs_to_html(docs: list[dict]) -> str:
-     docs_html = []
-     for i, d in enumerate(docs, 1):
-         if d.metadata["chunk_type"] == "text":
-             docs_html.append(make_html_source(d, i))
-     return "".join(docs_html)
-
- def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage], used_documents : list[str],related_content:list[str]) -> tuple[str, list[ChatMessage], list[str]]:
-     """
-     Handles the retrieved documents and returns the HTML representation of the documents
-
-     Args:
-         event (StreamEvent): The event containing the retrieved documents
-         history (list[ChatMessage]): The current message history
-         used_documents (list[str]): The list of used documents
-
-     Returns:
-         tuple[str, list[ChatMessage], list[str]]: The updated HTML representation of the documents, the updated message history and the updated list of used documents
-     """
-     if "documents" not in event["data"]["output"] or event["data"]["output"]["documents"] == []:
-         return history, used_documents, related_content
-
-     try:
-         docs = event["data"]["output"]["documents"]
-
-         used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
-         if used_documents!=[]:
-             history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
-
-         #TODO do the same for related contents
-
-     except Exception as e:
-         print(f"Error getting documents: {e}")
-         print(event)
-     return history, used_documents, related_content
-
- def stream_answer(history: list[ChatMessage], event : StreamEvent, start_streaming : bool, answer_message_content : str)-> tuple[list[ChatMessage], bool, str]:
-     """
-     Handles the streaming of the answer and updates the history with the new message content
-
-     Args:
-         history (list[ChatMessage]): The current message history
-         event (StreamEvent): The event containing the streamed answer
-         start_streaming (bool): A flag indicating if the streaming has started
-         new_message_content (str): The content of the new message
-
-     Returns:
-         tuple[list[ChatMessage], bool, str]: The updated history, the updated streaming flag and the updated message content
-     """
-     if start_streaming == False:
-         start_streaming = True
-         history.append(ChatMessage(role="assistant", content = ""))
-     answer_message_content += event["data"]["chunk"].content
-     answer_message_content = parse_output_llm_with_sources(answer_message_content)
-     history[-1] = ChatMessage(role="assistant", content = answer_message_content)
-     # history.append(ChatMessage(role="assistant", content = new_message_content))
-     return history, start_streaming, answer_message_content
-
- def handle_retrieved_owid_graphs(event :StreamEvent, graphs_html: str) -> str:
-     """
-     Handles the retrieved OWID graphs and returns the HTML representation of the graphs
-
-     Args:
-         event (StreamEvent): The event containing the retrieved graphs
-         graphs_html (str): The current HTML representation of the graphs
-
-     Returns:
-         str: The updated HTML representation
-     """
-     try:
-         recommended_content = event["data"]["output"]["recommended_content"]
-
-         unique_graphs = []
-         seen_embeddings = set()
-
-         for x in recommended_content:
-             embedding = x.metadata["returned_content"]
-
-             # Check if the embedding has already been seen
-             if embedding not in seen_embeddings:
-                 unique_graphs.append({
-                     "embedding": embedding,
-                     "metadata": {
-                         "source": x.metadata["source"],
-                         "category": x.metadata["category"]
-                     }
-                 })
-                 # Add the embedding to the seen set
-                 seen_embeddings.add(embedding)
-
-
-         categories = {}
-         for graph in unique_graphs:
-             category = graph['metadata']['category']
-             if category not in categories:
-                 categories[category] = []
-             categories[category].append(graph['embedding'])
-
-
-         for category, embeddings in categories.items():
-             graphs_html += f"<h3>{category}</h3>"
-             for embedding in embeddings:
-                 graphs_html += f"<div>{embedding}</div>"
-
-
-     except Exception as e:
-         print(f"Error getting graphs: {e}")
-
-     return graphs_html
climateqa/knowledge/__init__.py DELETED
File without changes
climateqa/knowledge/retriever.py DELETED
@@ -1,102 +0,0 @@
-# # https://github.com/langchain-ai/langchain/issues/8623
-
-# import pandas as pd
-
-# from langchain_core.retrievers import BaseRetriever
-# from langchain_core.vectorstores import VectorStoreRetriever
-# from langchain_core.documents.base import Document
-# from langchain_core.vectorstores import VectorStore
-# from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-
-# from typing import List
-# from pydantic import Field
-
-# def _add_metadata_and_score(docs: List) -> Document:
-#     # Add score to metadata
-#     docs_with_metadata = []
-#     for i,(doc,score) in enumerate(docs):
-#         doc.page_content = doc.page_content.replace("\r\n"," ")
-#         doc.metadata["similarity_score"] = score
-#         doc.metadata["content"] = doc.page_content
-#         doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
-#         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
-#         docs_with_metadata.append(doc)
-#     return docs_with_metadata
-
-# class ClimateQARetriever(BaseRetriever):
-#     vectorstore:VectorStore
-#     sources:list = ["IPCC","IPBES","IPOS"]
-#     reports:list = []
-#     threshold:float = 0.6
-#     k_summary:int = 3
-#     k_total:int = 10
-#     namespace:str = "vectors",
-#     min_size:int = 200,
-
-
-
-#     def _get_relevant_documents(
-#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-#     ) -> List[Document]:
-
-#         # Check if all elements in the list are either IPCC or IPBES
-#         assert isinstance(self.sources,list)
-#         assert self.sources
-#         assert all([x in ["IPCC","IPBES","IPOS"] for x in self.sources])
-#         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
-
-#         # Prepare base search kwargs
-#         filters = {}
-
-#         if len(self.reports) > 0:
-#             filters["short_name"] = {"$in":self.reports}
-#         else:
-#             filters["source"] = { "$in":self.sources}
-
-#         # Search for k_summary documents in the summaries dataset
-#         filters_summaries = {
-#             **filters,
-#             "chunk_type":"text",
-#             "report_type": { "$in":["SPM"]},
-#         }
-
-#         docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
-#         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
-#         # docs_summaries = []
-
-#         # Search for k_total - k_summary documents in the full reports dataset
-#         filters_full = {
-#             **filters,
-#             "chunk_type":"text",
-#             "report_type": { "$nin":["SPM"]},
-#         }
-#         k_full = self.k_total - len(docs_summaries)
-#         docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
-
-#         # Images
-#         filters_image = {
-#             **filters,
-#             "chunk_type":"image"
-#         }
-#         docs_images = self.vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_full)
-
-#         # docs_images = []
-
-#         # Concatenate documents
-#         # docs = docs_summaries + docs_full + docs_images
-
-#         # Filter if scores are below threshold
-#         # docs = [x for x in docs if x[1] > self.threshold]
-
-#         docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
-
-#         # Filter if length are below threshold
-#         docs_summaries = [x for x in docs_summaries if len(x.page_content) > self.min_size]
-#         docs_full = [x for x in docs_full if len(x.page_content) > self.min_size]
-
-
-#         return {
-#             "docs_summaries" : docs_summaries,
-#             "docs_full" : docs_full,
-#             "docs_images" : docs_images,
-#         }
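Before it was commented out entirely, this retriever implemented a two-stage search: pull a handful of chunks from the Summaries for Policymakers first, then fill the remaining `k_total` slots from the full reports, with metadata filters applied to both passes. A minimal sketch of that pattern, assuming a LangChain-style vector store whose `similarity_search_with_score` accepts a metadata `filter` the way the commented code uses it; function and variable names here are illustrative:

```python
# Two-stage retrieval as in the commented-out ClimateQARetriever:
# summaries (SPM) first, then full reports to fill the remaining slots.
def two_stage_search(vectorstore, query, sources, k_summary=3, k_total=10, threshold=0.6):
    base_filter = {"source": {"$in": sources}, "chunk_type": "text"}

    # Stage 1: Summaries for Policymakers, kept only above the score threshold
    summaries = vectorstore.similarity_search_with_score(
        query=query,
        filter={**base_filter, "report_type": {"$in": ["SPM"]}},
        k=k_summary,
    )
    summaries = [(doc, score) for doc, score in summaries if score > threshold]

    # Stage 2: full reports fill whatever budget the summaries left unused
    full = vectorstore.similarity_search_with_score(
        query=query,
        filter={**base_filter, "report_type": {"$nin": ["SPM"]}},
        k=k_total - len(summaries),
    )
    return summaries + full
```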
climateqa/papers/__init__.py ADDED
@@ -0,0 +1,43 @@
+import pandas as pd
+
+from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
+import pyalex
+
+pyalex.config.email = "[email protected]"
+
+class OpenAlex():
+    def __init__(self):
+        pass
+
+
+
+    def search(self,keywords,n_results = 100,after = None,before = None):
+        works = Works().search(keywords).get()
+
+        for page in works.paginate(per_page=n_results):
+            break
+
+        df_works = pd.DataFrame(page)
+
+        return works
+
+
+    def make_network(self):
+        pass
+
+
+    def get_abstract_from_inverted_index(self,index):
+
+        # Determine the maximum index to know the length of the reconstructed array
+        max_index = max([max(positions) for positions in index.values()])
+
+        # Initialize a list with placeholders for all positions
+        reconstructed = [''] * (max_index + 1)
+
+        # Iterate through the inverted index and place each token at its respective position(s)
+        for token, positions in index.items():
+            for position in positions:
+                reconstructed[position] = token
+
+        # Join the tokens to form the reconstructed sentence(s)
+        return ' '.join(reconstructed)
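`get_abstract_from_inverted_index` exists because OpenAlex does not ship abstracts as plain text: the API returns an inverted index mapping each token to the positions where it occurs. A self-contained illustration of the round trip (the sample index is made up):

```python
# OpenAlex abstracts arrive as {token: [positions]}; rebuilding the text means
# writing each token back at its positions and joining on spaces.
sample_index = {"Climate": [0], "change": [1], "is": [2], "accelerating": [3]}

max_index = max(max(positions) for positions in sample_index.values())
reconstructed = [""] * (max_index + 1)
for token, positions in sample_index.items():
    for position in positions:
        reconstructed[position] = token

print(" ".join(reconstructed))  # -> "Climate change is accelerating"
```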
climateqa/{knowledge → papers}/openalex.py RENAMED
@@ -3,32 +3,18 @@ import networkx as nx
 import matplotlib.pyplot as plt
 from pyvis.network import Network
 
-from langchain_core.retrievers import BaseRetriever
-from langchain_core.vectorstores import VectorStoreRetriever
-from langchain_core.documents.base import Document
-from langchain_core.vectorstores import VectorStore
-from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
-
-from ..engine.utils import num_tokens_from_string
-
-from typing import List
-from pydantic import Field
-
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 
 pyalex.config.email = "[email protected]"
 
-
-def replace_nan_with_empty_dict(x):
-    return x if pd.notna(x) else {}
-
 class OpenAlex():
     def __init__(self):
         pass
 
 
-    def search(self,keywords:str,n_results = 100,after = None,before = None):
+
+    def search(self,keywords,n_results = 100,after = None,before = None):
 
         if isinstance(keywords,str):
             works = Works().search(keywords)
@@ -41,36 +27,29 @@
                 break
 
             df_works = pd.DataFrame(page)
-
-            if df_works.empty:
-                return df_works
-
-            df_works = df_works.dropna(subset = ["title"])
-            df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
-            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
+            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
             df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
             df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
-            df_works["url"] = df_works["id"]
-            df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
-            df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
-
-            df_works = df_works.drop(columns = ["abstract_inverted_index"])
-            df_works["display_name"] = df_works["primary_location"].apply(lambda x :x["source"] if type(x) == dict and 'source' in x else "").apply(lambda x : x["display_name"] if type(x) == dict and "display_name" in x else "")
-            df_works["subtitle"] = df_works["title"].astype(str) + " - " + df_works["display_name"].astype(str) + " - " + df_works["publication_year"].astype(str)
+            df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
 
-            return df_works
         else:
-            raise Exception("Keywords must be a string")
+            df_works = []
+            for keyword in keywords:
+                df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
+                df_works.append(df_keyword)
+            df_works = pd.concat(df_works,ignore_index=True,axis = 0)
+        return df_works
 
 
     def rerank(self,query,df,reranker):
 
         scores = reranker.rank(
             query,
-            df["content"].tolist()
+            df["content"].tolist(),
+            top_k = len(df),
         )
-        scores = sorted(scores.results, key = lambda x : x.document.doc_id)
-        scores = [x.score for x in scores]
+        scores.sort(key = lambda x : x["corpus_id"])
+        scores = [x["score"] for x in scores]
         df["rerank_score"] = scores
         return df
 
@@ -160,36 +139,4 @@
                 reconstructed[position] = token
 
         # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)
-
-
-
-class OpenAlexRetriever(BaseRetriever):
-    min_year:int = 1960
-    max_year:int = None
-    k:int = 100
-
-    def _get_relevant_documents(
-        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
-
-        openalex = OpenAlex()
-
-        # Search for documents
-        df_docs = openalex.search(query,n_results=self.k,after = self.min_year,before = self.max_year)
-
-        docs = []
-        for i,row in df_docs.iterrows():
-            num_tokens = row["num_tokens"]
-
-            if num_tokens < 50 or num_tokens > 1000:
-                continue
-
-            doc = Document(
-                page_content = row["content"],
-                metadata = row.to_dict()
-            )
-            docs.append(doc)
-        return docs
-
-
+        return ' '.join(reconstructed)
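The `rerank` change above swaps result shapes: the deleted lines read an object exposing `.results`, `.document.doc_id` and `.score` (a flashrank-style response), while the restored lines expect a plain list of dicts keyed by `corpus_id` and `score`, which matches what sentence-transformers' `CrossEncoder.rank` returns. A minimal sketch under that assumption (the model name is only an example):

```python
# Sketch assuming the reranker is a sentence-transformers CrossEncoder,
# whose .rank() returns [{"corpus_id": int, "score": float}, ...].
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
docs = ["Sea level rise projections...", "Coral bleaching drivers..."]

scores = reranker.rank("impacts of ocean warming", docs, top_k=len(docs))
scores.sort(key=lambda x: x["corpus_id"])  # realign scores with the original row order
print([x["score"] for x in scores])
```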
climateqa/utils.py CHANGED
@@ -20,16 +20,3 @@ def get_image_from_azure_blob_storage(path):
     file_object = get_file_from_azure_blob_storage(path)
     image = Image.open(file_object)
     return image
-
-def remove_duplicates_keep_highest_score(documents):
-    unique_docs = {}
-
-    for doc in documents:
-        doc_id = doc.metadata.get('doc_id')
-        if doc_id in unique_docs:
-            if doc.metadata['reranking_score'] > unique_docs[doc_id].metadata['reranking_score']:
-                unique_docs[doc_id] = doc
-        else:
-            unique_docs[doc_id] = doc
-
-    return list(unique_docs.values())
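The deleted `remove_duplicates_keep_highest_score` is a generic keep-the-best-item-per-key pass over reranked documents. The same logic in isolation, using plain `(doc_id, score)` pairs instead of LangChain `Document` objects:

```python
# Keep only the best-scoring entry per id: single pass, O(n) dict lookups.
def dedupe_keep_highest(pairs):
    best = {}
    for doc_id, score in pairs:
        if doc_id not in best or score > best[doc_id]:
            best[doc_id] = score
    return best

print(dedupe_keep_highest([("a", 0.2), ("b", 0.9), ("a", 0.7)]))
# -> {'a': 0.7, 'b': 0.9}
```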
front/__init__.py DELETED
File without changes
front/callbacks.py DELETED
File without changes
front/deprecated.py DELETED
@@ -1,46 +0,0 @@
-
-# Functions to toggle visibility
-def toggle_summary_visibility():
-    global summary_visible
-    summary_visible = not summary_visible
-    return gr.update(visible=summary_visible)
-
-def toggle_relevant_visibility():
-    global relevant_visible
-    relevant_visible = not relevant_visible
-    return gr.update(visible=relevant_visible)
-
-def change_completion_status(current_state):
-    current_state = 1 - current_state
-    return current_state
-
-
-
-def vote(data: gr.LikeData):
-    if data.liked:
-        print(data.value)
-    else:
-        print(data)
-
-def save_graph(saved_graphs_state, embedding, category):
-    print(f"\nCategory:\n{saved_graphs_state}\n")
-    if category not in saved_graphs_state:
-        saved_graphs_state[category] = []
-    if embedding not in saved_graphs_state[category]:
-        saved_graphs_state[category].append(embedding)
-    return saved_graphs_state, gr.Button("Graph Saved")
-
-
-# Function to save feedback
-def save_feedback(feed: str, user_id):
-    if len(feed) > 1:
-        timestamp = str(datetime.now().timestamp())
-        file = user_id + timestamp + ".json"
-        logs = {
-            "user_id": user_id,
-            "feedback": feed,
-            "time": timestamp,
-        }
-        log_on_azure(file, logs, share_client)
-        return "Feedback submitted, thank you!"
-
front/event_listeners.py DELETED
File without changes
front/tabs/__init__.py DELETED
@@ -1,6 +0,0 @@
-from .tab_config import create_config_modal
-from .tab_examples import create_examples_tab
-from .tab_papers import create_papers_tab
-from .tab_figures import create_figures_tab
-from .chat_interface import create_chat_interface
-from .tab_about import create_about_tab
front/tabs/chat_interface.py DELETED
@@ -1,55 +0,0 @@
-import gradio as gr
-from gradio.components import ChatMessage
-
-# Initialize prompt and system template
-init_prompt = """
-Hello, I am ClimateQ&A, a conversational assistant designed to help you understand climate change and biodiversity loss. I will answer your questions by **sifting through the IPCC and IPBES scientific reports**.
-
-❓ How to use
-- **Language**: You can ask me your questions in any language.
-- **Audience**: You can specify your audience (children, general public, experts) to get a more adapted answer.
-- **Sources**: You can choose to search in the IPCC or IPBES reports, or both.
-- **Relevant content sources**: You can choose to search for figures, papers, or graphs that can be relevant for your question.
-
-⚠️ Limitations
-*Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
-
-🛈 Information
-Please note that we log your questions for meta-analysis purposes, so avoid sharing any sensitive or personal information.
-
-What do you want to learn ?
-"""
-
-
-
-# UI Layout Components
-def create_chat_interface():
-    chatbot = gr.Chatbot(
-        value=[ChatMessage(role="assistant", content=init_prompt)],
-        type="messages",
-        show_copy_button=True,
-        show_label=False,
-        elem_id="chatbot",
-        layout="panel",
-        avatar_images=(None, "https://i.ibb.co/YNyd5W2/logo4.png"),
-        max_height="80vh",
-        height="100vh"
-    )
-
-    with gr.Row(elem_id="input-message"):
-
-        textbox = gr.Textbox(
-            placeholder="Ask me anything here!",
-            show_label=False,
-            scale=12,
-            lines=1,
-            interactive=True,
-            elem_id=f"input-textbox"
-        )
-
-        config_button = gr.Button("", elem_id="config-button")
-
-    return chatbot, textbox, config_button
-
-
-
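The deleted module pairs a `gr.Chatbot` in `messages` mode with a label-less textbox. A stripped-down sketch of that wiring, assuming a Gradio version that supports the `messages` chat format; the echo handler is a hypothetical stand-in for the real ClimateQ&A pipeline:

```python
import gradio as gr

# Hypothetical handler: echoes the question instead of running the RAG chain.
def respond(message, history):
    history = history + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": f"You asked: {message}"},
    ]
    return history, ""  # second output clears the textbox

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages", show_label=False, elem_id="chatbot")
    textbox = gr.Textbox(placeholder="Ask me anything here!", show_label=False)
    textbox.submit(respond, [textbox, chatbot], [chatbot, textbox])

demo.launch()
```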
front/tabs/main_tab.py DELETED
@@ -1,69 +0,0 @@
-import gradio as gr
-from .chat_interface import create_chat_interface
-from .tab_examples import create_examples_tab
-from .tab_papers import create_papers_tab
-from .tab_figures import create_figures_tab
-from .chat_interface import create_chat_interface
-
-def cqa_tab(tab_name):
-    # State variables
-    current_graphs = gr.State([])
-    with gr.Tab(tab_name):
-        with gr.Row(elem_id="chatbot-row"):
-            # Left column - Chat interface
-            with gr.Column(scale=2):
-                chatbot, textbox, config_button = create_chat_interface()
-
-            # Right column - Content panels
-            with gr.Column(scale=2, variant="panel", elem_id="right-panel"):
-                with gr.Tabs(elem_id="right_panel_tab") as tabs:
-                    # Examples tab
-                    with gr.TabItem("Examples", elem_id="tab-examples", id=0):
-                        examples_hidden, dropdown_samples, samples = create_examples_tab()
-
-                    # Sources tab
-                    with gr.Tab("Sources", elem_id="tab-sources", id=1) as tab_sources:
-                        sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
-
-
-                    # Recommended content tab
-                    with gr.Tab("Recommended content", elem_id="tab-recommended_content", id=2) as tab_recommended_content:
-                        with gr.Tabs(elem_id="group-subtabs") as tabs_recommended_content:
-                            # Figures subtab
-                            with gr.Tab("Figures", elem_id="tab-figures", id=3) as tab_figures:
-                                sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal = create_figures_tab()
-
-                            # Papers subtab
-                            with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
-                                papers_summary, papers_html, citations_network, papers_modal = create_papers_tab()
-
-                            # Graphs subtab
-                            with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
-                                graphs_container = gr.HTML(
-                                    "<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
-                                    elem_id="graphs-container"
-                                )
-    return {
-        "chatbot": chatbot,
-        "textbox": textbox,
-        "tabs": tabs,
-        "sources_raw": sources_raw,
-        "new_figures": new_figures,
-        "current_graphs": current_graphs,
-        "examples_hidden": examples_hidden,
-        "dropdown_samples": dropdown_samples,
-        "samples": samples,
-        "sources_textbox": sources_textbox,
-        "figures_cards": figures_cards,
-        "gallery_component": gallery_component,
-        "config_button": config_button,
-        "papers_html": papers_html,
-        "citations_network": citations_network,
-        "papers_summary": papers_summary,
-        "tab_recommended_content": tab_recommended_content,
-        "tab_sources": tab_sources,
-        "tab_figures": tab_figures,
-        "tab_graphs": tab_graphs,
-        "tab_papers": tab_papers,
-        "graph_container": graphs_container
-    }
front/tabs/tab_about.py DELETED
@@ -1,38 +0,0 @@
-import gradio as gr
-
-# Citation information
-CITATION_LABEL = "BibTeX citation for ClimateQ&A"
-CITATION_TEXT = r"""@misc{climateqa,
-    author={Théo Alves Da Costa, Timothée Bohe},
-    title={ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
-    year={2024},
-    howpublished= {\url{https://climateqa.com}},
-}
-@software{climateqa,
-    author = {Théo Alves Da Costa, Timothée Bohe},
-    publisher = {ClimateQ&A},
-    title = {ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
-}
-"""
-
-def create_about_tab():
-    with gr.Tab("About", elem_classes="max-height other-tabs"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                gr.Markdown(
-                    """
-                    ### More info
-                    - See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)
-                    - Feedbacks on this [form](https://forms.office.com/e/1Yzgxm6jbp)
-
-                    ### Citation
-                    """
-                )
-                with gr.Accordion(CITATION_LABEL, elem_id="citation", open=False):
-                    gr.Textbox(
-                        value=CITATION_TEXT,
-                        label="",
-                        interactive=False,
-                        show_copy_button=True,
-                        lines=len(CITATION_TEXT.split('\n')),
-                    )
front/tabs/tab_config.py DELETED
@@ -1,123 +0,0 @@
-import gradio as gr
-from gradio_modal import Modal
-from climateqa.constants import POSSIBLE_REPORTS
-from typing import TypedDict
-
-class ConfigPanel(TypedDict):
-    config_open: gr.State
-    config_modal: Modal
-    dropdown_sources: gr.CheckboxGroup
-    dropdown_reports: gr.Dropdown
-    dropdown_external_sources: gr.CheckboxGroup
-    search_only: gr.Checkbox
-    dropdown_audience: gr.Dropdown
-    after: gr.Slider
-    output_query: gr.Textbox
-    output_language: gr.Textbox
-
-
-def create_config_modal():
-    config_open = gr.State(value=True)
-    with Modal(visible=False, elem_id="modal-config") as config_modal:
-        gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
-
-        dropdown_sources = gr.CheckboxGroup(
-            choices=["IPCC", "IPBES", "IPOS"],
-            label="Select source (by default search in all sources)",
-            value=["IPCC"],
-            interactive=True
-        )
-
-        dropdown_reports = gr.Dropdown(
-            choices=POSSIBLE_REPORTS,
-            label="Or select specific reports",
-            multiselect=True,
-            value=None,
-            interactive=True
-        )
-
-        dropdown_external_sources = gr.CheckboxGroup(
-            choices=["Figures (IPCC/IPBES)", "Papers (OpenAlex)", "Graphs (OurWorldInData)","POC region"],
-            label="Select database to search for relevant content",
-            value=["Figures (IPCC/IPBES)","POC region"],
-            interactive=True
-        )
-
-        search_only = gr.Checkbox(
-            label="Search only for recommended content without chating",
-            value=False,
-            interactive=True,
-            elem_id="checkbox-chat"
-        )
-
-        dropdown_audience = gr.Dropdown(
-            choices=["Children", "General public", "Experts"],
-            label="Select audience",
-            value="Experts",
-            interactive=True
-        )
-
-        after = gr.Slider(
-            minimum=1950,
-            maximum=2023,
-            step=1,
-            value=1960,
-            label="Publication date",
-            show_label=True,
-            interactive=True,
-            elem_id="date-papers",
-            visible=False
-        )
-
-        output_query = gr.Textbox(
-            label="Query used for retrieval",
-            show_label=True,
-            elem_id="reformulated-query",
-            lines=2,
-            interactive=False,
-            visible=False
-        )
-
-        output_language = gr.Textbox(
-            label="Language",
-            show_label=True,
-            elem_id="language",
-            lines=1,
-            interactive=False,
-            visible=False
-        )
-
-        dropdown_external_sources.change(
-            lambda x: gr.update(visible="Papers (OpenAlex)" in x),
-            inputs=[dropdown_external_sources],
-            outputs=[after]
-        )
-
-        close_config_modal_button = gr.Button("Validate and Close", elem_id="close-config-modal")
-
-
-    # return ConfigPanel(
-    #     config_open=config_open,
-    #     config_modal=config_modal,
-    #     dropdown_sources=dropdown_sources,
-    #     dropdown_reports=dropdown_reports,
-    #     dropdown_external_sources=dropdown_external_sources,
-    #     search_only=search_only,
-    #     dropdown_audience=dropdown_audience,
-    #     after=after,
-    #     output_query=output_query,
-    #     output_language=output_language
-    # )
-    return {
-        "config_open" : config_open,
-        "config_modal": config_modal,
-        "dropdown_sources": dropdown_sources,
-        "dropdown_reports": dropdown_reports,
-        "dropdown_external_sources": dropdown_external_sources,
-        "search_only": search_only,
-        "dropdown_audience": dropdown_audience,
-        "after": after,
-        "output_query": output_query,
-        "output_language": output_language,
-        "close_config_modal_button": close_config_modal_button
-    }
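One detail worth noting in this deleted modal: the publication-date slider starts hidden and is shown only while "Papers (OpenAlex)" is ticked, via a `.change` listener that returns `gr.update(visible=...)`. The pattern in isolation:

```python
import gradio as gr

with gr.Blocks() as demo:
    sources = gr.CheckboxGroup(
        choices=["Figures (IPCC/IPBES)", "Papers (OpenAlex)", "Graphs (OurWorldInData)"],
        value=["Figures (IPCC/IPBES)"],
        label="Select database to search for relevant content",
    )
    after = gr.Slider(1950, 2023, value=1960, step=1,
                      label="Publication date", visible=False)

    # Reveal the slider only when the OpenAlex source is selected.
    sources.change(
        lambda x: gr.update(visible="Papers (OpenAlex)" in x),
        inputs=[sources],
        outputs=[after],
    )

demo.launch()
```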
front/tabs/tab_examples.py DELETED
@@ -1,40 +0,0 @@
-import gradio as gr
-from climateqa.sample_questions import QUESTIONS
-
-
-def create_examples_tab():
-    examples_hidden = gr.Textbox(visible=False, elem_id=f"examples-hidden")
-    first_key = list(QUESTIONS.keys())[0]
-    dropdown_samples = gr.Dropdown(
-        choices=QUESTIONS.keys(),
-        value=first_key,
-        interactive=True,
-        label="Select a category of sample questions",
-        elem_id="dropdown-samples"
-    )
-
-    samples = []
-    for i, key in enumerate(QUESTIONS.keys()):
-        examples_visible = (i == 0)
-        with gr.Row(visible=examples_visible) as group_examples:
-            examples_questions = gr.Examples(
-                examples=QUESTIONS[key],
-                inputs=[examples_hidden],
-                examples_per_page=8,
-                run_on_click=False,
-                elem_id=f"examples{i}",
-                api_name=f"examples{i}"
-            )
-        samples.append(group_examples)
-
-
-    def change_sample_questions(key):
-        index = list(QUESTIONS.keys()).index(key)
-        visible_bools = [False] * len(samples)
-        visible_bools[index] = True
-        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-
-    # event listener
-    dropdown_samples.change(change_sample_questions, dropdown_samples, samples)
-
-    return examples_hidden
front/tabs/tab_figures.py DELETED
@@ -1,31 +0,0 @@
-import gradio as gr
-from gradio_modal import Modal
-
-
-def create_figures_tab():
-    sources_raw = gr.State()
-    new_figures = gr.State([])
-    used_figures = gr.State([])
-
-    with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
-        gallery_component = gr.Gallery(
-            object_fit='scale-down',
-            elem_id="gallery-component",
-            height="80vh"
-        )
-
-    show_full_size_figures = gr.Button(
-        "Show figures in full size",
-        elem_id="show-figures",
-        interactive=True
-    )
-    show_full_size_figures.click(
-        lambda: Modal(visible=True),
-        None,
-        figure_modal
-    )
-
-    figures_cards = gr.HTML(show_label=False, elem_id="sources-figures")
-
-    return sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal
-