# File size: 11,101 Bytes

import logging

from langchain_community.graphs import Neo4jGraph
from dotenv import load_dotenv
import os
from langchain.chains.openai_functions import create_structured_output_chain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from models import KnowledgeGraph
from typing import Optional, List

# Configure the root logger at import time: INFO level, with each record
# rendered as "<logger name> - <level> - <message>".
logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
# Module-level logger named after this module; pinned to INFO explicitly.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Must run before any os.getenv(...) lookup in this module, including the
# module-level OPENAI_API_KEY read further down.
load_dotenv()  # This loads the variables from .env into os.environ

def get_graph_connection(data_source_name):
    """
    Open the Neo4j graph connection that belongs to a data source

    The connection settings are read from environment variables named
    ``<PREFIX>_NEO4J_URL``, ``<PREFIX>_NEO4J_USERNAME`` and
    ``<PREFIX>_NEO4J_PASSWORD``, where the prefix depends on the data source.

    :param data_source_name: Data source name, e.g. "Traffic Law"
    :return: Neo4jGraph built from the url/username/password found in the environment
    :raises ValueError: if no connection is configured for the data source
    """
    # (data source name, environment variable prefix) pairs.
    # NOTE(review): "SquirroDocs" reads the TRAFFIC_* variables, i.e. it shares
    # the "Traffic Law" database - confirm this is intended.
    known_sources = (
        ("Chemotherapy", "CHEMO"),
        ("Traffic Law", "TRAFFIC"),
        ("SquirroDocs", "TRAFFIC"),
    )
    env_prefix = next(
        (prefix for source, prefix in known_sources if data_source_name == source),
        None,
    )
    if env_prefix is None:
        raise ValueError(f"No such Data Source connection configured: {data_source_name}")

    # Unset variables come back as None and are passed through unchanged.
    settings = {
        field: os.getenv(f"{env_prefix}_NEO4J_{field.upper()}")
        for field in ("url", "username", "password")
    }
    return Neo4jGraph(**settings)

# Read once at import time (after load_dotenv() above); None when the variable is unset.
# NOTE(review): nothing else in this file reads this name - get_llm() queries the
# environment itself. Presumably kept for external importers; confirm before removing.
openai_api_key = os.getenv("OPENAI_API_KEY")

def get_llm():
    """
    Create the chat model used for graph extraction

    :return: ChatOpenAI configured for "gpt-3.5-turbo-16k" with temperature 0
    :raises ValueError: if OPENAI_API_KEY is missing or empty in the environment
    """
    # The key is only checked for presence here; it is not passed to
    # ChatOpenAI explicitly.
    if os.getenv("OPENAI_API_KEY"):
        return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
    raise ValueError("No OpenAI API key found in environment variables.")

def _allowed_line(heading, labels, requested):
    """Return the "allowed ..." prompt bullet, or "" when `requested` is falsy."""
    return (heading + ", ".join(labels)) if requested else ""


def _chemotherapy_prompt(allowed_nodes, allowed_rels):
    """Return the system prompt text for the "Chemotherapy" data source."""
    node_line = _allowed_line(
        '- **Allowed Node Labels:**',
        ['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy'],
        allowed_nodes,
    )
    rel_line = _allowed_line(
        '- **Allowed Relationship Types**:',
        ['Treats', 'Causes', 'Researches', 'Recommends'],
        allowed_rels,
    )
    # The leading spaces on each line are part of the prompt text sent to the model.
    return f"""# Knowledge Graph Instructions for GPT-4
    ## 1. Overview
    You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
    - **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
    - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.

    ## 2. Labeling Nodes
    - **Consistency**: Utilize uniform labels for node types to maintain clarity.
    - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
    - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
    {node_line}
    {rel_line}

    ## 3. Handling Numerical Data and Dates
    - Integrate numerical data and dates as attributes of the corresponding nodes.
    - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
    - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.

    ## 4. Coreference Resolution
    - **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
    - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.

    ## 5. Relationship Naming Conventions
    - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
    - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
    - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.

    ## 6. Strict Compliance
    Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
    """


def _traffic_style_prompt(topic, allowed_nodes, allowed_rels):
    """Return the traffic-law style system prompt with `topic` as the graph subject.

    Shared by "Traffic Law" and "SquirroDocs", whose prompts differ only in
    the topic phrase of the overview sentence.
    """
    node_line = _allowed_line(
        '- **Allowed Node Labels:**',
        ['Violation', 'Penalty', 'Statute', 'VehicleType', 'LegalDocument'],
        allowed_nodes,
    )
    rel_line = _allowed_line(
        '- **Allowed Relationship Types**:',
        ['Violates', 'Penalizes', 'Governs', 'Cites'],
        allowed_rels,
    )
    # The leading spaces on each line are part of the prompt text sent to the model.
    return f"""# Knowledge Graph Instructions for GPT-4
    ## 1. Overview
    You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about {topic}.
    - **Nodes** symbolize entities such as types of traffic violations, penalties, driving regulations, and relevant legal statutes.
    - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for legal professionals, law enforcement agencies, and the general public.

    ## 2. Labeling Nodes
    - **Consistency**: Utilize uniform labels for node types to maintain clarity.
    - For instance, consistently label violations as **"Violation"**, penalties as **"Penalty"**, and statutes as **"Statute"**.
    - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
    {node_line}
    {rel_line}

    ## 3. Handling Numerical Data and Dates
    - Integrate numerical data and dates as attributes of the corresponding nodes.
    - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
    - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `fineAmount`, `lawEffectiveDate`.

    ## 4. Coreference Resolution
    - **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
    - For example, if "Vehicle Code 22350" and "Speed Law" reference the same statute, uniformly apply "Vehicle Code 22350" as the node ID.

    ## 5. Relationship Naming Conventions
    - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
    - For instance, use "IS_PENALIZED_BY" instead of "ISPENALIZEDBY", use "IS_GOVERNED_BY" instead of "ISGOVERNEDBY" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
    - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "REQUIRES" or "PROHIBITS" for legal requirements or prohibitions.

    ## 6. Strict Compliance
    Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
    """


def get_extraction_chain(
    data_source_name,
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None,
):
    """
    Build the LLM chain that extracts knowledge-graph data from input text

    A data-source specific system prompt is combined with two fixed human
    messages (one carrying the ``{input}`` placeholder) and handed, together
    with the model from get_llm(), to create_structured_output_chain with
    KnowledgeGraph as the output schema.

    :param data_source_name: Data source name, e.g. "Traffic Law"
    :param allowed_nodes: When truthy, a fixed per-data-source list of allowed
        node labels is added to the prompt. NOTE(review): only the truthiness is
        used - the contents of the list are never inserted into the prompt.
    :param allowed_rels: When truthy, a fixed per-data-source list of allowed
        relationship types is added to the prompt (same caveat as allowed_nodes).
    :return: The chain returned by create_structured_output_chain
    :raises ValueError: if no prompt is configured for the data source, or (from
        get_llm) if no OpenAI API key is found in the environment
    """
    if data_source_name == "Chemotherapy":
        prompt_text = _chemotherapy_prompt(allowed_nodes, allowed_rels)
    elif data_source_name == "SquirroDocs":
        # NOTE(review): apart from the topic, this prompt still describes
        # traffic-law entities (violations, penalties, statutes) - it looks like
        # a placeholder copied from "Traffic Law"; confirm the intended wording.
        prompt_text = _traffic_style_prompt(
            "Squirro documentation", allowed_nodes, allowed_rels
        )
    elif data_source_name == "Traffic Law":
        prompt_text = _traffic_style_prompt(
            "traffic laws and regulations in the United States",
            allowed_nodes,
            allowed_rels,
        )
    else:
        raise ValueError(f"No prompt configured for Data Source ¨{data_source_name}¨!")

    # Lazy %-style arguments: the (long) prompt is only formatted into the
    # message if the record is actually emitted.
    logger.info("Prompt to extract graph data: %s", prompt_text)

    llm = get_llm()
    # prompt_text is used as a template string here, so any literal "{" or "}"
    # added to the prompts would be interpreted as a template variable.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", prompt_text),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
        ]
    )
    return create_structured_output_chain(KnowledgeGraph, llm, prompt)