import logging from langchain_community.graphs import Neo4jGraph from dotenv import load_dotenv import os from langchain.chains.openai_functions import create_structured_output_chain from langchain_openai import ChatOpenAI from langchain.prompts import ChatPromptTemplate from models import KnowledgeGraph from typing import Optional, List logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) load_dotenv() # This loads the variables from .env into os.environ def get_graph_connection(data_source_name): """ Get Neo4j graph connection :param data_source_name: Data source name, e.g. "Traffic Law" :return: Neo4j graph connection """ if data_source_name == "Chemotherapy": url = os.getenv("CHEMO_NEO4J_URL") username = os.getenv("CHEMO_NEO4J_USERNAME") password = os.getenv("CHEMO_NEO4J_PASSWORD") elif data_source_name == "Traffic Law": url = os.getenv("TRAFFIC_NEO4J_URL") username = os.getenv("TRAFFIC_NEO4J_USERNAME") password = os.getenv("TRAFFIC_NEO4J_PASSWORD") elif data_source_name == "SquirroDocs": url = os.getenv("TRAFFIC_NEO4J_URL") username = os.getenv("TRAFFIC_NEO4J_USERNAME") password = os.getenv("TRAFFIC_NEO4J_PASSWORD") else: errorMsg = f"No such Data Source connection configured: {data_source_name}" raise ValueError(errorMsg) return Neo4jGraph(url=url, username=username, password=password) openai_api_key = os.getenv("OPENAI_API_KEY") def get_llm(): api_key = os.getenv("OPENAI_API_KEY") if not api_key: raise ValueError("No OpenAI API key found in environment variables.") return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0) def get_extraction_chain( data_source_name, allowed_nodes: Optional[List[str]] = None, allowed_rels: Optional[List[str]] = None ): """ TODO :param data_source_name: Data source name, e.g. "Traffic Law" :param allowed_nodes: TODO :param allowed_rels: TODO :return: TODO """ if data_source_name == "Chemotherapy": # Chemotherapy-specific prompt prompt_text = f"""# Knowledge Graph Instructions for GPT-4 ## 1. Overview You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments. - **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts. - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research. ## 2. Labeling Nodes - **Consistency**: Utilize uniform labels for node types to maintain clarity. - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**. - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text. {'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""} {'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""} ## 3. Handling Numerical Data and Dates - Integrate numerical data and dates as attributes of the corresponding nodes. - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes. - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`. ## 4. Coreference Resolution - **Entity Consistency**: Guarantee uniform identification of each entity across the graph. - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID. ## 5. Relationship Naming Conventions - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability. - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination. - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances. ## 6. Strict Compliance Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard. """ elif data_source_name == "SquirroDocs": # Squirro Docs-specific prompt prompt_text = f"""# Knowledge Graph Instructions for GPT-4 ## 1. Overview You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about Squirro documentation. - **Nodes** symbolize entities such as types of traffic violations, penalties, driving regulations, and relevant legal statutes. - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for legal professionals, law enforcement agencies, and the general public. ## 2. Labeling Nodes - **Consistency**: Utilize uniform labels for node types to maintain clarity. - For instance, consistently label violations as **"Violation"**, penalties as **"Penalty"**, and statutes as **"Statute"**. - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text. {'- **Allowed Node Labels:**' + ", ".join(['Violation', 'Penalty', 'Statute', 'VehicleType', 'LegalDocument']) if allowed_nodes else ""} {'- **Allowed Relationship Types**:' + ", ".join(['Violates', 'Penalizes', 'Governs', 'Cites']) if allowed_rels else ""} ## 3. Handling Numerical Data and Dates - Integrate numerical data and dates as attributes of the corresponding nodes. - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes. - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `fineAmount`, `lawEffectiveDate`. ## 4. Coreference Resolution - **Entity Consistency**: Guarantee uniform identification of each entity across the graph. - For example, if "Vehicle Code 22350" and "Speed Law" reference the same statute, uniformly apply "Vehicle Code 22350" as the node ID. ## 5. Relationship Naming Conventions - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability. - For instance, use "IS_PENALIZED_BY" instead of "ISPENALIZEDBY", use "IS_GOVERNED_BY" instead of "ISGOVERNEDBY" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination. - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "REQUIRES" or "PROHIBITS" for legal requirements or prohibitions. ## 6. Strict Compliance Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard. """ elif data_source_name == "Traffic Law": # Traffic Law-specific prompt prompt_text = f"""# Knowledge Graph Instructions for GPT-4 ## 1. Overview You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about traffic laws and regulations in the United States. - **Nodes** symbolize entities such as types of traffic violations, penalties, driving regulations, and relevant legal statutes. - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for legal professionals, law enforcement agencies, and the general public. ## 2. Labeling Nodes - **Consistency**: Utilize uniform labels for node types to maintain clarity. - For instance, consistently label violations as **"Violation"**, penalties as **"Penalty"**, and statutes as **"Statute"**. - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text. {'- **Allowed Node Labels:**' + ", ".join(['Violation', 'Penalty', 'Statute', 'VehicleType', 'LegalDocument']) if allowed_nodes else ""} {'- **Allowed Relationship Types**:' + ", ".join(['Violates', 'Penalizes', 'Governs', 'Cites']) if allowed_rels else ""} ## 3. Handling Numerical Data and Dates - Integrate numerical data and dates as attributes of the corresponding nodes. - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes. - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `fineAmount`, `lawEffectiveDate`. ## 4. Coreference Resolution - **Entity Consistency**: Guarantee uniform identification of each entity across the graph. - For example, if "Vehicle Code 22350" and "Speed Law" reference the same statute, uniformly apply "Vehicle Code 22350" as the node ID. ## 5. Relationship Naming Conventions - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability. - For instance, use "IS_PENALIZED_BY" instead of "ISPENALIZEDBY", use "IS_GOVERNED_BY" instead of "ISGOVERNEDBY" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination. - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "REQUIRES" or "PROHIBITS" for legal requirements or prohibitions. ## 6. Strict Compliance Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard. """ else: errorMsg = f"No prompt configured for Data Source ¨{data_source_name}¨!" raise ValueError(errorMsg) logger.info(f"Prompt to extract graph data: {prompt_text}") llm = get_llm() prompt = ChatPromptTemplate.from_messages( [( "system",prompt_text), ("human", "Use the given format to extract information from the following input: {input}"), ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."), ]) return create_structured_output_chain(KnowledgeGraph, llm, prompt)