|
|
|
from langchain_community.graphs import Neo4jGraph |
|
from dotenv import load_dotenv |
|
import os |
|
from langchain.chains.openai_functions import create_structured_output_chain |
|
from langchain_openai import ChatOpenAI |
|
from langchain.prompts import ChatPromptTemplate |
|
from models import KnowledgeGraph |
|
from typing import Optional, List |
|
|
|
|
|
load_dotenv() |
|
|
|
def get_graph_connection(category): |
|
if category == "Chemotherapy": |
|
url = os.getenv("CHEMO_NEO4J_URL") |
|
username = os.getenv("CHEMO_NEO4J_USERNAME") |
|
password = os.getenv("CHEMO_NEO4J_PASSWORD") |
|
elif category == "Traffic Law": |
|
url = os.getenv("TRAFFIC_NEO4J_URL") |
|
username = os.getenv("TRAFFIC_NEO4J_USERNAME") |
|
password = os.getenv("TRAFFIC_NEO4J_PASSWORD") |
|
else: |
|
raise ValueError(f"Unknown category: {category}") |
|
|
|
return Neo4jGraph(url=url, username=username, password=password) |
|
|
|
openai_api_key = os.getenv("OPENAI_API_KEY") |
|
|
|
def get_llm(): |
|
api_key = os.getenv("OPENAI_API_KEY") |
|
if not api_key: |
|
raise ValueError("No OpenAI API key found in environment variables.") |
|
return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0) |
|
|
|
def get_extraction_chain( |
|
category, |
|
allowed_nodes: Optional[List[str]] = None, |
|
allowed_rels: Optional[List[str]] = None |
|
): |
|
if category == "Chemotherapy": |
|
|
|
prompt_text = f"""# Knowledge Graph Instructions for GPT-4 |
|
## 1. Overview |
|
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments. |
|
- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts. |
|
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research. |
|
|
|
## 2. Labeling Nodes |
|
- **Consistency**: Utilize uniform labels for node types to maintain clarity. |
|
- For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**. |
|
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text. |
|
{'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""} |
|
{'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""} |
|
|
|
## 3. Handling Numerical Data and Dates |
|
- Integrate numerical data and dates as attributes of the corresponding nodes. |
|
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes. |
|
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`. |
|
|
|
## 4. Coreference Resolution |
|
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph. |
|
- For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID. |
|
|
|
## 5. Relationship Naming Conventions |
|
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability. |
|
- For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination. |
|
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances. |
|
|
|
## 6. Strict Compliance |
|
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard. |
|
""" |
|
|
|
elif category == "Traffic Law": |
|
|
|
prompt_text = f"""# Knowledge Graph Instructions for GPT-4 |
|
## 1. Overview |
|
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about traffic laws and regulations in the United States. |
|
- **Nodes** symbolize entities such as types of traffic violations, penalties, driving regulations, and relevant legal statutes. |
|
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for legal professionals, law enforcement agencies, and the general public. |
|
|
|
## 2. Labeling Nodes |
|
- **Consistency**: Utilize uniform labels for node types to maintain clarity. |
|
- For instance, consistently label violations as **"Violation"**, penalties as **"Penalty"**, and statutes as **"Statute"**. |
|
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text. |
|
{'- **Allowed Node Labels:**' + ", ".join(['Violation', 'Penalty', 'Statute', 'VehicleType', 'LegalDocument']) if allowed_nodes else ""} |
|
{'- **Allowed Relationship Types**:' + ", ".join(['Violates', 'Penalizes', 'Governs', 'Cites']) if allowed_rels else ""} |
|
|
|
## 3. Handling Numerical Data and Dates |
|
- Integrate numerical data and dates as attributes of the corresponding nodes. |
|
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes. |
|
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `fineAmount`, `lawEffectiveDate`. |
|
|
|
## 4. Coreference Resolution |
|
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph. |
|
- For example, if "Vehicle Code 22350" and "Speed Law" reference the same statute, uniformly apply "Vehicle Code 22350" as the node ID. |
|
|
|
## 5. Relationship Naming Conventions |
|
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability. |
|
- For instance, use "IS_PENALIZED_BY" instead of "ISPENALIZEDBY", use "IS_GOVERNED_BY" instead of "ISGOVERNEDBY" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination. |
|
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "REQUIRES" or "PROHIBITS" for legal requirements or prohibitions. |
|
|
|
## 6. Strict Compliance |
|
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard. |
|
""" |
|
|
|
else: |
|
raise ValueError("Unknown category") |
|
|
|
llm = get_llm() |
|
prompt = ChatPromptTemplate.from_messages( |
|
[( |
|
"system",prompt_text), |
|
("human", "Use the given format to extract information from the following input: {input}"), |
|
("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."), |
|
]) |
|
return create_structured_output_chain(KnowledgeGraph, llm, prompt) |
|
|