|
|
|
from langchain_community.graphs import Neo4jGraph |
|
from dotenv import load_dotenv |
|
import os |
|
from langchain.chains.openai_functions import create_structured_output_chain |
|
from langchain_openai import ChatOpenAI |
|
from langchain.prompts import ChatPromptTemplate |
|
from models import KnowledgeGraph |
|
from typing import Optional, List |
|
|
|
|
|
# Load variables from a local .env file (python-dotenv) before any of them are read.
load_dotenv()

# Connection settings and API key taken from the environment; each name is
# None when the corresponding variable is not set.
url = os.environ.get("NEO4J_URL")
username = os.environ.get("NEO4J_USERNAME")
password = os.environ.get("NEO4J_PASSWORD")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Module-level Neo4j graph handle, constructed once at import time.
graph = Neo4jGraph(url=url, username=username, password=password)
|
|
|
def get_llm():
    """Build the chat model used for knowledge-graph extraction.

    Returns:
        A ``ChatOpenAI`` instance configured for ``gpt-3.5-turbo-16k`` with
        ``temperature=0``.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset, empty, or whitespace-only.
    """
    # Strip first so a blank value (e.g. "OPENAI_API_KEY= " in a .env file)
    # is rejected here instead of surfacing later as an authentication error.
    api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
    if not api_key:
        raise ValueError("No OpenAI API key found in environment variables.")

    # Pass the validated key explicitly so the client uses exactly the value
    # that was checked above.
    return ChatOpenAI(
        model="gpt-3.5-turbo-16k",
        temperature=0,
        openai_api_key=api_key,
    )
|
|
|
def _allowed_values_line(heading: str, values: Optional[List[str]]) -> str:
    """Render one optional "allowed values" bullet for the system prompt.

    Returns an empty string when *values* is ``None`` or empty; otherwise
    *heading* followed by the comma-separated values.
    """
    if not values:
        return ""
    # The rendered text ends up inside a prompt template, so literal braces
    # are doubled to keep them from being read as template variables.
    listing = ", ".join(values).replace("{", "{{").replace("}", "}}")
    return heading + listing


def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
):
    """Create the structured-output chain that extracts a ``KnowledgeGraph``.

    Args:
        allowed_nodes: Node labels to list in the prompt as allowed. When
            ``None`` or empty, no node-label restriction line is emitted.
        allowed_rels: Relationship types to list in the prompt as allowed.
            When ``None`` or empty, no relationship restriction line is
            emitted.

    Returns:
        The result of ``create_structured_output_chain(KnowledgeGraph, llm,
        prompt)``; the prompt expects a single ``input`` variable.

    Raises:
        ValueError: Propagated from ``get_llm()`` when no OpenAI API key is
            configured.
    """
    llm = get_llm()

    # Use the caller-supplied lists. Previously the arguments only toggled a
    # hard-coded set of labels/types, so the caller's values never reached
    # the prompt.
    node_labels_line = _allowed_values_line(
        '- **Allowed Node Labels:**', allowed_nodes
    )
    rel_types_line = _allowed_values_line(
        '- **Allowed Relationship Types**:', allowed_rels
    )

    prompt = ChatPromptTemplate.from_messages(
        [(
            "system",
            f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.

## 2. Labeling Nodes
- **Consistency**: Utilize uniform labels for node types to maintain clarity.
- For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.

{node_labels_line}
{rel_types_line}

## 3. Handling Numerical Data and Dates
- Integrate numerical data and dates as attributes of the corresponding nodes.
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.

## 4. Coreference Resolution
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
- For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.

## 5. Relationship Naming Conventions
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
- For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.

## 6. Strict Compliance
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
"""),
            # Not an f-string on purpose: {input} must survive as a template
            # variable for the chain to fill in at run time.
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt)
|
|