GraphRAG / kg_builder /src /api_connections.py
Prakhar Bhandari
Modular v2.0
babec93
raw
history blame
4.22 kB
from langchain_community.graphs import Neo4jGraph
from dotenv import load_dotenv
import os
from langchain.chains.openai_functions import create_structured_output_chain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from models import KnowledgeGraph
from typing import Optional, List
load_dotenv() # This loads the variables from .env into os.environ
# Now use os.getenv to access your variables
url = os.getenv("NEO4J_URL")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
openai_api_key = os.getenv("OPENAI_API_KEY")
graph = Neo4jGraph(
url=url,
username=username,
password=password
)
def get_llm():
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("No OpenAI API key found in environment variables.")
return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
def get_extraction_chain(
allowed_nodes: Optional[List[str]] = None,
allowed_rels: Optional[List[str]] = None
):
llm = get_llm()
prompt = ChatPromptTemplate.from_messages(
[(
"system",
f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
## 2. Labeling Nodes
- **Consistency**: Utilize uniform labels for node types to maintain clarity.
- For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
{'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Integrate numerical data and dates as attributes of the corresponding nodes.
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
## 4. Coreference Resolution
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
- For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
## 5. Relationship Naming Conventions
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
- For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
## 6. Strict Compliance
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
"""),
("human", "Use the given format to extract information from the following input: {input}"),
("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
])
return create_structured_output_chain(KnowledgeGraph, llm, prompt)