In [1]:
import getpass
import logging
import os
import sys

from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Credentials must never be embedded in the notebook: read the key from the
# environment and only prompt (without echo) when it is missing.
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and revoked with the provider.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Send log records to stdout so they show up in the cell output.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output


# define LLM
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo-0125")
Settings.chunk_size = 512

In [21]:
!pip install langchain neo4j openai wikipedia tiktoken langchain_openai



In [2]:
from langchain.graphs import Neo4jGraph

import getpass
import os

# Connection settings come from the environment so that no secret lives in the
# notebook. The URI and username fall back to the values this notebook used
# before; the password has no default and is prompted for (without echo) when
# NEO4J_PASSWORD is not set.
# NOTE(review): a password that was previously committed in this cell should be
# treated as leaked and rotated.
url = os.environ.get("NEO4J_URI", "neo4j+s://2f409740.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [4]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# Schema for one key/value pair; Node and Relationship below carry lists of these.
# Both fields are required strings.
# NOTE(review): the class docstring and the Field descriptions are presumably
# included in the schema sent to the LLM - treat them as runtime text and do
# not reword them casually.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Extraction-side node: re-declares the inherited `properties` field as an
# optional list of Property items (default None). map_to_base_node() later
# converts that list into the dict that BaseNode is constructed with.
# Deliberately documented with comments rather than a docstring: a docstring
# would presumably change the schema description shown to the LLM - confirm.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-side relationship: re-declares the inherited `properties` field as
# an optional list of Property items (default None). map_to_base_relationship()
# later converts that list into the dict that BaseRelationship is built with.
# Documented with comments rather than a docstring so the class's schema
# description is left exactly as inherited.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Output schema handed to create_structured_output_chain() in
# get_extraction_chain(); extract_and_store_graph() reads `.nodes` and `.rels`
# from the parsed result. Both lists are required fields.
# NOTE(review): the docstring below is presumably used as the schema/function
# description seen by the LLM - it is runtime text, not just documentation.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [5]:
def format_property_key(s: str) -> str:
    """Collapse a whitespace-separated phrase into a single camelCase-style key.

    The first word is lowercased entirely; every following word is passed
    through ``str.capitalize`` (first letter upper, remainder lower) and the
    pieces are concatenated without separators.

    A string with no words (empty or whitespace only) is returned unchanged.
    Note that a single word is lowercased as a whole, so an input that is
    already camelCase (``"approvedYear"``) comes back as ``"approvedyear"``.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s
    head, *tail = tokens
    return head.lower() + "".join(map(str.capitalize, tail))

def props_to_dict(props) -> dict:
    """Build a plain dict from an iterable of key/value property objects.

    Each item's ``key`` is normalised with ``format_property_key`` and mapped to
    the item's ``value`` unchanged. A falsy ``props`` (``None`` or an empty
    list) yields an empty dict. If two items normalise to the same key, the
    later item wins.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into a ``BaseNode``.

    The property list (if any) becomes a dict via ``props_to_dict``; a ``name``
    entry holding the title-cased id is then set on that dict, replacing any
    extracted property that normalised to ``name``. The resulting node uses the
    title-cased id and ``node.type.capitalize()`` as its type.
    """
    if node.properties:
        attributes = props_to_dict(node.properties)
    else:
        attributes = {}
    # Add name property for better Cypher statement generation
    attributes["name"] = node.id.title()
    node_id = node.id.title()
    node_label = node.type.capitalize()
    return BaseNode(id=node_id, type=node_label, properties=attributes)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into a ``BaseRelationship``.

    Both endpoints go through ``map_to_base_node``; the relationship type is
    passed through unchanged; the property list (if any) becomes a dict via
    ``props_to_dict``.
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    edge_props = {}
    if rel.properties:
        edge_props = props_to_dict(rel.properties)
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=edge_props,
    )

In [17]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# The API key is taken from the environment rather than being written into the
# notebook; prompt for it (without echo) only when it is not already set.
# NOTE(review): a key that was previously committed here should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph from text.

    Args:
        allowed_nodes: Optional node labels to list in the system prompt. When
            given (and non-empty) an "Allowed Node Labels" line naming exactly
            these labels is added; otherwise that line is left empty.
        allowed_rels: Optional relationship types to list in the system prompt,
            handled the same way as ``allowed_nodes``.

    Returns:
        The chain created by ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt built
        here. Callers in this notebook read the parsed graph from the
        ``'function'`` key of the ``invoke()`` result.
    """
    # Render the optional restriction lines from the caller's own lists. The
    # parentheses make the intended grouping explicit: the whole line (header
    # plus labels) is dropped when no restriction was requested.
    allowed_nodes_line = (
        ("- **Allowed Node Labels:**" + ", ".join(allowed_nodes)) if allowed_nodes else ""
    )
    allowed_rels_line = (
        ("- **Allowed Relationship Types**:" + ", ".join(allowed_rels)) if allowed_rels else ""
    )
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
        f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.

## 2. Labeling Nodes
- **Consistency**: Utilize uniform labels for node types to maintain clarity.
  - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.

{allowed_nodes_line}
{allowed_rels_line}

## 3. Handling Numerical Data and Dates
- Integrate numerical data and dates as attributes of the corresponding nodes.
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.

## 4. Coreference Resolution
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
  - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.

## 5. Relationship Naming Conventions
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
  - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc.
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.

## 6. Strict Compliance
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
        """),
            # `{input}` is a prompt-template variable filled in at invoke time.
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [18]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from one document and write it to ``graph``.

    Args:
        document: The chunk to process; its ``page_content`` is sent to the
            extraction chain and the document itself is recorded as the source.
        nodes: Forwarded to ``get_extraction_chain`` as ``allowed_nodes``.
        rels: Forwarded to ``get_extraction_chain`` as ``allowed_rels``.
    """
    # Run the extraction chain; the parsed result sits under the 'function' key.
    chain = get_extraction_chain(nodes, rels)
    response = chain.invoke(document.page_content)
    knowledge_graph = response['function']

    # Convert the extracted nodes and relationships to the base graph types.
    base_nodes = []
    for extracted_node in knowledge_graph.nodes:
        base_nodes.append(map_to_base_node(extracted_node))
    base_relationships = []
    for extracted_rel in knowledge_graph.rels:
        base_relationships.append(map_to_base_relationship(extracted_rel))

    # Persist everything through the module-level graph connection.
    graph.add_graph_documents(
        [
            GraphDocument(
                nodes=base_nodes,
                relationships=base_relationships,
                source=document,
            )
        ]
    )

In [21]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Load Wikipedia documents for the query "Chemotherapy"
raw_documents = WikipediaLoader(query="Chemotherapy").load()
# Define the chunking strategy (chunk_size / chunk_overlap are presumably counted in tokens - confirm)
text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)

# Only split the first five of the loaded documents
documents = text_splitter.split_documents(raw_documents[:5])

In [22]:
from tqdm import tqdm

# Extract and store a graph for every chunk, with a progress bar.
# Iterates the list directly: no loop index is needed, and `total` keeps the
# bar's length explicit.
for d in tqdm(documents, total=len(documents)):
    extract_and_store_graph(d)

  0%|          | 0/5 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 20%|██        | 1/5 [01:11<04:45, 71.44s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 40%|████      | 2/5 [01:25<01:53, 37.82s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 60%|██████    | 3/5 [01:33<00:48, 24.24s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 80%|████████  | 4/5 [01:49<00:20, 20.99s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████| 5/5 [01:52<00:00, 22.58s/it]


In [14]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# Refresh the graph object's schema before building the chain (presumably so
# the chain sees the labels and relationship types currently in the database).
# NOTE(review): this cell's execution count (14) is lower than the extraction
# run's (22), so in the recorded session the schema was refreshed BEFORE the
# last extraction - re-run this cell after extraction, or Restart & Run All.
graph.refresh_schema()

# Question-answering chain over the graph. Two separate models are configured:
# `cypher_llm` (gpt-4) and `qa_llm` (gpt-3.5-turbo-16k) - judging by the
# parameter names, the first writes the Cypher query and the second phrases
# the final answer. verbose=True prints the generated Cypher and the context.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

In [23]:
# Ask a natural-language question; the chain returns a dict with 'query' and 'result'.
# NOTE(review): in the recorded run the generated Cypher matched nothing (empty
# "Full Context"), so the answer is a fallback - check the generated label and
# relationship names against the actual graph schema.
cypher_chain.invoke({"query": "What are the different treatment strategies for cancer?"})



[1m> Entering new GraphCypherQAChain chain...[0m
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Generated Cypher:
[32;1m[1;3mMATCH (c:Condition {name: "Cancer"})-[:CANRESULTFROM]->(t:Treatment) RETURN t.name[0m
Full Context:
[32;1m[1;3m[][0m
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

[1m> Finished chain.[0m


{'query': 'What are the different treatment strategies for cancer?',
 'result': "I'm sorry, but I don't have the information to answer that question."}