Prakhar Bhandari
committed on
Commit
·
babec93
1
Parent(s):
ae26cc4
Modular v2.0
Browse files
- kg_builder/src/__pycache__/api_connections.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/models.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/query_graph.cpython-39.pyc +0 -0
- kg_builder/src/__pycache__/utils.cpython-39.pyc +0 -0
- kg_builder/src/api_connections.py +54 -0
- kg_builder/src/knowledge_graph_builder.py +3 -106
- kg_builder/src/models.py +27 -0
- kg_builder/src/utils.py +40 -0
kg_builder/src/__pycache__/api_connections.cpython-39.pyc
CHANGED
Binary files a/kg_builder/src/__pycache__/api_connections.cpython-39.pyc and b/kg_builder/src/__pycache__/api_connections.cpython-39.pyc differ
|
|
kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc
CHANGED
Binary files a/kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc and b/kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc differ
|
|
kg_builder/src/__pycache__/models.cpython-39.pyc
ADDED
Binary file (1.65 kB). View file
|
|
kg_builder/src/__pycache__/query_graph.cpython-39.pyc
CHANGED
Binary files a/kg_builder/src/__pycache__/query_graph.cpython-39.pyc and b/kg_builder/src/__pycache__/query_graph.cpython-39.pyc differ
|
|
kg_builder/src/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (1.66 kB). View file
|
|
kg_builder/src/api_connections.py
CHANGED
@@ -2,6 +2,12 @@
|
|
2 |
from langchain_community.graphs import Neo4jGraph
|
3 |
from dotenv import load_dotenv
|
4 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
load_dotenv() # This loads the variables from .env into os.environ
|
7 |
|
@@ -17,4 +23,52 @@ graph = Neo4jGraph(
|
|
17 |
password=password
|
18 |
)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from langchain_community.graphs import Neo4jGraph
|
3 |
from dotenv import load_dotenv
|
4 |
import os
|
5 |
+
from langchain.chains.openai_functions import create_structured_output_chain
|
6 |
+
from langchain_openai import ChatOpenAI
|
7 |
+
from langchain.prompts import ChatPromptTemplate
|
8 |
+
from models import KnowledgeGraph
|
9 |
+
from typing import Optional, List
|
10 |
+
|
11 |
|
12 |
load_dotenv() # This loads the variables from .env into os.environ
|
13 |
|
|
|
23 |
password=password
|
24 |
)
|
25 |
|
26 |
+
def get_llm():
|
27 |
+
api_key = os.getenv("OPENAI_API_KEY")
|
28 |
+
if not api_key:
|
29 |
+
raise ValueError("No OpenAI API key found in environment variables.")
|
30 |
+
return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
|
31 |
+
|
32 |
+
def get_extraction_chain(
|
33 |
+
allowed_nodes: Optional[List[str]] = None,
|
34 |
+
allowed_rels: Optional[List[str]] = None
|
35 |
+
):
|
36 |
+
llm = get_llm()
|
37 |
+
prompt = ChatPromptTemplate.from_messages(
|
38 |
+
[(
|
39 |
+
"system",
|
40 |
+
f"""# Knowledge Graph Instructions for GPT-4
|
41 |
+
## 1. Overview
|
42 |
+
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
|
43 |
+
- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
|
44 |
+
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
|
45 |
+
|
46 |
+
## 2. Labeling Nodes
|
47 |
+
- **Consistency**: Utilize uniform labels for node types to maintain clarity.
|
48 |
+
- For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
|
49 |
+
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
|
50 |
+
|
51 |
+
{'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""}
|
52 |
+
{'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""}
|
53 |
+
|
54 |
+
## 3. Handling Numerical Data and Dates
|
55 |
+
- Integrate numerical data and dates as attributes of the corresponding nodes.
|
56 |
+
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
|
57 |
+
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
|
58 |
+
|
59 |
+
## 4. Coreference Resolution
|
60 |
+
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
|
61 |
+
- For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
|
62 |
+
|
63 |
+
## 5. Relationship Naming Conventions
|
64 |
+
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
|
65 |
+
- For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
|
66 |
+
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
|
67 |
|
68 |
+
## 6. Strict Compliance
|
69 |
+
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
|
70 |
+
"""),
|
71 |
+
("human", "Use the given format to extract information from the following input: {input}"),
|
72 |
+
("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
|
73 |
+
])
|
74 |
+
return create_structured_output_chain(KnowledgeGraph, llm, prompt)
|
kg_builder/src/knowledge_graph_builder.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
|
2 |
-
# Add to knowledge_graph_builder.py
|
3 |
from api_connections import graph
|
4 |
|
5 |
from langchain_community.graphs.graph_document import (
|
@@ -10,118 +9,16 @@ from langchain_community.graphs.graph_document import (
|
|
10 |
from langchain.schema import Document
|
11 |
from typing import List, Dict, Any, Optional
|
12 |
from langchain.pydantic_v1 import Field, BaseModel
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
key: str = Field(..., description="key")
|
17 |
-
value: str = Field(..., description="value")
|
18 |
-
|
19 |
-
class Node(BaseNode):
|
20 |
-
properties: Optional[List[Property]] = Field(
|
21 |
-
None, description="List of node properties")
|
22 |
-
|
23 |
-
class Relationship(BaseRelationship):
|
24 |
-
properties: Optional[List[Property]] = Field(
|
25 |
-
None, description="List of relationship properties"
|
26 |
-
)
|
27 |
-
|
28 |
-
class KnowledgeGraph(BaseModel):
|
29 |
-
"""Generate a knowledge graph with entities and relationships."""
|
30 |
-
nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
|
31 |
-
rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")
|
32 |
-
|
33 |
-
def format_property_key(s: str) -> str:
|
34 |
-
words = s.split()
|
35 |
-
if not words:
|
36 |
-
return s
|
37 |
-
first_word = words[0].lower()
|
38 |
-
capitalized_words = [word.capitalize() for word in words[1:]]
|
39 |
-
return "".join([first_word] + capitalized_words)
|
40 |
-
|
41 |
-
def props_to_dict(props) -> dict:
|
42 |
-
"""Convert properties to a dictionary."""
|
43 |
-
properties = {}
|
44 |
-
if not props:
|
45 |
-
return properties
|
46 |
-
for p in props:
|
47 |
-
properties[format_property_key(p.key)] = p.value
|
48 |
-
return properties
|
49 |
-
|
50 |
-
def map_to_base_node(node: Node) -> BaseNode:
|
51 |
-
"""Map the KnowledgeGraph Node to the base Node."""
|
52 |
-
properties = props_to_dict(node.properties) if node.properties else {}
|
53 |
-
properties["name"] = node.id.title() # Assuming nodes have an 'id' attribute for this operation
|
54 |
-
return BaseNode(
|
55 |
-
id=node.id.title(), type=node.type.capitalize(), properties=properties
|
56 |
-
)
|
57 |
-
|
58 |
-
def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
|
59 |
-
"""Map the KnowledgeGraph Relationship to the base Relationship."""
|
60 |
-
source = map_to_base_node(rel.source)
|
61 |
-
target = map_to_base_node(rel.target)
|
62 |
-
properties = props_to_dict(rel.properties) if rel.properties else {}
|
63 |
-
return BaseRelationship(
|
64 |
-
source=source, target=target, type=rel.type, properties=properties
|
65 |
-
)
|
66 |
-
|
67 |
-
import os
|
68 |
-
from dotenv import load_dotenv
|
69 |
-
load_dotenv() # This loads the variables from .env into os.environ
|
70 |
|
71 |
from langchain.chains.openai_functions import (
|
72 |
create_openai_fn_chain,
|
73 |
create_structured_output_runnable,
|
74 |
create_structured_output_chain,
|
75 |
)
|
76 |
-
from langchain_openai import ChatOpenAI
|
77 |
-
from langchain.prompts import ChatPromptTemplate
|
78 |
-
|
79 |
-
# Setting the OpenAI API key for usage in LLM calls
|
80 |
-
os.environ["OPENAI_API_KEY"]
|
81 |
-
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
|
82 |
-
|
83 |
-
def get_extraction_chain(
|
84 |
-
allowed_nodes: Optional[List[str]] = None,
|
85 |
-
allowed_rels: Optional[List[str]] = None
|
86 |
-
):
|
87 |
-
prompt = ChatPromptTemplate.from_messages(
|
88 |
-
[(
|
89 |
-
"system",
|
90 |
-
f"""# Knowledge Graph Instructions for GPT-4
|
91 |
-
## 1. Overview
|
92 |
-
You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
|
93 |
-
- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
|
94 |
-
- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
|
95 |
-
|
96 |
-
## 2. Labeling Nodes
|
97 |
-
- **Consistency**: Utilize uniform labels for node types to maintain clarity.
|
98 |
-
- For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
|
99 |
-
- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
|
100 |
-
|
101 |
-
{'- **Allowed Node Labels:**' + ", ".join(['Drug', 'Symptom', 'Treatment', 'MedicalCondition', 'ResearchStudy']) if allowed_nodes else ""}
|
102 |
-
{'- **Allowed Relationship Types**:' + ", ".join(['Treats', 'Causes', 'Researches', 'Recommends']) if allowed_rels else ""}
|
103 |
-
|
104 |
-
## 3. Handling Numerical Data and Dates
|
105 |
-
- Integrate numerical data and dates as attributes of the corresponding nodes.
|
106 |
-
- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
|
107 |
-
- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
|
108 |
-
|
109 |
-
## 4. Coreference Resolution
|
110 |
-
- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
|
111 |
-
- For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
|
112 |
-
|
113 |
-
## 5. Relationship Naming Conventions
|
114 |
-
- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
|
115 |
-
- For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT", use "CAN_RESULT_FROM" instead of "CANRESULTFROM" etc. You keep making the same mistakes of storing the relationships without the "_" in between the words. Any further similar errors will lead to termination.
|
116 |
-
- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
|
117 |
|
118 |
-
## 6. Strict Compliance
|
119 |
-
Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
|
120 |
-
"""),
|
121 |
-
("human", "Use the given format to extract information from the following input: {input}"),
|
122 |
-
("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
|
123 |
-
])
|
124 |
-
return create_structured_output_chain(KnowledgeGraph, llm, prompt)
|
125 |
|
126 |
def extract_and_store_graph(
|
127 |
document: Document,
|
|
|
1 |
|
|
|
2 |
from api_connections import graph
|
3 |
|
4 |
from langchain_community.graphs.graph_document import (
|
|
|
9 |
from langchain.schema import Document
|
10 |
from typing import List, Dict, Any, Optional
|
11 |
from langchain.pydantic_v1 import Field, BaseModel
|
12 |
+
from models import Node, Relationship, KnowledgeGraph
|
13 |
+
from utils import map_to_base_node, map_to_base_relationship
|
14 |
+
from api_connections import get_extraction_chain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
from langchain.chains.openai_functions import (
|
17 |
create_openai_fn_chain,
|
18 |
create_structured_output_runnable,
|
19 |
create_structured_output_chain,
|
20 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def extract_and_store_graph(
|
24 |
document: Document,
|
kg_builder/src/models.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.graphs.graph_document import (
|
2 |
+
Node as BaseNode,
|
3 |
+
Relationship as BaseRelationship,
|
4 |
+
GraphDocument,
|
5 |
+
)
|
6 |
+
from langchain.schema import Document
|
7 |
+
from typing import List, Dict, Any, Optional
|
8 |
+
from langchain.pydantic_v1 import Field, BaseModel
|
9 |
+
|
10 |
+
class Property(BaseModel):
|
11 |
+
"""A single property consisting of key and value"""
|
12 |
+
key: str = Field(..., description="key")
|
13 |
+
value: str = Field(..., description="value")
|
14 |
+
|
15 |
+
class Node(BaseNode):
|
16 |
+
properties: Optional[List[Property]] = Field(
|
17 |
+
None, description="List of node properties")
|
18 |
+
|
19 |
+
class Relationship(BaseRelationship):
|
20 |
+
properties: Optional[List[Property]] = Field(
|
21 |
+
None, description="List of relationship properties"
|
22 |
+
)
|
23 |
+
|
24 |
+
class KnowledgeGraph(BaseModel):
|
25 |
+
"""Generate a knowledge graph with entities and relationships."""
|
26 |
+
nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
|
27 |
+
rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")
|
kg_builder/src/utils.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.graphs.graph_document import (
|
2 |
+
Node as BaseNode,
|
3 |
+
Relationship as BaseRelationship,
|
4 |
+
GraphDocument,
|
5 |
+
)
|
6 |
+
from models import Node, Relationship
|
7 |
+
|
8 |
+
def format_property_key(s: str) -> str:
|
9 |
+
words = s.split()
|
10 |
+
if not words:
|
11 |
+
return s
|
12 |
+
first_word = words[0].lower()
|
13 |
+
capitalized_words = [word.capitalize() for word in words[1:]]
|
14 |
+
return "".join([first_word] + capitalized_words)
|
15 |
+
|
16 |
+
def props_to_dict(props) -> dict:
|
17 |
+
"""Convert properties to a dictionary."""
|
18 |
+
properties = {}
|
19 |
+
if not props:
|
20 |
+
return properties
|
21 |
+
for p in props:
|
22 |
+
properties[format_property_key(p.key)] = p.value
|
23 |
+
return properties
|
24 |
+
|
25 |
+
def map_to_base_node(node: Node) -> BaseNode:
|
26 |
+
"""Map the KnowledgeGraph Node to the base Node."""
|
27 |
+
properties = props_to_dict(node.properties) if node.properties else {}
|
28 |
+
properties["name"] = node.id.title() # Assuming nodes have an 'id' attribute for this operation
|
29 |
+
return BaseNode(
|
30 |
+
id=node.id.title(), type=node.type.capitalize(), properties=properties
|
31 |
+
)
|
32 |
+
|
33 |
+
def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
|
34 |
+
"""Map the KnowledgeGraph Relationship to the base Relationship."""
|
35 |
+
source = map_to_base_node(rel.source)
|
36 |
+
target = map_to_base_node(rel.target)
|
37 |
+
properties = props_to_dict(rel.properties) if rel.properties else {}
|
38 |
+
return BaseRelationship(
|
39 |
+
source=source, target=target, type=rel.type, properties=properties
|
40 |
+
)
|