Prakhar Bhandari committed
Commit 684e834 · 1 Parent(s): f4cb83e

All files added

kg_builder/notebooks/kg_creation.ipynb ADDED
@@ -0,0 +1,429 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ['OPENAI_API_KEY'] = \"\"\n",
+ "\n",
+ "import logging\n",
+ "import sys\n",
+ "\n",
+ "logging.basicConfig(\n",
+ " stream=sys.stdout, level=logging.INFO\n",
+ ") # logging.DEBUG for more verbose output\n",
+ "\n",
+ "\n",
+ "# define LLM\n",
+ "from llama_index.llms.openai import OpenAI\n",
+ "from llama_index.core import Settings\n",
+ "\n",
+ "Settings.llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\")\n",
+ "Settings.chunk_size = 512"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: langchain in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (0.1.16)\n",
+ "Requirement already satisfied: neo4j in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (5.19.0)\n",
+ "Requirement already satisfied: openai in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (1.23.2)\n",
+ "Requirement already satisfied: wikipedia in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (1.4.0)\n",
+ "Requirement already satisfied: tiktoken in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (0.6.0)\n",
+ "Requirement already satisfied: langchain_openai in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (0.1.3)\n",
+ "Requirement already satisfied: PyYAML>=5.3 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (6.0.1)\n",
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (2.0.29)\n",
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (3.9.5)\n",
+ "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (4.0.3)\n",
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.6.4)\n",
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (1.33)\n",
+ "Requirement already satisfied: langchain-community<0.1,>=0.0.32 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.0.34)\n",
+ "Requirement already satisfied: langchain-core<0.2.0,>=0.1.42 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.1.45)\n",
+ "Requirement already satisfied: langchain-text-splitters<0.1,>=0.0.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.0.1)\n",
+ "Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (0.1.49)\n",
+ "Requirement already satisfied: numpy<2,>=1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (1.26.4)\n",
+ "Requirement already satisfied: pydantic<3,>=1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (2.7.0)\n",
+ "Requirement already satisfied: requests<3,>=2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (2.31.0)\n",
+ "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain) (8.2.3)\n",
+ "Requirement already satisfied: pytz in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from neo4j) (2024.1)\n",
+ "Requirement already satisfied: anyio<5,>=3.5.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (4.3.0)\n",
+ "Requirement already satisfied: distro<2,>=1.7.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (1.9.0)\n",
+ "Requirement already satisfied: httpx<1,>=0.23.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (0.27.0)\n",
+ "Requirement already satisfied: sniffio in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (1.3.1)\n",
+ "Requirement already satisfied: tqdm>4 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (4.66.2)\n",
+ "Requirement already satisfied: typing-extensions<5,>=4.7 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from openai) (4.11.0)\n",
+ "Requirement already satisfied: beautifulsoup4 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from wikipedia) (4.12.3)\n",
+ "Requirement already satisfied: regex>=2022.1.18 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from tiktoken) (2024.4.16)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
+ "Requirement already satisfied: idna>=2.8 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai) (3.7)\n",
+ "Requirement already satisfied: exceptiongroup>=1.0.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai) (1.2.1)\n",
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.21.1)\n",
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
+ "Requirement already satisfied: certifi in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from httpx<1,>=0.23.0->openai) (2024.2.2)\n",
+ "Requirement already satisfied: httpcore==1.* in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from httpx<1,>=0.23.0->openai) (1.0.5)\n",
+ "Requirement already satisfied: h11<0.15,>=0.13 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n",
+ "Requirement already satisfied: jsonpointer>=1.9 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n",
+ "Requirement already satisfied: packaging<24.0,>=23.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langchain-core<0.2.0,>=0.1.42->langchain) (23.2)\n",
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.1)\n",
+ "Requirement already satisfied: annotated-types>=0.4.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from pydantic<3,>=1->langchain) (0.6.0)\n",
+ "Requirement already satisfied: pydantic-core==2.18.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from pydantic<3,>=1->langchain) (2.18.1)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from requests<3,>=2->langchain) (3.3.2)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from requests<3,>=2->langchain) (2.2.1)\n",
+ "Requirement already satisfied: greenlet!=0.4.17 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
+ "Requirement already satisfied: soupsieve>1.2 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from beautifulsoup4->wikipedia) (2.5)\n",
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in /local/home/pbhandari/miniconda3/envs/graph_rag/lib/python3.9/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install langchain neo4j openai wikipedia tiktoken langchain_openai"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain_community.graphs import Neo4jGraph\n",
+ "\n",
+ "url = \"neo4j+s://2f409740.databases.neo4j.io\"\n",
+ "username = \"neo4j\"\n",
+ "password = \"oe7A9ugxhxcuEtwci8khPIt2TTdz_am9AYDx1r9e9Tw\"\n",
+ "graph = Neo4jGraph(\n",
+ " url=url,\n",
+ " username=username,\n",
+ " password=password\n",
+ ")"
+ ]
+ },
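+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick connectivity check (added example, not part of the original run): `Neo4jGraph.query` executes arbitrary Cypher, so a trivial `RETURN 1` confirms the credentials above actually connect before any graph writes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check: a trivial Cypher statement; raises if the URL or\n",
+ "# credentials above are wrong. Expected output: [{'ok': 1}]\n",
+ "print(graph.query(\"RETURN 1 AS ok\"))"
+ ]
+ },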
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain_community.graphs.graph_document import (\n",
+ " Node as BaseNode,\n",
+ " Relationship as BaseRelationship,\n",
+ " GraphDocument,\n",
+ ")\n",
+ "from langchain.schema import Document\n",
+ "from typing import List, Dict, Any, Optional\n",
+ "from langchain.pydantic_v1 import Field, BaseModel\n",
+ "\n",
+ "class Property(BaseModel):\n",
+ " \"\"\"A single property consisting of key and value\"\"\"\n",
+ " key: str = Field(..., description=\"key\")\n",
+ " value: str = Field(..., description=\"value\")\n",
+ "\n",
+ "class Node(BaseNode):\n",
+ " properties: Optional[List[Property]] = Field(\n",
+ " None, description=\"List of node properties\")\n",
+ "\n",
+ "class Relationship(BaseRelationship):\n",
+ " properties: Optional[List[Property]] = Field(\n",
+ " None, description=\"List of relationship properties\"\n",
+ " )\n",
+ "\n",
+ "class KnowledgeGraph(BaseModel):\n",
+ " \"\"\"Generate a knowledge graph with entities and relationships.\"\"\"\n",
+ " nodes: List[Node] = Field(\n",
+ " ..., description=\"List of nodes in the knowledge graph\")\n",
+ " rels: List[Relationship] = Field(\n",
+ " ..., description=\"List of relationships in the knowledge graph\"\n",
+ " )"
+ ]
+ },
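+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To make the schema concrete, here is a hand-built `KnowledgeGraph` (added example with illustrative values, not extracted data): a `Drug` node linked to a `MedicalCondition` node by a `TREATS` relationship, with a camelCase property as the extraction prompt below requires."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hand-built instance of the schema above (illustrative values only)\n",
+ "methotrexate = Node(id=\"Methotrexate\", type=\"Drug\",\n",
+ " properties=[Property(key=\"approvedYear\", value=\"1953\")])\n",
+ "leukemia = Node(id=\"Acute Lymphoblastic Leukemia\", type=\"MedicalCondition\")\n",
+ "kg = KnowledgeGraph(\n",
+ " nodes=[methotrexate, leukemia],\n",
+ " rels=[Relationship(source=methotrexate, target=leukemia, type=\"TREATS\")],\n",
+ ")\n",
+ "print(kg.rels[0].source.id, \"->\", kg.rels[0].target.id)"
+ ]
+ },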
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format_property_key(s: str) -> str:\n",
+ " words = s.split()\n",
+ " if not words:\n",
+ " return s\n",
+ " first_word = words[0].lower()\n",
+ " capitalized_words = [word.capitalize() for word in words[1:]]\n",
+ " return \"\".join([first_word] + capitalized_words)\n",
+ "\n",
+ "def props_to_dict(props) -> dict:\n",
+ " \"\"\"Convert properties to a dictionary.\"\"\"\n",
+ " properties = {}\n",
+ " if not props:\n",
+ " return properties\n",
+ " for p in props:\n",
+ " properties[format_property_key(p.key)] = p.value\n",
+ " return properties\n",
+ "\n",
+ "def map_to_base_node(node: Node) -> BaseNode:\n",
+ " \"\"\"Map the KnowledgeGraph Node to the base Node.\"\"\"\n",
+ " properties = props_to_dict(node.properties) if node.properties else {}\n",
+ " # Add name property for better Cypher statement generation\n",
+ " properties[\"name\"] = node.id.title()\n",
+ " return BaseNode(\n",
+ " id=node.id.title(), type=node.type.capitalize(), properties=properties\n",
+ " )\n",
+ "\n",
+ "\n",
+ "def map_to_base_relationship(rel: Relationship) -> BaseRelationship:\n",
+ " \"\"\"Map the KnowledgeGraph Relationship to the base Relationship.\"\"\"\n",
+ " source = map_to_base_node(rel.source)\n",
+ " target = map_to_base_node(rel.target)\n",
+ " properties = props_to_dict(rel.properties) if rel.properties else {}\n",
+ " return BaseRelationship(\n",
+ " source=source, target=target, type=rel.type, properties=properties\n",
+ " )"
+ ]
+ },
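+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick check of the helpers (added example): `format_property_key` camelCases multi-word keys, and `map_to_base_node` title-cases the id, capitalizes the type, and injects a `name` property, which the later Cypher generation keys on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(format_property_key(\"approved year\")) # -> approvedYear\n",
+ "base = map_to_base_node(Node(id=\"methotrexate\", type=\"drug\",\n",
+ " properties=[Property(key=\"approved year\", value=\"1953\")]))\n",
+ "print(base.id, base.type, base.properties)\n",
+ "# -> Methotrexate Drug {'approvedYear': '1953', 'name': 'Methotrexate'}"
+ ]
+ },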
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from langchain.chains.openai_functions import (\n",
+ " create_structured_output_runnable,\n",
+ ")\n",
+ "from langchain_openai import ChatOpenAI\n",
+ "from langchain.prompts import ChatPromptTemplate\n",
+ "\n",
+ "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
+ "llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\", temperature=0)\n",
+ "\n",
+ "def get_extraction_chain(\n",
+ " allowed_nodes: Optional[List[str]] = None,\n",
+ " allowed_rels: Optional[List[str]] = None\n",
+ " ):\n",
+ " prompt = ChatPromptTemplate.from_messages(\n",
+ " [(\n",
+ " \"system\",\n",
+ " f\"\"\"# Knowledge Graph Instructions for GPT-4\n",
+ "## 1. Overview\n",
+ "You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.\n",
+ "- **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.\n",
+ "- The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.\n",
+ "\n",
+ "## 2. Labeling Nodes\n",
+ "- **Consistency**: Utilize uniform labels for node types to maintain clarity.\n",
+ " - For instance, consistently label drugs as **\"Drug\"**, symptoms as **\"Symptom\"**, and treatments as **\"Treatment\"**.\n",
+ "- **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.\n",
+ "\n",
+ "{'- **Allowed Node Labels:** ' + \", \".join(allowed_nodes) if allowed_nodes else \"\"}\n",
+ "{'- **Allowed Relationship Types:** ' + \", \".join(allowed_rels) if allowed_rels else \"\"}\n",
+ "\n",
+ "## 3. Handling Numerical Data and Dates\n",
+ "- Integrate numerical data and dates as attributes of the corresponding nodes.\n",
+ "- **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.\n",
+ "- **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.\n",
+ "\n",
+ "## 4. Coreference Resolution\n",
+ "- **Entity Consistency**: Guarantee uniform identification of each entity across the graph.\n",
+ " - For example, if \"Methotrexate\" and \"MTX\" reference the same medication, uniformly apply \"Methotrexate\" as the node ID.\n",
+ "\n",
+ "## 5. Relationship Naming Conventions\n",
+ "- **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.\n",
+ " - For instance, use \"HAS_SIDE_EFFECT\" instead of \"HASSIDEEFFECT\" and \"CAN_RESULT_FROM\" instead of \"CANRESULTFROM\"; always keep the \"_\" separator between words.\n",
+ "- **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as \"INHIBITS\" or \"ACTIVATES\" for interactions between substances.\n",
+ "\n",
+ "## 6. Strict Compliance\n",
+ "Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.\n",
+ " \"\"\"),\n",
+ " (\"human\", \"Use the given format to extract information from the following input: {input}\"),\n",
+ " (\"human\", \"Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph.\"),\n",
+ " ])\n",
+ " # The runnable yields a KnowledgeGraph instance directly\n",
+ " return create_structured_output_runnable(KnowledgeGraph, llm, prompt)"
+ ]
+ },
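+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before looping over full Wikipedia chunks, a one-sentence smoke test (added example; needs a valid `OPENAI_API_KEY` and makes a real API call) shows what the chain returns: a `KnowledgeGraph` pydantic object, not a dict."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Smoke test on a single sentence (makes a real OpenAI call)\n",
+ "chain = get_extraction_chain()\n",
+ "kg = chain.invoke({\"input\": \"Methotrexate is a chemotherapy drug used to treat acute lymphoblastic leukemia.\"})\n",
+ "print(type(kg).__name__) # -> KnowledgeGraph\n",
+ "print([n.id for n in kg.nodes]) # extracted entity ids"
+ ]
+ },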
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_and_store_graph(\n",
+ " document: Document,\n",
+ " nodes: Optional[List[str]] = None,\n",
+ " rels: Optional[List[str]] = None) -> None:\n",
+ " # Extract graph data using OpenAI function calling\n",
+ " extract_chain = get_extraction_chain(nodes, rels)\n",
+ " # The runnable returns a KnowledgeGraph directly; subscripting the\n",
+ " # result with ['function'] is what raised the TypeError further down.\n",
+ " data = extract_chain.invoke({\"input\": document.page_content})\n",
+ " # Construct a graph document\n",
+ " graph_document = GraphDocument(\n",
+ " nodes = [map_to_base_node(node) for node in data.nodes],\n",
+ " relationships = [map_to_base_relationship(rel) for rel in data.rels],\n",
+ " source = document\n",
+ " )\n",
+ " # Store the extracted graph in Neo4j\n",
+ " graph.add_graph_documents([graph_document])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain_community.document_loaders import WikipediaLoader\n",
+ "from langchain.text_splitter import TokenTextSplitter\n",
+ "\n",
+ "# Read the Wikipedia article\n",
+ "raw_documents = WikipediaLoader(query=\"Chemotherapy\").load()\n",
+ "# Define chunking strategy\n",
+ "text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)\n",
+ "\n",
+ "# Only take the first five raw documents\n",
+ "documents = text_splitter.split_documents(raw_documents[:5])"
+ ]
+ },
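+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick look at what the splitter produced (added example): at a 4096-token chunk size most Wikipedia pages fit in a single chunk, so `documents` should hold roughly one `Document` per page, each carrying the page title in its metadata."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inspect the chunked documents before extraction\n",
+ "print(len(documents), \"chunks\")\n",
+ "for d in documents[:3]:\n",
+ " print(d.metadata.get(\"title\"), \"-\", len(d.page_content), \"chars\")"
+ ]
+ },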
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/5 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/5 [01:25<?, ?it/s]\n"
+ ]
+ },
+ {
+ "ename": "TypeError",
+ "evalue": "'KnowledgeGraph' object is not subscriptable",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[14], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtqdm\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, d \u001b[38;5;129;01min\u001b[39;00m tqdm(\u001b[38;5;28menumerate\u001b[39m(documents), total\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(documents)):\n\u001b[0;32m----> 4\u001b[0m \u001b[43mextract_and_store_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43md\u001b[49m\u001b[43m)\u001b[49m\n",
+ "Cell \u001b[0;32mIn[12], line 7\u001b[0m, in \u001b[0;36mextract_and_store_graph\u001b[0;34m(document, nodes, rels)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mextract_and_store_graph\u001b[39m(\n\u001b[1;32m 2\u001b[0m document: Document,\n\u001b[1;32m 3\u001b[0m nodes:Optional[List[\u001b[38;5;28mstr\u001b[39m]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 4\u001b[0m rels:Optional[List[\u001b[38;5;28mstr\u001b[39m]]\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Extract graph data using OpenAI functions\u001b[39;00m\n\u001b[1;32m 6\u001b[0m extract_chain \u001b[38;5;241m=\u001b[39m get_extraction_chain(nodes, rels)\n\u001b[0;32m----> 7\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mextract_chain\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocument\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfunction\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Construct a graph document\u001b[39;00m\n\u001b[1;32m 9\u001b[0m graph_document \u001b[38;5;241m=\u001b[39m GraphDocument(\n\u001b[1;32m 10\u001b[0m nodes \u001b[38;5;241m=\u001b[39m [map_to_base_node(node) \u001b[38;5;28;01mfor\u001b[39;00m node \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mnodes],\n\u001b[1;32m 11\u001b[0m relationships \u001b[38;5;241m=\u001b[39m [map_to_base_relationship(rel) \u001b[38;5;28;01mfor\u001b[39;00m rel \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mrels],\n\u001b[1;32m 12\u001b[0m source \u001b[38;5;241m=\u001b[39m document\n\u001b[1;32m 13\u001b[0m )\n",
+ "\u001b[0;31mTypeError\u001b[0m: 'KnowledgeGraph' object is not subscriptable"
+ ]
+ }
+ ],
+ "source": [
+ "from tqdm import tqdm\n",
+ "\n",
+ "for i, d in tqdm(enumerate(documents), total=len(documents)):\n",
+ " extract_and_store_graph(d)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Query the knowledge graph in a RAG application\n",
+ "from langchain.chains import GraphCypherQAChain\n",
+ "\n",
+ "graph.refresh_schema()\n",
+ "\n",
+ "cypher_chain = GraphCypherQAChain.from_llm(\n",
+ " graph=graph,\n",
+ " cypher_llm=ChatOpenAI(temperature=0, model=\"gpt-4\"),\n",
+ " qa_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-16k\"),\n",
+ " # validate_cypher=True, # Validate relationship directions\n",
+ " verbose=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+ "Generated Cypher:\n",
+ "\u001b[32;1m\u001b[1;3mMATCH (t:Treatment {name: \"Induction Chemotherapy\"})-[:CONTROLS]->(mc) RETURN mc.name\u001b[0m\n",
+ "Full Context:\n",
+ "\u001b[32;1m\u001b[1;3m[{'mc.name': 'Malignant Lymphomas'}, {'mc.name': 'Head And Neck Squamous Cell Carcinomas'}, {'mc.name': 'Malignant Lymphomas'}, {'mc.name': 'Head And Neck Squamous Cell Carcinomas'}]\u001b[0m\n",
+ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+ "\n",
+ "\u001b[1m> Finished chain.\u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'query': 'What does Induction Chemotherapy control?',\n",
+ " 'result': 'Induction Chemotherapy controls Malignant Lymphomas and Head And Neck Squamous Cell Carcinomas.'}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cypher_chain.invoke({\"query\": \"What does Induction Chemotherapy control?\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "my_project_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }