File size: 17,588 Bytes
098536f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "import os\n",
    "import getpass\n",
    "import openai\n",
    "import logging\n",
    "import sys\n",
    "from llama_index import SimpleDirectoryReader, SummaryIndex, ServiceContext\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llms import OpenAI\n",
    "from llama_index.callbacks import LlamaDebugHandler, CallbackManager\n",
    "\n",
    "llm = OpenAI(\"gpt-3.5-turbo\")\n",
    "\n",
    "callback_manager = CallbackManager([LlamaDebugHandler()])\n",
    "\n",
    "service_context = ServiceContext.from_defaults(\n",
    "    llm=llm, callback_manager=callback_manager, chunk_size=256\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 4 docs\n"
     ]
    }
   ],
   "source": [
    "required_exts = [\".txt\"]\n",
    "\n",
    "reader = SimpleDirectoryReader(\n",
    "    input_dir=\"../data\",\n",
    "    required_exts=required_exts,\n",
    "    recursive=True,\n",
    "    filename_as_id=True\n",
    ")\n",
    "\n",
    "docs = reader.load_data()\n",
    "print(f\"Loaded {len(docs)} docs\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metadata Filters + Auto-Retrieval\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.vector_stores import ChromaVectorStore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
     ]
    }
   ],
   "source": [
    "import chromadb\n",
    "from llama_index.storage.storage_context import StorageContext\n",
    "\n",
    "db = chromadb.PersistentClient(path=\"../chroma_db\")\n",
    "chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.066032 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.063786 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.335255 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.430667 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.39471 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.341174 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.333922 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.371205 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.655165 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.534313 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.513138 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.396431 seconds\n",
      "**********\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "type() takes 1 or 3 arguments",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[49], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mtype\u001b[39;49m(vector_index \u001b[39m=\u001b[39;49m VectorStoreIndex\u001b[39m.\u001b[39;49mfrom_documents([docs[\u001b[39m0\u001b[39;49m]], \n\u001b[1;32m      2\u001b[0m                                                 service_context\u001b[39m=\u001b[39;49mservice_context\n\u001b[1;32m      3\u001b[0m ))\n",
      "\u001b[0;31mTypeError\u001b[0m: type() takes 1 or 3 arguments"
     ]
    }
   ],
   "source": [
    "vector_index = VectorStoreIndex.from_documents([docs[0]], \n",
    "                                                service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "llama_index.indices.vector_store.base.VectorStoreIndex"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(vector_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.078989 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.075335 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.272066 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.344792 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.351537 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.247337 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.351224 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.23581 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.309488 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.25491 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.192247 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.23071 seconds\n",
      "**********\n",
      "**Summary for final-hh.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.\n",
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.235509 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.231563 seconds\n",
      "    |_CBEventType.EMBEDDING ->  1.126853 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.306191 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.451583 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.415356 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.435105 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.37879 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.280844 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.24501 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.300654 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.496476 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.44205 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.52554 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.853941 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.394818 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.338529 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.319579 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.52271 seconds\n",
      "**********\n",
      "**Summary for hh100.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.\n",
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.052081 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.050151 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.434005 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.417429 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.359151 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.347035 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.342142 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.277749 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.348186 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.2124 seconds\n",
      "**********\n",
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.050094 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.048018 seconds\n",
      "**********\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
      "**********\n",
      "Trace: query\n",
      "    |_CBEventType.QUERY ->  14.970623 seconds\n",
      "      |_CBEventType.RETRIEVE ->  0.002261 seconds\n",
      "      |_CBEventType.SYNTHESIZE ->  14.968218 seconds\n",
      "        |_CBEventType.TEMPLATING ->  2.5e-05 seconds\n",
      "        |_CBEventType.LLM ->  5.465885 seconds\n",
      "        |_CBEventType.TEMPLATING ->  8e-06 seconds\n",
      "        |_CBEventType.LLM ->  4.726724 seconds\n",
      "        |_CBEventType.TEMPLATING ->  6e-06 seconds\n",
      "        |_CBEventType.LLM ->  4.548116 seconds\n",
      "        |_CBEventType.TEMPLATING ->  5e-06 seconds\n",
      "        |_CBEventType.LLM ->  5.774904 seconds\n",
      "        |_CBEventType.TEMPLATING ->  6e-06 seconds\n",
      "        |_CBEventType.LLM ->  4.269908 seconds\n",
      "        |_CBEventType.TEMPLATING ->  6e-06 seconds\n",
      "        |_CBEventType.LLM ->  6.508074 seconds\n",
      "        |_CBEventType.TEMPLATING ->  3.6e-05 seconds\n",
      "        |_CBEventType.LLM ->  8.223118 seconds\n",
      "**********\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "write() argument must be str, not Document",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[54], line 32\u001b[0m\n\u001b[1;32m     30\u001b[0m     Path(\u001b[39m\"\u001b[39m\u001b[39msummaries\u001b[39m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mmkdir(exist_ok\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m     31\u001b[0m     \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(out_path, \u001b[39m\"\u001b[39m\u001b[39mw\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n\u001b[0;32m---> 32\u001b[0m         fp\u001b[39m.\u001b[39mwrite(doc)\n\u001b[1;32m     33\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m     34\u001b[0m     \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(out_path, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n",
      "\u001b[0;31mTypeError\u001b[0m: write() argument must be str, not Document"
     ]
    }
   ],
   "source": [
    "# define top-level nodes and vector retrievers\n",
    "nodes = []\n",
    "vector_query_engines = {}\n",
    "vector_retrievers = {}\n",
    "\n",
    "for doc in docs:\n",
    "    # build vector index\n",
    "    doc_id = doc.id_.split(\"/\")[-1]\n",
    "    vector_index = VectorStoreIndex.from_documents([doc], \n",
    "                                                   service_context=service_context\n",
    "    )\n",
    "    # define query engines\n",
    "    vector_query_engine = vector_index.as_query_engine()\n",
    "    vector_query_engines[doc_id] = vector_query_engine\n",
    "    vector_retrievers[doc_id] = vector_index.as_retriever()\n",
    "\n",
    "    # save summaries\n",
    "    \n",
    "    out_path = Path(\"summaries\") / f\"{doc_id}.txt\"\n",
    "    if not out_path.exists():\n",
    "        # use LLM-generated summary\n",
    "        summary_index = SummaryIndex.from_documents([doc], \n",
    "                                                    service_context=service_context\n",
    "        )\n",
    "\n",
    "        summarizer = summary_index.as_query_engine(response_mode=\"tree_summarize\")\n",
    "        response = await summarizer.aquery(f\"Give me a summary of {doc_id}\")\n",
    "\n",
    "        doc_summary = response.response\n",
    "        Path(\"summaries\").mkdir(exist_ok=True)\n",
    "        with open(out_path, \"w\") as fp:\n",
    "            fp.write(doc)\n",
    "    else:\n",
    "        with open(out_path, \"r\") as fp:\n",
    "            doc = fp.read()\n",
    "\n",
    "    print(f\"**Summary for {doc_id}: {doc_summary}\")\n",
    "    node = IndexNode(text=doc_summary, index_id=doc)\n",
    "    nodes.append(node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimakerspace",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}