Spaces:
Runtime error
Runtime error
File size: 17,588 Bytes
098536f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import nest_asyncio\n",
"nest_asyncio.apply()\n",
"\n",
"import os\n",
"import getpass\n",
"import openai\n",
"import logging\n",
"import sys\n",
"from llama_index import SimpleDirectoryReader, SummaryIndex, ServiceContext\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms import OpenAI\n",
"from llama_index.callbacks import LlamaDebugHandler, CallbackManager\n",
"\n",
"llm = OpenAI(\"gpt-3.5-turbo\")\n",
"\n",
"callback_manager = CallbackManager([LlamaDebugHandler()])\n",
"\n",
"service_context = ServiceContext.from_defaults(\n",
" llm=llm, callback_manager=callback_manager, chunk_size=256\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 4 docs\n"
]
}
],
"source": [
"required_exts = [\".txt\"]\n",
"\n",
"reader = SimpleDirectoryReader(\n",
" input_dir=\"../data\",\n",
" required_exts=required_exts,\n",
" recursive=True,\n",
" filename_as_id=True\n",
")\n",
"\n",
"docs = reader.load_data()\n",
"print(f\"Loaded {len(docs)} docs\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metadata Filters + Auto-Retrieval\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from llama_index import VectorStoreIndex, SimpleDirectoryReader\n",
"from llama_index.vector_stores import ChromaVectorStore"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
"Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
"Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
]
}
],
"source": [
"import chromadb\n",
"from llama_index.storage.storage_context import StorageContext\n",
"\n",
"db = chromadb.PersistentClient(path=\"../chroma_db\")\n",
"chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"**********\n",
"Trace: index_construction\n",
" |_CBEventType.NODE_PARSING -> 0.066032 seconds\n",
" |_CBEventType.CHUNKING -> 0.063786 seconds\n",
" |_CBEventType.EMBEDDING -> 0.335255 seconds\n",
" |_CBEventType.EMBEDDING -> 0.430667 seconds\n",
" |_CBEventType.EMBEDDING -> 0.39471 seconds\n",
" |_CBEventType.EMBEDDING -> 0.341174 seconds\n",
" |_CBEventType.EMBEDDING -> 0.333922 seconds\n",
" |_CBEventType.EMBEDDING -> 0.371205 seconds\n",
" |_CBEventType.EMBEDDING -> 0.655165 seconds\n",
" |_CBEventType.EMBEDDING -> 0.534313 seconds\n",
" |_CBEventType.EMBEDDING -> 0.513138 seconds\n",
" |_CBEventType.EMBEDDING -> 0.396431 seconds\n",
"**********\n"
]
},
{
"ename": "TypeError",
"evalue": "type() takes 1 or 3 arguments",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[49], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mtype\u001b[39;49m(vector_index \u001b[39m=\u001b[39;49m VectorStoreIndex\u001b[39m.\u001b[39;49mfrom_documents([docs[\u001b[39m0\u001b[39;49m]], \n\u001b[1;32m 2\u001b[0m service_context\u001b[39m=\u001b[39;49mservice_context\n\u001b[1;32m 3\u001b[0m ))\n",
"\u001b[0;31mTypeError\u001b[0m: type() takes 1 or 3 arguments"
]
}
],
"source": [
"vector_index = VectorStoreIndex.from_documents([docs[0]], \n",
" service_context=service_context\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"llama_index.indices.vector_store.base.VectorStoreIndex"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(vector_index)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"**********\n",
"Trace: index_construction\n",
" |_CBEventType.NODE_PARSING -> 0.078989 seconds\n",
" |_CBEventType.CHUNKING -> 0.075335 seconds\n",
" |_CBEventType.EMBEDDING -> 0.272066 seconds\n",
" |_CBEventType.EMBEDDING -> 0.344792 seconds\n",
" |_CBEventType.EMBEDDING -> 0.351537 seconds\n",
" |_CBEventType.EMBEDDING -> 0.247337 seconds\n",
" |_CBEventType.EMBEDDING -> 0.351224 seconds\n",
" |_CBEventType.EMBEDDING -> 0.23581 seconds\n",
" |_CBEventType.EMBEDDING -> 0.309488 seconds\n",
" |_CBEventType.EMBEDDING -> 0.25491 seconds\n",
" |_CBEventType.EMBEDDING -> 0.192247 seconds\n",
" |_CBEventType.EMBEDDING -> 0.23071 seconds\n",
"**********\n",
"**Summary for final-hh.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.\n",
"**********\n",
"Trace: index_construction\n",
" |_CBEventType.NODE_PARSING -> 0.235509 seconds\n",
" |_CBEventType.CHUNKING -> 0.231563 seconds\n",
" |_CBEventType.EMBEDDING -> 1.126853 seconds\n",
" |_CBEventType.EMBEDDING -> 0.306191 seconds\n",
" |_CBEventType.EMBEDDING -> 0.451583 seconds\n",
" |_CBEventType.EMBEDDING -> 0.415356 seconds\n",
" |_CBEventType.EMBEDDING -> 0.435105 seconds\n",
" |_CBEventType.EMBEDDING -> 0.37879 seconds\n",
" |_CBEventType.EMBEDDING -> 0.280844 seconds\n",
" |_CBEventType.EMBEDDING -> 0.24501 seconds\n",
" |_CBEventType.EMBEDDING -> 0.300654 seconds\n",
" |_CBEventType.EMBEDDING -> 0.496476 seconds\n",
" |_CBEventType.EMBEDDING -> 0.44205 seconds\n",
" |_CBEventType.EMBEDDING -> 0.52554 seconds\n",
" |_CBEventType.EMBEDDING -> 0.853941 seconds\n",
" |_CBEventType.EMBEDDING -> 0.394818 seconds\n",
" |_CBEventType.EMBEDDING -> 0.338529 seconds\n",
" |_CBEventType.EMBEDDING -> 0.319579 seconds\n",
" |_CBEventType.EMBEDDING -> 0.52271 seconds\n",
"**********\n",
"**Summary for hh100.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.\n",
"**********\n",
"Trace: index_construction\n",
" |_CBEventType.NODE_PARSING -> 0.052081 seconds\n",
" |_CBEventType.CHUNKING -> 0.050151 seconds\n",
" |_CBEventType.EMBEDDING -> 0.434005 seconds\n",
" |_CBEventType.EMBEDDING -> 0.417429 seconds\n",
" |_CBEventType.EMBEDDING -> 0.359151 seconds\n",
" |_CBEventType.EMBEDDING -> 0.347035 seconds\n",
" |_CBEventType.EMBEDDING -> 0.342142 seconds\n",
" |_CBEventType.EMBEDDING -> 0.277749 seconds\n",
" |_CBEventType.EMBEDDING -> 0.348186 seconds\n",
" |_CBEventType.EMBEDDING -> 0.2124 seconds\n",
"**********\n",
"**********\n",
"Trace: index_construction\n",
" |_CBEventType.NODE_PARSING -> 0.050094 seconds\n",
" |_CBEventType.CHUNKING -> 0.048018 seconds\n",
"**********\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
"message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
"**********\n",
"Trace: query\n",
" |_CBEventType.QUERY -> 14.970623 seconds\n",
" |_CBEventType.RETRIEVE -> 0.002261 seconds\n",
" |_CBEventType.SYNTHESIZE -> 14.968218 seconds\n",
" |_CBEventType.TEMPLATING -> 2.5e-05 seconds\n",
" |_CBEventType.LLM -> 5.465885 seconds\n",
" |_CBEventType.TEMPLATING -> 8e-06 seconds\n",
" |_CBEventType.LLM -> 4.726724 seconds\n",
" |_CBEventType.TEMPLATING -> 6e-06 seconds\n",
" |_CBEventType.LLM -> 4.548116 seconds\n",
" |_CBEventType.TEMPLATING -> 5e-06 seconds\n",
" |_CBEventType.LLM -> 5.774904 seconds\n",
" |_CBEventType.TEMPLATING -> 6e-06 seconds\n",
" |_CBEventType.LLM -> 4.269908 seconds\n",
" |_CBEventType.TEMPLATING -> 6e-06 seconds\n",
" |_CBEventType.LLM -> 6.508074 seconds\n",
" |_CBEventType.TEMPLATING -> 3.6e-05 seconds\n",
" |_CBEventType.LLM -> 8.223118 seconds\n",
"**********\n"
]
},
{
"ename": "TypeError",
"evalue": "write() argument must be str, not Document",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[54], line 32\u001b[0m\n\u001b[1;32m 30\u001b[0m Path(\u001b[39m\"\u001b[39m\u001b[39msummaries\u001b[39m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mmkdir(exist_ok\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 31\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(out_path, \u001b[39m\"\u001b[39m\u001b[39mw\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n\u001b[0;32m---> 32\u001b[0m fp\u001b[39m.\u001b[39mwrite(doc)\n\u001b[1;32m 33\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(out_path, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n",
"\u001b[0;31mTypeError\u001b[0m: write() argument must be str, not Document"
]
}
],
"source": [
"# define top-level nodes and vector retrievers\n",
"nodes = []\n",
"vector_query_engines = {}\n",
"vector_retrievers = {}\n",
"\n",
"for doc in docs:\n",
" # build vector index\n",
" doc_id = doc.id_.split(\"/\")[-1]\n",
" vector_index = VectorStoreIndex.from_documents([doc], \n",
" service_context=service_context\n",
" )\n",
" # define query engines\n",
" vector_query_engine = vector_index.as_query_engine()\n",
" vector_query_engines[doc_id] = vector_query_engine\n",
" vector_retrievers[doc_id] = vector_index.as_retriever()\n",
"\n",
" # save summaries\n",
" \n",
" out_path = Path(\"summaries\") / f\"{doc_id}.txt\"\n",
" if not out_path.exists():\n",
" # use LLM-generated summary\n",
" summary_index = SummaryIndex.from_documents([doc], \n",
" service_context=service_context\n",
" )\n",
"\n",
" summarizer = summary_index.as_query_engine(response_mode=\"tree_summarize\")\n",
" response = await summarizer.aquery(f\"Give me a summary of {doc_id}\")\n",
"\n",
" doc_summary = response.response\n",
" Path(\"summaries\").mkdir(exist_ok=True)\n",
" with open(out_path, \"w\") as fp:\n",
" fp.write(doc)\n",
" else:\n",
" with open(out_path, \"r\") as fp:\n",
" doc = fp.read()\n",
"\n",
" print(f\"**Summary for {doc_id}: {doc_summary}\")\n",
" node = IndexNode(text=doc_summary, index_id=doc)\n",
" nodes.append(node)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "aimakerspace",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
|