{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Per-document vector indexes and LLM summaries with LlamaIndex\n",
    "\n",
    "This notebook loads the `.txt` files under `../data`, builds one `VectorStoreIndex` per document, and asks the LLM for a summary of each document. Summaries are cached as text files under `summaries/`, and each one is wrapped in an `IndexNode` keyed by the document's file name.\n",
    "\n",
    "The notebook is meant to run top to bottom on a fresh kernel (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "# Applied before the other imports, as in the original notebook; the summary\n",
    "# loop further down awaits coroutines from inside the running kernel loop.\n",
    "nest_asyncio.apply()\n",
    "\n",
    "import getpass\n",
    "import logging\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import chromadb\n",
    "import openai\n",
    "from llama_index import (\n",
    "    ServiceContext,\n",
    "    SimpleDirectoryReader,\n",
    "    SummaryIndex,\n",
    "    VectorStoreIndex,\n",
    ")\n",
    "from llama_index.callbacks import CallbackManager, LlamaDebugHandler\n",
    "from llama_index.llms import OpenAI\n",
    "from llama_index.schema import IndexNode\n",
    "from llama_index.storage.storage_context import StorageContext\n",
    "from llama_index.vector_stores import ChromaVectorStore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "# Paths are relative to the notebook's working directory.\n",
    "DATA_DIR = Path(\"../data\")\n",
    "CHROMA_DIR = Path(\"../chroma_db\")\n",
    "SUMMARY_DIR = Path(\"summaries\")\n",
    "\n",
    "# Configure logging exactly once. basicConfig already attaches a stdout\n",
    "# handler to the root logger; adding further StreamHandlers on top of it\n",
    "# (as earlier revisions did, in two separate cells) prints every record\n",
    "# several times.\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt only when the key is not already set, so a re-run does not ask\n",
    "# again and the key is never written into the notebook itself.\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI(model=\"gpt-3.5-turbo\")\n",
    "\n",
    "# LlamaDebugHandler prints a timing trace for each indexing / query operation.\n",
    "callback_manager = CallbackManager([LlamaDebugHandler()])\n",
    "\n",
    "service_context = ServiceContext.from_defaults(\n",
    "    llm=llm, callback_manager=callback_manager, chunk_size=256\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "required_exts = [\".txt\"]\n",
    "\n",
    "reader = SimpleDirectoryReader(\n",
    "    input_dir=str(DATA_DIR),\n",
    "    required_exts=required_exts,\n",
    "    recursive=True,\n",
    "    filename_as_id=True,\n",
    ")\n",
    "\n",
    "docs = reader.load_data()\n",
    "print(f\"Loaded {len(docs)} docs\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metadata Filters + Auto-Retrieval\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db = chromadb.PersistentClient(path=str(CHROMA_DIR))\n",
    "chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "# NOTE(review): storage_context is not passed to any index built below, so\n",
    "# those indexes do not use this Chroma collection - confirm whether that\n",
    "# is intended before relying on ../chroma_db being populated.\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: build an index over the first document only.\n",
    "vector_index = VectorStoreIndex.from_documents(\n",
    "    [docs[0]], service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(vector_index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per-document indexes and summaries\n",
    "\n",
    "For every document we build a vector index (query engine + retriever) and obtain a summary. Summaries are expensive LLM calls, so they are cached on disk and only generated on a cache miss."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async def load_or_create_summary(doc, doc_id, summary_dir=SUMMARY_DIR):\n",
    "    \"\"\"Return the summary text for one document, using the on-disk cache.\n",
    "\n",
    "    Args:\n",
    "        doc: the loaded document to summarize.\n",
    "        doc_id: file name of the document; used in the prompt and as the\n",
    "            cache file stem (the cache file is ``<doc_id>.txt``).\n",
    "        summary_dir: directory holding the cached summaries.\n",
    "\n",
    "    Returns:\n",
    "        The summary as a string, read from the cache when a non-empty cache\n",
    "        file exists, otherwise freshly generated and written to the cache.\n",
    "    \"\"\"\n",
    "    out_path = Path(summary_dir) / f\"{doc_id}.txt\"\n",
    "\n",
    "    if out_path.exists():\n",
    "        cached = out_path.read_text(encoding=\"utf-8\")\n",
    "        # An empty file can be left behind when an earlier run failed after\n",
    "        # opening the file but before writing to it; treat that as a miss.\n",
    "        if cached.strip():\n",
    "            return cached\n",
    "\n",
    "    # use LLM-generated summary\n",
    "    summary_index = SummaryIndex.from_documents(\n",
    "        [doc], service_context=service_context\n",
    "    )\n",
    "    summarizer = summary_index.as_query_engine(response_mode=\"tree_summarize\")\n",
    "    # NOTE(review): the prompt refers to the document by file name only; if the\n",
    "    # model answers that it has no information about that file, switch to a\n",
    "    # content-based prompt.\n",
    "    response = await summarizer.aquery(f\"Give me a summary of {doc_id}\")\n",
    "    doc_summary = response.response\n",
    "\n",
    "    Path(summary_dir).mkdir(parents=True, exist_ok=True)\n",
    "    # Write the summary text (a str), not the Document object.\n",
    "    out_path.write_text(doc_summary, encoding=\"utf-8\")\n",
    "    return doc_summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define top-level nodes and vector retrievers\n",
    "nodes = []\n",
    "vector_query_engines = {}\n",
    "vector_retrievers = {}\n",
    "\n",
    "for doc in docs:\n",
    "    # Keep only the last path component of the document id; Path handles\n",
    "    # the platform's separator instead of assuming \"/\".\n",
    "    doc_id = Path(doc.id_).name\n",
    "\n",
    "    # build vector index\n",
    "    vector_index = VectorStoreIndex.from_documents(\n",
    "        [doc], service_context=service_context\n",
    "    )\n",
    "    # define query engines\n",
    "    vector_query_engines[doc_id] = vector_index.as_query_engine()\n",
    "    vector_retrievers[doc_id] = vector_index.as_retriever()\n",
    "\n",
    "    # Top-level await works in the IPython kernel.\n",
    "    doc_summary = await load_or_create_summary(doc, doc_id)\n",
    "\n",
    "    print(f\"**Summary for {doc_id}: {doc_summary}\")\n",
    "    # index_id is the string key shared with vector_retrievers /\n",
    "    # vector_query_engines, not the Document itself.\n",
    "    nodes.append(IndexNode(text=doc_summary, index_id=doc_id))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimakerspace",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}