Spaces:

tdeshane
/

artists-of-data-science-chainlit

Runtime error

File size: 17,588 Bytes

098536f

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "import getpass\n",
    "import logging\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import openai\n",
    "from llama_index import SimpleDirectoryReader, SummaryIndex, ServiceContext\n",
    "from llama_index.schema import IndexNode\n",
    "\n",
    "# Send INFO-level logs to stdout through exactly one root handler.\n",
    "# force=True replaces any handlers already installed on the root logger, so\n",
    "# re-running this cell never stacks handlers and each record prints once.\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt only when the key is not already set, so Restart & Run All also works\n",
    "# where OPENAI_API_KEY has been exported ahead of time.\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basicConfig does nothing once the root logger has handlers, and every extra\n",
    "# addHandler call repeats each log line. force=True resets the root logger to a\n",
    "# single stdout handler, which makes this cell safe to re-run.\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.callbacks import CallbackManager, LlamaDebugHandler\n",
    "from llama_index.llms import OpenAI\n",
    "\n",
    "# LLM handed to the service context below.\n",
    "llm = OpenAI(\"gpt-3.5-turbo\")\n",
    "\n",
    "# Route callback events through a single LlamaDebugHandler.\n",
    "debug_handlers = [LlamaDebugHandler()]\n",
    "callback_manager = CallbackManager(debug_handlers)\n",
    "\n",
    "# Bundle the LLM, the callbacks and the chunk size into one service context.\n",
    "service_context = ServiceContext.from_defaults(\n",
    "    chunk_size=256,\n",
    "    callback_manager=callback_manager,\n",
    "    llm=llm,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 4 docs\n"
     ]
    }
   ],
   "source": [
    "# File extensions the reader is allowed to load.\n",
    "required_exts = [\".txt\"]\n",
    "\n",
    "# Read ../data recursively, restricted to the extensions above;\n",
    "# filename_as_id=True asks the reader to derive document ids from file names.\n",
    "reader = SimpleDirectoryReader(\n",
    "    input_dir=\"../data\",\n",
    "    recursive=True,\n",
    "    required_exts=required_exts,\n",
    "    filename_as_id=True,\n",
    ")\n",
    "docs = reader.load_data()\n",
    "\n",
    "print(f\"Loaded {len(docs)} docs\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metadata Filters + Auto-Retrieval\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.vector_stores import ChromaVectorStore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
     ]
    }
   ],
   "source": [
    "import chromadb\n",
    "from llama_index.storage.storage_context import StorageContext\n",
    "\n",
    "# Open the persistent Chroma client at ../chroma_db and get (or create) the\n",
    "# \"quickstart\" collection.\n",
    "db = chromadb.PersistentClient(path=\"../chroma_db\")\n",
    "chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "\n",
    "# Wrap the collection as a vector store and build a storage context around it.\n",
    "# NOTE(review): the index-building cells visible in this notebook do not pass\n",
    "# storage_context, so this store looks unused -- confirm whether that is intended.\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.066032 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.063786 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.335255 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.430667 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.39471 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.341174 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.333922 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.371205 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.655165 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.534313 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.513138 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.396431 seconds\n",
      "**********\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "type() takes 1 or 3 arguments",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[49], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mtype\u001b[39;49m(vector_index \u001b[39m=\u001b[39;49m VectorStoreIndex\u001b[39m.\u001b[39;49mfrom_documents([docs[\u001b[39m0\u001b[39;49m]], \n\u001b[1;32m      2\u001b[0m                                                 service_context\u001b[39m=\u001b[39;49mservice_context\n\u001b[1;32m      3\u001b[0m ))\n",
      "\u001b[0;31mTypeError\u001b[0m: type() takes 1 or 3 arguments"
     ]
    }
   ],
   "source": [
    "# Build a vector index over the first loaded document only.\n",
    "first_doc = docs[0]\n",
    "vector_index = VectorStoreIndex.from_documents(\n",
    "    [first_doc],\n",
    "    service_context=service_context,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "llama_index.indices.vector_store.base.VectorStoreIndex"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(vector_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.078989 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.075335 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.272066 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.344792 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.351537 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.247337 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.351224 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.23581 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.309488 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.25491 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.192247 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.23071 seconds\n",
      "**********\n",
      "**Summary for final-hh.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.\n",
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.235509 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.231563 seconds\n",
      "    |_CBEventType.EMBEDDING ->  1.126853 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.306191 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.451583 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.415356 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.435105 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.37879 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.280844 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.24501 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.300654 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.496476 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.44205 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.52554 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.853941 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.394818 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.338529 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.319579 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.52271 seconds\n",
      "**********\n",
      "**Summary for hh100.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.\n",
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.052081 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.050151 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.434005 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.417429 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.359151 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.347035 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.342142 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.277749 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.348186 seconds\n",
      "    |_CBEventType.EMBEDDING ->  0.2124 seconds\n",
      "**********\n",
      "**********\n",
      "Trace: index_construction\n",
      "    |_CBEventType.NODE_PARSING ->  0.050094 seconds\n",
      "      |_CBEventType.CHUNKING ->  0.048018 seconds\n",
      "**********\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3733 request_id=432e730b60eb67a37e6a053607aedb6d response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4231 request_id=899ee7790bce0fe1146820fe33d03cd4 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4365 request_id=2d7fdc83b169954947616f163ef18112 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5050 request_id=8b9dc3841570291809af3554085c4768 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5198 request_id=9d964787588a214590c722b2c4328ccb response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=5899 request_id=a76ac87225e96b507c814d3679e918a7 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=7810 request_id=f42da8b58a13fc6332f00424a2e3b20f response_code=200\n",
      "**********\n",
      "Trace: query\n",
      "    |_CBEventType.QUERY ->  14.970623 seconds\n",
      "      |_CBEventType.RETRIEVE ->  0.002261 seconds\n",
      "      |_CBEventType.SYNTHESIZE ->  14.968218 seconds\n",
      "        |_CBEventType.TEMPLATING ->  2.5e-05 seconds\n",
      "        |_CBEventType.LLM ->  5.465885 seconds\n",
      "        |_CBEventType.TEMPLATING ->  8e-06 seconds\n",
      "        |_CBEventType.LLM ->  4.726724 seconds\n",
      "        |_CBEventType.TEMPLATING ->  6e-06 seconds\n",
      "        |_CBEventType.LLM ->  4.548116 seconds\n",
      "        |_CBEventType.TEMPLATING ->  5e-06 seconds\n",
      "        |_CBEventType.LLM ->  5.774904 seconds\n",
      "        |_CBEventType.TEMPLATING ->  6e-06 seconds\n",
      "        |_CBEventType.LLM ->  4.269908 seconds\n",
      "        |_CBEventType.TEMPLATING ->  6e-06 seconds\n",
      "        |_CBEventType.LLM ->  6.508074 seconds\n",
      "        |_CBEventType.TEMPLATING ->  3.6e-05 seconds\n",
      "        |_CBEventType.LLM ->  8.223118 seconds\n",
      "**********\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "write() argument must be str, not Document",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[54], line 32\u001b[0m\n\u001b[1;32m     30\u001b[0m     Path(\u001b[39m\"\u001b[39m\u001b[39msummaries\u001b[39m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mmkdir(exist_ok\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m     31\u001b[0m     \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(out_path, \u001b[39m\"\u001b[39m\u001b[39mw\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n\u001b[0;32m---> 32\u001b[0m         fp\u001b[39m.\u001b[39mwrite(doc)\n\u001b[1;32m     33\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m     34\u001b[0m     \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(out_path, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n",
      "\u001b[0;31mTypeError\u001b[0m: write() argument must be str, not Document"
     ]
    }
   ],
   "source": [
    "# Build one vector index per document, plus a cached LLM summary that becomes\n",
    "# the text of a top-level IndexNode pointing back at that document.\n",
    "nodes = []\n",
    "vector_query_engines = {}\n",
    "vector_retrievers = {}\n",
    "\n",
    "summaries_dir = Path(\"summaries\")\n",
    "\n",
    "for doc in docs:\n",
    "    # keep only the last \"/\"-separated segment of the document id as the key\n",
    "    doc_id = doc.id_.split(\"/\")[-1]\n",
    "\n",
    "    # build vector index\n",
    "    vector_index = VectorStoreIndex.from_documents(\n",
    "        [doc], service_context=service_context\n",
    "    )\n",
    "    # define query engines\n",
    "    vector_query_engines[doc_id] = vector_index.as_query_engine()\n",
    "    vector_retrievers[doc_id] = vector_index.as_retriever()\n",
    "\n",
    "    # reuse the cached summary when present; otherwise generate and cache it\n",
    "    out_path = summaries_dir / f\"{doc_id}.txt\"\n",
    "    if out_path.exists():\n",
    "        with open(out_path, \"r\") as fp:\n",
    "            # read into doc_summary (not doc) so the summary printed and indexed\n",
    "            # below always belongs to the current document\n",
    "            doc_summary = fp.read()\n",
    "    else:\n",
    "        # use LLM-generated summary\n",
    "        summary_index = SummaryIndex.from_documents(\n",
    "            [doc], service_context=service_context\n",
    "        )\n",
    "        summarizer = summary_index.as_query_engine(response_mode=\"tree_summarize\")\n",
    "        # NOTE(review): the stored output shows the model answering that the context\n",
    "        # never mentions the file name -- consider a prompt that does not rely on it.\n",
    "        response = await summarizer.aquery(f\"Give me a summary of {doc_id}\")\n",
    "        doc_summary = response.response\n",
    "\n",
    "        summaries_dir.mkdir(exist_ok=True)\n",
    "        with open(out_path, \"w\") as fp:\n",
    "            # write the summary text; passing the Document object raised TypeError\n",
    "            fp.write(doc_summary)\n",
    "\n",
    "    print(f\"**Summary for {doc_id}: {doc_summary}\")\n",
    "    # index_id is the same key used in vector_query_engines / vector_retrievers\n",
    "    node = IndexNode(text=doc_summary, index_id=doc_id)\n",
    "    nodes.append(node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimakerspace",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}