In [3]:
import nest_asyncio
nest_asyncio.apply()

import getpass
import logging
import os
import sys
from pathlib import Path

import openai
from llama_index import SimpleDirectoryReader, SummaryIndex, ServiceContext
from llama_index.schema import IndexNode

# Route INFO-and-above log records to stdout through a single root handler.
# basicConfig() already installs a stdout StreamHandler, so attaching another
# one by hand makes every record print twice. force=True replaces any handlers
# left over from an earlier run of this cell, keeping re-runs idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [4]:
# Prompt for the key only when it is not already set, so re-running this cell
# (or running the notebook with the key exported in the environment) does not
# block on an interactive prompt. getpass keeps the key out of the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key:")

In [5]:
# Logging is already configured in the first cell; calling addHandler() again
# here would stack one more stdout handler on the root logger and repeat every
# log line. force=True resets the root logger to exactly one stdout handler,
# so this cell is safe to run any number of times.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [8]:
from llama_index.llms import OpenAI
from llama_index.callbacks import LlamaDebugHandler, CallbackManager

# Callback manager carrying a single LlamaDebugHandler.
callback_manager = CallbackManager([LlamaDebugHandler()])

# LLM used by the service context below.
llm = OpenAI(model="gpt-3.5-turbo")

# Service context passed to every index constructor later in the notebook.
service_context = ServiceContext.from_defaults(
    chunk_size=256,
    llm=llm,
    callback_manager=callback_manager,
)

In [9]:
# Only plain-text files are loaded.
required_exts = [".txt"]

# Walk ../data recursively; filename_as_id=True is what the later
# per-document loop relies on when it derives doc_id from doc.id_.
reader = SimpleDirectoryReader(
    input_dir="../data",
    recursive=True,
    required_exts=required_exts,
    filename_as_id=True,
)

docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

Loaded 4 docs


# Metadata Filters + Auto-Retrieval



In [13]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import ChromaVectorStore

In [17]:
import chromadb
from llama_index.storage.storage_context import StorageContext

# On-disk Chroma client rooted at ../chroma_db.
db = chromadb.PersistentClient(path="../chroma_db")

# Reuse the "quickstart" collection if it exists, otherwise create it.
chroma_collection = db.get_or_create_collection(name="quickstart")

# Wrap the collection as a LlamaIndex vector store and storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [49]:
# Build a vector index over the first loaded document only.
# NOTE(review): storage_context (the Chroma-backed one defined earlier) is not
# passed here, so this index presumably uses the default store - confirm intent.
vector_index = VectorStoreIndex.from_documents(
    documents=[docs[0]],
    service_context=service_context,
)

**********
Trace: index_construction
 |_CBEventType.NODE_PARSING -> 0.066032 seconds
 |_CBEventType.CHUNKING -> 0.063786 seconds
 |_CBEventType.EMBEDDING -> 0.335255 seconds
 |_CBEventType.EMBEDDING -> 0.430667 seconds
 |_CBEventType.EMBEDDING -> 0.39471 seconds
 |_CBEventType.EMBEDDING -> 0.341174 seconds
 |_CBEventType.EMBEDDING -> 0.333922 seconds
 |_CBEventType.EMBEDDING -> 0.371205 seconds
 |_CBEventType.EMBEDDING -> 0.655165 seconds
 |_CBEventType.EMBEDDING -> 0.534313 seconds
 |_CBEventType.EMBEDDING -> 0.513138 seconds
 |_CBEventType.EMBEDDING -> 0.396431 seconds
**********


TypeError: type() takes 1 or 3 arguments

In [50]:
# Display the concrete class of the index object built above.
type(vector_index)

llama_index.indices.vector_store.base.VectorStoreIndex

In [54]:
# define top-level nodes and vector retrievers
nodes = []
vector_query_engines = {}
vector_retrievers = {}

for doc in docs:
 # build vector index
 doc_id = doc.id_.split("/")[-1]
 vector_index = VectorStoreIndex.from_documents([doc], 
 service_context=service_context
 )
 # define query engines
 vector_query_engine = vector_index.as_query_engine()
 vector_query_engines[doc_id] = vector_query_engine
 vector_retrievers[doc_id] = vector_index.as_retriever()

 # save summaries
 
 out_path = Path("summaries") / f"{doc_id}.txt"
 if not out_path.exists():
 # use LLM-generated summary
 summary_index = SummaryIndex.from_documents([doc], 
 service_context=service_context
 )

 summarizer = summary_index.as_query_engine(response_mode="tree_summarize")
 response = await summarizer.aquery(f"Give me a summary of {doc_id}")

 doc_summary = response.response
 Path("summaries").mkdir(exist_ok=True)
 with open(out_path, "w") as fp:
 fp.write(doc)
 else:
 with open(out_path, "r") as fp:
 doc = fp.read()

 print(f"**Summary for {doc_id}: {doc_summary}")
 node = IndexNode(text=doc_summary, index_id=doc)
 nodes.append(node)

**********
Trace: index_construction
 |_CBEventType.NODE_PARSING -> 0.078989 seconds
 |_CBEventType.CHUNKING -> 0.075335 seconds
 |_CBEventType.EMBEDDING -> 0.272066 seconds
 |_CBEventType.EMBEDDING -> 0.344792 seconds
 |_CBEventType.EMBEDDING -> 0.351537 seconds
 |_CBEventType.EMBEDDING -> 0.247337 seconds
 |_CBEventType.EMBEDDING -> 0.351224 seconds
 |_CBEventType.EMBEDDING -> 0.23581 seconds
 |_CBEventType.EMBEDDING -> 0.309488 seconds
 |_CBEventType.EMBEDDING -> 0.25491 seconds
 |_CBEventType.EMBEDDING -> 0.192247 seconds
 |_CBEventType.EMBEDDING -> 0.23071 seconds
**********
**Summary for final-hh.txt: I'm sorry, but I cannot provide a summary of hh100.txt based on the given information. The provided context information does not contain any reference to hh100.txt or its content.
**********
Trace: index_construction
 |_CBEventType.NODE_PARSING -> 0.235509 seconds
 |_CBEventType.CHUNKING -> 0.231563 seconds
 |_CBEventType.EMBEDDING -> 1.126853 seconds
 |_CBEventType.EMBEDDING -> 0.3

TypeError: write() argument must be str, not Document