eaglelandsonce committed
Commit 99694bd · 1 Parent(s): 4ecf027

Update utils.py

Files changed (1)
  1. utils.py +6 -12
utils.py CHANGED
@@ -3,7 +3,7 @@ from langchain.embeddings import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
 
 
-# loading PDF, DOCX and TXT files as LangChain Documents
+# stack up loading methods using elif statements to load PDF, DOCX, TXT, and CSV files into LangChain Documents
 def load_document(file):
     import os
     name, extension = os.path.splitext(file)
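Most of load_document sits outside the diff context, so the elif dispatch the new comment describes is not visible here. A minimal sketch of what such a body typically looks like with LangChain's document loaders; the specific loader classes and the CSV branch are assumptions based on the comment, not part of this commit:

import os

def load_document(file):
    name, extension = os.path.splitext(file)
    # one branch per supported extension, stacked with elif
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file)
    elif extension == '.csv':
        from langchain.document_loaders import CSVLoader
        loader = CSVLoader(file)
    else:
        print(f'Document format {extension} is not supported!')
        return None
    data = loader.load()  # returns a list of LangChain Documents
    return data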
@@ -30,7 +30,7 @@ def load_document(file):
     return data
 
 
-# splitting data in chunks
+# chunk your data for embedding
 def chunk_data(data, chunk_size=256, chunk_overlap=20):
     from langchain.text_splitter import RecursiveCharacterTextSplitter
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
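The splitter recursively breaks text on paragraph, sentence, and word boundaries until each piece fits within chunk_size characters, keeping chunk_overlap characters of carry-over between neighbours. A quick standalone smoke test, with an illustrative sample string:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=20)
# create_documents accepts raw strings, so no file is needed for a quick check
docs = splitter.create_documents(['LangChain splits long text into overlapping chunks. ' * 20])
print(len(docs), repr(docs[0].page_content[:60]))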
@@ -38,33 +38,27 @@ def chunk_data(data, chunk_size=256, chunk_overlap=20):
     return chunks
 
 
-# create embeddings using OpenAIEmbeddings() and save them in a Chroma vector store
+# using OpenAIEmbeddings(), create your embeddings and save them to the Chroma vector store
 def create_embeddings(chunks):
     embeddings = OpenAIEmbeddings()
     vector_store = Chroma.from_documents(chunks, embeddings)
-
-    # if you want to use a specific directory for chromadb
-    # vector_store = Chroma.from_documents(chunks, embeddings, persist_directory='./mychroma_db')
     return vector_store
 
-
+# here is where you ask your question; we use a combination of RetrievalQA and ChatOpenAI, but this is not the only way to do it
 def ask_and_get_answer(vector_store, q, k=3):
     from langchain.chains import RetrievalQA
     from langchain.chat_models import ChatOpenAI
-
+    # choose gpt-3.5-turbo (the default model) and set the temperature to 1 for more varied answers
     llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
     retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
     chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
-
     answer = chain.run(q)
     return answer
 
 
-# calculate embedding cost using tiktoken
+# return the embedding cost (using tiktoken)
 def calculate_embedding_cost(texts):
     import tiktoken
     enc = tiktoken.encoding_for_model('text-embedding-ada-002')
     total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
-    # print(f'Total Tokens: {total_tokens}')
-    # print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
     return total_tokens, total_tokens / 1000 * 0.0004
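The commit also drops the commented-out persist_directory variant from create_embeddings. For reference, a persistent Chroma store under this same legacy LangChain API would look roughly like the sketch below; the './mychroma_db' path comes from the removed comment, and chunks is assumed to be the output of chunk_data:

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embeddings = OpenAIEmbeddings()
# write the index to disk instead of keeping it only in memory
vector_store = Chroma.from_documents(chunks, embeddings, persist_directory='./mychroma_db')
vector_store.persist()  # older Chroma versions need an explicit flush

# later, reopen the saved index without re-embedding anything
vector_store = Chroma(persist_directory='./mychroma_db', embedding_function=embeddings)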
 
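ask_and_get_answer wires the retriever into a RetrievalQA chain with chain_type="stuff", meaning the top k chunks are pasted wholesale into the prompt; the small 256-character chunks and k=3 keep that within the model's context window. A usage sketch, assuming OPENAI_API_KEY is already set in the environment and with an illustrative question string:

vector_store = create_embeddings(chunks)  # chunks from chunk_data
answer = ask_and_get_answer(vector_store, 'What is this document about?', k=3)
print(answer)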
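calculate_embedding_cost charges total_tokens / 1000 * 0.0004, i.e. $0.0004 per 1,000 tokens, the text-embedding-ada-002 price this code was written against; 100,000 tokens would therefore cost 100 * $0.0004 = $0.04. A standalone check of the token-counting step:

import tiktoken

enc = tiktoken.encoding_for_model('text-embedding-ada-002')
n = len(enc.encode('How many tokens does this sentence use?'))
print(n, f'{n / 1000 * 0.0004:.6f} USD')  # token count and estimated embedding cost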
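Taken together, the four helpers form a small retrieval-augmented QA pipeline. One possible call sequence, with a hypothetical input file and assuming the langchain, chromadb, pypdf, and tiktoken packages plus an OpenAI API key are available:

data = load_document('files/handbook.pdf')  # hypothetical file name
chunks = chunk_data(data, chunk_size=256, chunk_overlap=20)

tokens, cost = calculate_embedding_cost(chunks)
print(f'{tokens} tokens, roughly ${cost:.4f} to embed')

vector_store = create_embeddings(chunks)
print(ask_and_get_answer(vector_store, 'Summarize the main topic.', k=3))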