# %%
import nltk
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
from chromadb.config import Settings
import chromadb
from chromadb.utils import embedding_functions
from hashlib import sha256
import cloudpickle
import logging
import os
from load_model import load_embedding, load_vectorstore
import torch
import re
import pathlib
import tempfile

current_path = str(pathlib.Path(__file__).parent.resolve())
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
nltk.download('punkt')

persist_directory = current_path + "/VectorStore"
logger = logging.getLogger()
# %%
def create_collection(collection_name, model_name, client):
    """Not used atm"""
    if not torch.cuda.is_available():
        device = "cpu"
    else:
        device = "cuda"
    ef = embedding_functions.InstructorEmbeddingFunction(
        model_name=model_name, device=device)
    client.get_or_create_collection(collection_name, embedding_function=ef)
    return True
def create_and_add(collection_name, sub_docs, model_name, metadata):
    """Embed the given document chunks and persist them in the vector store."""
    logger.info(f"Adding documents to {collection_name}")
    embeddings = load_embedding(model_name)
    vectorstore = load_vectorstore(model_name, collection_name, metadata=metadata)
    vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
    vectorstore.persist()

    # Sanity check: reload the persisted store and run a test query
    vectorstore2 = load_vectorstore(model_name, collection_name, metadata=metadata)
    print(vectorstore2.similarity_search_with_score(query="What are AXAs green Goals?", k=4))
    return True
def load_from_file(files):
    """Write uploaded file objects to a temporary directory and parse them as PDF documents."""
    saved_files = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        temp_dir = pathlib.Path(tmpdirname)
        for file in files:
            file_name = os.path.join(temp_dir, file.name)
            saved_files.append(file_name)
            with open(file_name, mode='wb') as w:
                w.write(file.read())
        print(saved_files)
        loaders = [UnstructuredPDFLoader(pdf) for pdf in saved_files]
        docs = []
        print(loaders)
        # Parse the PDFs while the temporary directory still exists
        for loader in loaders:
            docs.extend(loader.load())
    return docs
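
# Usage sketch (assumption): `load_from_file` expects file-like objects exposing
# `.name` and `.read()`, e.g. the UploadedFile objects returned by Streamlit's
# st.file_uploader. Something along these lines:
#
#   uploaded = st.file_uploader("PDFs", type="pdf", accept_multiple_files=True)
#   docs = load_from_file(uploaded)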
def load_from_web(urls, cache=True):
    """Download PDFs from the given URLs; parsed documents are cached on disk keyed by the URL list."""
    docs_list = urls
    os.makedirs(f"{current_path}/.cache", exist_ok=True)  # ensure the cache directory exists
    filename = f"{current_path}/.cache/{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
    isFile = os.path.isfile(filename)
    if cache and isFile:
        logger.info("Using Cache")
        with open(filename, "rb") as pikd:
            docs = cloudpickle.load(pikd)
    else:
        loaders = [OnlinePDFLoader(pdf) for pdf in docs_list]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
        with open(filename, 'wb') as output:
            cloudpickle.dump(docs, output)

    # Update metadata so each document points back to its source URL
    for i, doc in enumerate(docs):
        doc.metadata = {'source': docs_list[i], 'url': docs_list[i], 'owner': 'Heiko Wagner'}
    return docs
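
# Usage sketch (the URL is a placeholder): repeated calls with the same URL list
# reuse the pickled documents under ./.cache instead of re-downloading.
#
#   docs = load_from_web(["https://example.com/report.pdf"])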
def load_and_split(docs, chunk_size=700):
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    sub_docs = text_splitter.split_documents(docs)
    return sub_docs
def metadata_generator(doc, llm, max_token=4000):
    #query = f"Document = {doc.page_content[1:max_token]} -> Respond a python code using a dict filling xxxx like {{'document_type': xxxx, 'summary (max. 30 letters)':'xxxx'}} resond at leat 10 letter"
    query = f"""
Cluster the following Input document into topic categories based on patterns seen within the text. Also mention the reasoning behind how these categories were defined.
Output format:
{{
    "DOCUMENT TYPE": "",
    "SUMMARY": [],
    "REASONING": ""
}}
Input document:
{doc.page_content[1:max_token]}
Output:
"""
    return llm(query)
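
# %%
# End-to-end sketch. The collection name, embedding model name, URL, and metadata value
# below are placeholders, and `llm` stands for any callable LangChain LLM; adjust them to
# your setup before running.
#
#   urls = ["https://example.com/report.pdf"]
#   docs = load_from_web(urls)
#   sub_docs = load_and_split(docs, chunk_size=700)
#   create_and_add("my_collection", sub_docs, "hkunlp/instructor-large", metadata=None)
#
#   # Optionally let an LLM classify each document first:
#   labels = [metadata_generator(doc, llm) for doc in docs]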