Vinh Nguyen committed on
Commit
91d4c2f
1 Parent(s): 2bcd3aa

Disable tokenizer transformer parallelism to avoid deadlocks

Browse files
Files changed (1) hide show
  1. document_retriever.py +5 -3
document_retriever.py CHANGED
@@ -2,17 +2,19 @@ import os
2
  import tempfile
3
 
4
  import streamlit as st
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.document_loaders import (
7
  Docx2txtLoader,
8
  PyPDFLoader,
9
  TextLoader,
10
  UnstructuredEPubLoader,
11
  )
 
12
  from langchain_community.vectorstores import DocArrayInMemorySearch
13
  from langchain_text_splitters import RecursiveCharacterTextSplitter
14
 
15
- EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
 
 
16
 
17
 
18
  @st.cache_resource(ttl="1h")
@@ -47,7 +49,7 @@ def configure_retriever(files):
47
  splits = text_splitter.split_documents(docs)
48
 
49
  # Create embeddings and store in vectordb
50
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
51
  vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
52
 
53
  # Define retriever
 
2
  import tempfile
3
 
4
  import streamlit as st
 
5
  from langchain_community.document_loaders import (
6
  Docx2txtLoader,
7
  PyPDFLoader,
8
  TextLoader,
9
  UnstructuredEPubLoader,
10
  )
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
  from langchain_community.vectorstores import DocArrayInMemorySearch
13
  from langchain_text_splitters import RecursiveCharacterTextSplitter
14
 
15
+ # disable tokenizer transformer parallelism to avoid deadlocks
16
+ # https://github.com/huggingface/transformers/issues/5486
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
 
19
 
20
  @st.cache_resource(ttl="1h")
 
49
  splits = text_splitter.split_documents(docs)
50
 
51
  # Create embeddings and store in vectordb
52
+ embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
53
  vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
54
 
55
  # Define retriever