Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import re
|
| 2 |
import PyPDF2
|
| 3 |
from langchain_community.embeddings import OllamaEmbeddings
|
| 4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
@@ -22,7 +22,7 @@ llm_groq = ChatGroq(
|
|
| 22 |
)
|
| 23 |
|
| 24 |
# Initialize anonymizer
|
| 25 |
-
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
|
| 26 |
|
| 27 |
def extract_text_from_pdf(file_path):
|
| 28 |
pdf = PyPDF2.PdfReader(file_path)
|
|
@@ -125,6 +125,27 @@ async def on_chat_start():
|
|
| 125 |
pdf_text
|
| 126 |
)
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
|
| 129 |
|
| 130 |
docsearch = await cl.make_async(Chroma.from_texts)(
|
|
@@ -170,12 +191,9 @@ async def main(message: cl.Message):
|
|
| 170 |
# Call the chain with user's message content
|
| 171 |
res = await chain.ainvoke(message.content, callbacks=[cb])
|
| 172 |
answer = anonymizer.deanonymize(
|
| 173 |
-
res["answer"]
|
| 174 |
)
|
| 175 |
text_elements = []
|
| 176 |
|
| 177 |
# Return results
|
| 178 |
await cl.Message(content=answer, elements=text_elements).send()
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
|
|
|
| 1 |
+
import re
|
| 2 |
import PyPDF2
|
| 3 |
from langchain_community.embeddings import OllamaEmbeddings
|
| 4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
# Initialize anonymizer
|
| 25 |
+
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN'], faker_seed=18)
|
| 26 |
|
| 27 |
def extract_text_from_pdf(file_path):
|
| 28 |
pdf = PyPDF2.PdfReader(file_path)
|
|
|
|
| 125 |
pdf_text
|
| 126 |
)
|
| 127 |
|
| 128 |
+
# with splitting into chunks
|
| 129 |
+
# {
|
| 130 |
+
# # Split the sanitized text into chunks
|
| 131 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 132 |
+
# texts = text_splitter.split_text(anonymized_text)
|
| 133 |
+
|
| 134 |
+
# # Create metadata for each chunk
|
| 135 |
+
# metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
|
| 136 |
+
|
| 137 |
+
# # Create a Chroma vector store
|
| 138 |
+
# embeddings = OllamaEmbeddings(model="nomic-embed-text")
|
| 139 |
+
# docsearch = await cl.make_async(Chroma.from_texts)(
|
| 140 |
+
# texts, embeddings, metadatas=metadatas
|
| 141 |
+
# )
|
| 142 |
+
# }
|
| 143 |
+
|
| 144 |
+
# without splitting into chunks
|
| 145 |
+
# {
|
| 146 |
+
# Create a Chroma vector store
|
| 147 |
+
|
| 148 |
+
# embeddings = OllamaEmbeddings(model="nomic-embed-text")
|
| 149 |
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
|
| 150 |
|
| 151 |
docsearch = await cl.make_async(Chroma.from_texts)(
|
|
|
|
| 191 |
# Call the chain with user's message content
|
| 192 |
res = await chain.ainvoke(message.content, callbacks=[cb])
|
| 193 |
answer = anonymizer.deanonymize(
|
| 194 |
+
"ok"+res["answer"]
|
| 195 |
)
|
| 196 |
text_elements = []
|
| 197 |
|
| 198 |
# Return results
|
| 199 |
await cl.Message(content=answer, elements=text_elements).send()
|
|
|
|
|
|
|
|
|