Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -181,6 +181,104 @@
|
|
| 181 |
|
| 182 |
|
| 183 |
# v2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
@cl.on_chat_start
|
| 185 |
async def on_chat_start():
|
| 186 |
|
|
@@ -245,3 +343,20 @@ async def on_chat_start():
|
|
| 245 |
# Store the chain in user session
|
| 246 |
cl.user_session.set("chain", chain)
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
|
| 183 |
# v2:
|
| 184 |
+
import re
|
| 185 |
+
import PyPDF2
|
| 186 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
| 187 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 188 |
+
from langchain_community.vectorstores import Chroma
|
| 189 |
+
from langchain.chains import ConversationalRetrievalChain
|
| 190 |
+
from langchain_community.chat_models import ChatOllama
|
| 191 |
+
from langchain_groq import ChatGroq
|
| 192 |
+
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
|
| 193 |
+
import chainlit as cl
|
| 194 |
+
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
|
| 195 |
+
import logging
|
| 196 |
+
import pypandoc
|
| 197 |
+
import pdfkit
|
| 198 |
+
from paddleocr import PaddleOCR
|
| 199 |
+
import fitz
|
| 200 |
+
import asyncio
|
| 201 |
+
from langchain_nomic.embeddings import NomicEmbeddings
|
| 202 |
+
|
| 203 |
+
# Chat model served through Groq (LangChain ChatGroq wrapper).
llm_groq = ChatGroq(model_name='llama3-70b-8192')

# Initialize anonymizer
# Entity types Presidio is asked to detect and replace; the same instance is
# used later to map the replacements back to the original values.
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=[
        'PERSON',
        'EMAIL_ADDRESS',
        'PHONE_NUMBER',
        'IBAN_CODE',
        'CREDIT_CARD',
        'CRYPTO',
        'IP_ADDRESS',
        'LOCATION',
        'DATE_TIME',
        'NRP',
        'MEDICAL_LICENSE',
        'URL',
    ],
    # NOTE(review): fixed seed presumably keeps the generated fake values
    # stable between runs - confirm against the anonymizer documentation.
    faker_seed=18,
)
|
| 212 |
+
|
| 213 |
+
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Value passed straight to ``PyPDF2.PdfReader``.

    Returns:
        One string made of each page's ``extract_text()`` output with no
        separator between pages (empty string for a PDF without pages).
    """
    reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in reader.pages)
|
| 219 |
+
|
| 220 |
+
def has_sufficient_selectable_text(page, threshold=50):
    """Tell whether a page carries more than ``threshold`` characters of text.

    Args:
        page: Object exposing ``extract_text()`` that returns a string.
        threshold: Character count that must be strictly exceeded.

    Returns:
        ``True`` when the extracted text, with surrounding whitespace
        removed, is longer than ``threshold``; ``False`` otherwise.
    """
    stripped = page.extract_text().strip()
    return len(stripped) > threshold
|
| 225 |
+
|
| 226 |
+
async def get_text(file_path):
    """Run PaddleOCR over an image, PDF or DOCX file and return its text.

    A DOCX input is first converted to PDF with ``convert_docx_to_pdf`` and
    the OCR is run on the converted file.

    Args:
        file_path: Path string of the input file. Its extension, compared
            case-insensitively, must be jpg, jpeg, png, pdf or docx.

    Returns:
        - the recognised text, each OCR line followed by one space, or
        - the string ``"Error occurred during OCR process."`` when anything
          inside the OCR pipeline raised, or
        - ``{"error": <message>}`` when the extension is not supported
          (kept as a dict for backward compatibility with existing callers).
    """
    allowed_extension = ("jpg", "jpeg", "png", "pdf", "docx")
    text = ""
    try:
        logging.info("Starting OCR process for file: %s", file_path)
        extension = file_path.rsplit(".", 1)[-1].lower()
        if extension not in allowed_extension:
            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
            logging.error(error)
            return {"error": error}

        if extension == "docx":
            file_path = convert_docx_to_pdf(file_path)

        ocr = PaddleOCR(use_angle_cls=True, lang='en')
        result = ocr.ocr(file_path, cls=True)

        fragments = []
        for page_result in result or []:
            # PaddleOCR can report an image/page without detected text as
            # None; skip it instead of failing the whole document.
            if not page_result:
                continue
            # Each OCR line is [box, (text, confidence)].
            fragments.extend(line[1][0] + " " for line in page_result)
        text = "".join(fragments)
        logging.info("OCR process completed successfully for file: %s", file_path)
    except Exception as e:
        # Best-effort boundary: the caller gets a readable message, not a crash.
        logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
        text = "Error occurred during OCR process."
    # NOTE(review): this logs the full extracted text, which may contain the
    # personal data the anonymizer is meant to protect - consider logging
    # only its length.
    logging.info("Extracted text: %s", text)
    return text
|
| 252 |
+
|
| 253 |
+
def convert_docx_to_pdf(input_path):
    """Convert a DOCX file to PDF by way of an intermediate HTML file.

    pandoc renders the document to ``<stem>.html`` and pdfkit then turns
    that HTML into ``<stem>.pdf``, both next to the input file.

    Args:
        input_path: Path string of the DOCX file.

    Returns:
        Path of the generated PDF (the input path with its extension
        replaced by ``.pdf``).
    """
    import os  # local import: the file's import block is left untouched

    # Derive both paths from the real extension. A plain
    # str.replace('.docx', ...) misses upper-case extensions (leaving the
    # HTML path equal to the input, so pandoc would overwrite the source)
    # and also rewrites '.docx' appearing in a directory name.
    stem, _extension = os.path.splitext(input_path)
    html_path = stem + ".html"
    output_path = stem + ".pdf"

    pypandoc.convert_file(input_path, 'html', outputfile=html_path)
    pdfkit.from_file(html_path, output_path)
    logging.info("DOCX Format Handled")
    return output_path
|
| 260 |
+
|
| 261 |
+
def _ocr_pdf_page(ocr, pdf_document, page_index, scratch_dir):
    """Render one PDF page to a PNG in ``scratch_dir`` and return its OCR text."""
    import os

    pix = pdf_document.load_page(page_index).get_pixmap()
    image_path = os.path.join(scratch_dir, f"page_{page_index+1}.png")
    pix.save(image_path)

    fragments = []
    for page_result in ocr.ocr(image_path, cls=True) or []:
        # PaddleOCR can report an image without detected text as None.
        if not page_result:
            continue
        # Each OCR line is [box, (text, confidence)].
        fragments.extend(line[1][0] + " " for line in page_result)
    return "".join(fragments)


async def extract_text_from_mixed_pdf(file_path):
    """Extract text from a PDF, using OCR for pages with little text.

    For every page the text layer is read with PyPDF2. When
    ``has_sufficient_selectable_text`` rejects the page, the page is
    rendered to an image with PyMuPDF and the PaddleOCR output is appended
    to whatever text the page did have.

    Page images are written to a temporary directory that is removed
    before returning, so nothing is left in the working directory and
    concurrent calls cannot overwrite each other's images.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text of all pages concatenated in page order.
    """
    import tempfile

    pdf = PyPDF2.PdfReader(file_path)
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    page_texts = []
    pdf_document = None  # opened on first use, shared by every OCR'd page
    try:
        with tempfile.TemporaryDirectory() as scratch_dir:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if not has_sufficient_selectable_text(page):
                    logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
                    if pdf_document is None:
                        pdf_document = fitz.open(file_path)
                    text += _ocr_pdf_page(ocr, pdf_document, i, scratch_dir)
                page_texts.append(text)
    finally:
        # Release the PyMuPDF handle even when OCR fails part-way through.
        if pdf_document is not None:
            pdf_document.close()
    return "".join(page_texts)
|
| 281 |
+
|
| 282 |
@cl.on_chat_start
|
| 283 |
async def on_chat_start():
|
| 284 |
|
|
|
|
| 343 |
# Store the chain in user session
|
| 344 |
cl.user_session.set("chain", chain)
|
| 345 |
|
| 346 |
+
@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming chat message with the session's retrieval chain.

    The chain stored under ``"chain"`` in the user session is invoked with
    the message text, the ``"answer"`` field of its result is passed through
    ``anonymizer.deanonymize`` and the outcome is sent back to the user.

    Args:
        message: The Chainlit message received from the user.
    """
    # Chain stored in the user session under "chain".
    qa_chain = cl.user_session.get("chain")

    # Chainlit's LangChain callback handler, passed along with the call.
    handler = cl.AsyncLangchainCallbackHandler()

    response = await qa_chain.ainvoke(message.content, callbacks=[handler])

    # Swap the anonymizer's placeholder values back before showing the answer.
    restored_answer = anonymizer.deanonymize(response["answer"])

    # No extra elements are attached to the reply.
    await cl.Message(content=restored_answer, elements=[]).send()
|