Spaces:

luanpoppe
/

vella-backend

Running

luanpoppe committed on Apr 24

Commit

ab79998

1 Parent(s): 64ed488

fix: adicionando mais uma possibilidade de carregar o pdf caso dê erro com o pypdf

Files changed (6) hide show

_utils/bubble_integrations/obter_arquivo.py CHANGED Viewed

@@ -1,6 +1,11 @@
 # from setup.easy_imports import PyPDFLoader
 import os
-from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
 import tempfile
 import requests
@@ -37,7 +42,10 @@ async def get_pdf_from_bubble(
     else:
         extension = file_url.split(".")[-1]
         if extension.lower() == "pdf":
-            result = PyPDFLoader(file_url, headers=headers).load()
         elif extension.lower() == "odt":
             temp_path = download_file_from_bubble(file_url, headers, ".odt")
             full_text = splitter_utils.load_odt_file(temp_path)

 # from setup.easy_imports import PyPDFLoader
 import os
+from langchain_community.document_loaders import (
+    PyPDFLoader,
+    Docx2txtLoader,
+    TextLoader,
+    PyMuPDFLoader,
+)
 import tempfile
 import requests
     else:
         extension = file_url.split(".")[-1]
         if extension.lower() == "pdf":
+            try:
+                result = PyPDFLoader(file_url, headers=headers).load()
+            except:
+                result = PyMuPDFLoader(file_url, headers=headers).load()
         elif extension.lower() == "odt":
             temp_path = download_file_from_bubble(file_url, headers, ".odt")
             full_text = splitter_utils.load_odt_file(temp_path)

_utils/bubble_integrations/tests/__init__.py ADDED Viewed

File without changes

_utils/bubble_integrations/tests/test_obter_arquivo.py ADDED Viewed

+import pytest
+from _utils.bubble_integrations.obter_arquivo import (
+    get_pdf_from_bubble,
+)
+from langchain_core.documents import Document
+from _utils.gerar_documento_utils.contextual_retriever import (
+    ContextualRetriever,
+    ContextualRetrieverUtils,
+)
+from _utils.gerar_documento_utils.llm_calls import agemini_answer
+from _utils.models.gerar_documento import RetrievalConfig
+from _utils.models.gerar_documento import (
+    ContextualizedChunk,
+    DocumentChunk,
+    RetrievalConfig,
+)
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializer,
+    GerarDocumentoComPDFProprioSerializerData,
+)
+from setup.logging import Axiom
+from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
+class TestObterArquivo:
+    """Tests for `get_pdf_from_bubble`."""
+    @pytest.mark.asyncio
+    async def test_get_pdf_from_bubble(self):
+        """Load a PDF by URL and check that a non-empty collection of Documents is returned."""
+        # NOTE(review): hard-coded remote URL — this presumably requires network
+        # access and the file to stay hosted; confirm before relying on it in CI.
+        file_url = "https://vella.app.br/version-5265j/fileupload/f1745094959601x803512841326306200/000_I%CC%81ntegra%20%200015348-89.2020.4.06.0981.pdf"
+        # NOTE(review): the meaning of the second positional argument (False) is
+        # not visible here — check the signature of get_pdf_from_bubble.
+        response = await get_pdf_from_bubble(file_url, False)
+        assert len(response) > 0
+        # Every returned item must be a langchain_core Document.
+        for page in response:
+            assert isinstance(page, Document)

_utils/langchain_utils/Splitter_class.py CHANGED Viewed

@@ -11,6 +11,7 @@ from setup.easy_imports import (
     Document,
     Docx2txtLoader,
     TextLoader,
 )
 from typing import Any, List, Dict, Tuple, Optional, cast
 from _utils.models.gerar_documento import (
@@ -71,7 +72,10 @@ class Splitter:
                 file_extension = self.splitter_util.get_file_type(pdf_path)
                 print("file_extension: ", file_extension)
                 if file_extension == "pdf":
-                    pages = PyPDFLoader(pdf_path).load()
                 elif file_extension == "odt":
                     full_text = self.splitter_util.load_odt_file(pdf_path)
                     pages = self.splitter_simple.load_and_split_text(full_text)

     Document,
     Docx2txtLoader,
     TextLoader,
+    PyMuPDFLoader,
 )
 from typing import Any, List, Dict, Tuple, Optional, cast
 from _utils.models.gerar_documento import (
                 file_extension = self.splitter_util.get_file_type(pdf_path)
                 print("file_extension: ", file_extension)
                 if file_extension == "pdf":
+                    try:
+                        pages = PyPDFLoader(pdf_path).load()
+                    except:
+                        pages = PyMuPDFLoader(pdf_path).load()
                 elif file_extension == "odt":
                     full_text = self.splitter_util.load_odt_file(pdf_path)
                     pages = self.splitter_simple.load_and_split_text(full_text)

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

setup/easy_imports.py CHANGED Viewed

@@ -13,7 +13,12 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
 from langchain_community.vectorstores import Chroma
 from langchain_google_genai import ChatGoogleGenerativeAI

 from langchain.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.document_loaders import (
+    PyPDFLoader,
+    Docx2txtLoader,
+    TextLoader,
+    PyMuPDFLoader,
+)
 from langchain_community.vectorstores import Chroma
 from langchain_google_genai import ChatGoogleGenerativeAI