Spaces:
Running
Running
luanpoppe
committed on
Commit
·
ab79998
1
Parent(s):
64ed488
fix: adicionando mais uma possibilidade de carregar o pdf caso dê erro com o pypdf
Browse files
_utils/bubble_integrations/obter_arquivo.py
CHANGED
|
@@ -1,6 +1,11 @@
|
|
| 1 |
# from setup.easy_imports import PyPDFLoader
|
| 2 |
import os
|
| 3 |
-
from langchain_community.document_loaders import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import requests
|
| 6 |
|
|
@@ -37,7 +42,10 @@ async def get_pdf_from_bubble(
|
|
| 37 |
else:
|
| 38 |
extension = file_url.split(".")[-1]
|
| 39 |
if extension.lower() == "pdf":
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
elif extension.lower() == "odt":
|
| 42 |
temp_path = download_file_from_bubble(file_url, headers, ".odt")
|
| 43 |
full_text = splitter_utils.load_odt_file(temp_path)
|
|
|
|
| 1 |
# from setup.easy_imports import PyPDFLoader
|
| 2 |
import os
|
| 3 |
+
from langchain_community.document_loaders import (
|
| 4 |
+
PyPDFLoader,
|
| 5 |
+
Docx2txtLoader,
|
| 6 |
+
TextLoader,
|
| 7 |
+
PyMuPDFLoader,
|
| 8 |
+
)
|
| 9 |
import tempfile
|
| 10 |
import requests
|
| 11 |
|
|
|
|
| 42 |
else:
|
| 43 |
extension = file_url.split(".")[-1]
|
| 44 |
if extension.lower() == "pdf":
|
| 45 |
+
try:
|
| 46 |
+
result = PyPDFLoader(file_url, headers=headers).load()
|
| 47 |
+
except:
|
| 48 |
+
result = PyMuPDFLoader(file_url, headers=headers).load()
|
| 49 |
elif extension.lower() == "odt":
|
| 50 |
temp_path = download_file_from_bubble(file_url, headers, ".odt")
|
| 51 |
full_text = splitter_utils.load_odt_file(temp_path)
|
_utils/bubble_integrations/tests/__init__.py
ADDED
|
File without changes
|
_utils/bubble_integrations/tests/test_obter_arquivo.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from _utils.bubble_integrations.obter_arquivo import (
|
| 3 |
+
get_pdf_from_bubble,
|
| 4 |
+
)
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
from _utils.gerar_documento_utils.contextual_retriever import (
|
| 7 |
+
ContextualRetriever,
|
| 8 |
+
ContextualRetrieverUtils,
|
| 9 |
+
)
|
| 10 |
+
from _utils.gerar_documento_utils.llm_calls import agemini_answer
|
| 11 |
+
from _utils.models.gerar_documento import RetrievalConfig
|
| 12 |
+
from _utils.models.gerar_documento import (
|
| 13 |
+
ContextualizedChunk,
|
| 14 |
+
DocumentChunk,
|
| 15 |
+
RetrievalConfig,
|
| 16 |
+
)
|
| 17 |
+
from gerar_documento.serializer import (
|
| 18 |
+
GerarDocumentoComPDFProprioSerializer,
|
| 19 |
+
GerarDocumentoComPDFProprioSerializerData,
|
| 20 |
+
)
|
| 21 |
+
from setup.logging import Axiom
|
| 22 |
+
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class TestObterArquivo:
|
| 26 |
+
@pytest.mark.asyncio
|
| 27 |
+
async def test_get_pdf_from_bubble(self):
|
| 28 |
+
file_url = "https://vella.app.br/version-5265j/fileupload/f1745094959601x803512841326306200/000_I%CC%81ntegra%20%200015348-89.2020.4.06.0981.pdf"
|
| 29 |
+
|
| 30 |
+
response = await get_pdf_from_bubble(file_url, False)
|
| 31 |
+
assert len(response) > 0
|
| 32 |
+
for page in response:
|
| 33 |
+
assert isinstance(page, Document)
|
_utils/langchain_utils/Splitter_class.py
CHANGED
|
@@ -11,6 +11,7 @@ from setup.easy_imports import (
|
|
| 11 |
Document,
|
| 12 |
Docx2txtLoader,
|
| 13 |
TextLoader,
|
|
|
|
| 14 |
)
|
| 15 |
from typing import Any, List, Dict, Tuple, Optional, cast
|
| 16 |
from _utils.models.gerar_documento import (
|
|
@@ -71,7 +72,10 @@ class Splitter:
|
|
| 71 |
file_extension = self.splitter_util.get_file_type(pdf_path)
|
| 72 |
print("file_extension: ", file_extension)
|
| 73 |
if file_extension == "pdf":
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
| 75 |
elif file_extension == "odt":
|
| 76 |
full_text = self.splitter_util.load_odt_file(pdf_path)
|
| 77 |
pages = self.splitter_simple.load_and_split_text(full_text)
|
|
|
|
| 11 |
Document,
|
| 12 |
Docx2txtLoader,
|
| 13 |
TextLoader,
|
| 14 |
+
PyMuPDFLoader,
|
| 15 |
)
|
| 16 |
from typing import Any, List, Dict, Tuple, Optional, cast
|
| 17 |
from _utils.models.gerar_documento import (
|
|
|
|
| 72 |
file_extension = self.splitter_util.get_file_type(pdf_path)
|
| 73 |
print("file_extension: ", file_extension)
|
| 74 |
if file_extension == "pdf":
|
| 75 |
+
try:
|
| 76 |
+
pages = PyPDFLoader(pdf_path).load()
|
| 77 |
+
except:
|
| 78 |
+
pages = PyMuPDFLoader(pdf_path).load()
|
| 79 |
elif file_extension == "odt":
|
| 80 |
full_text = self.splitter_util.load_odt_file(pdf_path)
|
| 81 |
pages = self.splitter_simple.load_and_split_text(full_text)
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|
setup/easy_imports.py
CHANGED
|
@@ -13,7 +13,12 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|
| 13 |
from langchain.prompts import PromptTemplate
|
| 14 |
from langchain_core.runnables import RunnablePassthrough
|
| 15 |
from langchain_core.prompts import ChatPromptTemplate
|
| 16 |
-
from langchain_community.document_loaders import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from langchain_community.vectorstores import Chroma
|
| 18 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 19 |
|
|
|
|
| 13 |
from langchain.prompts import PromptTemplate
|
| 14 |
from langchain_core.runnables import RunnablePassthrough
|
| 15 |
from langchain_core.prompts import ChatPromptTemplate
|
| 16 |
+
from langchain_community.document_loaders import (
|
| 17 |
+
PyPDFLoader,
|
| 18 |
+
Docx2txtLoader,
|
| 19 |
+
TextLoader,
|
| 20 |
+
PyMuPDFLoader,
|
| 21 |
+
)
|
| 22 |
from langchain_community.vectorstores import Chroma
|
| 23 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 24 |
|