# NOTE(review): the three lines that were here ("Spaces:" / "Running" / "Running")
# were web-page scrape residue, not code; kept as this comment so the file parses.
import os
import tempfile
from typing import List

from langchain_core.documents import Document as LangchainDocument
from llama_index import Document
from llama_parse import LlamaParse, ResultType

from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
def handle_pdf_files_from_serializer(files):
    """Persist each uploaded PDF to its own temporary file on disk.

    Args:
        files: iterable of uploaded-file objects exposing ``seek`` and
            ``chunks()`` (Django ``UploadedFile``-style — TODO confirm caller).

    Returns:
        list[str]: paths of the ``.pdf`` temp files created. They are made
        with ``delete=False``, so the caller owns cleanup (see
        ``remove_pdf_temp_files``).
    """
    listaPDFs = []
    for uploaded in files:
        # Rewind in case the stream was already consumed upstream.
        uploaded.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # Stream chunk by chunk so large uploads never sit fully in memory.
            for piece in uploaded.chunks():
                tmp.write(piece)
            listaPDFs.append(tmp.name)
    print("listaPDFs: ", listaPDFs)
    return listaPDFs
def remove_pdf_temp_files(listaPDFs):
    """Delete the temp PDFs created by ``handle_pdf_files_from_serializer``.

    Args:
        listaPDFs: iterable of filesystem paths to remove.

    Raises:
        OSError: if any path is missing or cannot be removed.
    """
    for caminho in listaPDFs:
        os.remove(caminho)
async def return_document_list_with_llama_parser(file: str):
    """Parse *file* with LlamaParse and convert each page to a LangChain doc.

    Args:
        file: path to the PDF to parse.

    Returns:
        List[LangchainDocument]: one document per parsed page — the page's
        markdown as ``page_content`` and its page number under
        ``metadata["page"]``.

    Raises:
        ValueError: when the ``LLAMA_CLOUD_API_KEY`` env var is unset/empty.
    """
    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
    documents: List[LangchainDocument] = []
    # Guard clause: without an API key the remote parser cannot be used.
    if not llama_parser_api:
        raise ValueError("Não foi possível obter a API_KEY do llama parser")
    parser = LlamaParse(
        api_key=llama_parser_api,
        result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
        language="pt",
        verbose=True,
    )
    parsed_document = await parser.aget_json(file)
    # First element holds this file's parse result; iterate its pages.
    for page in parsed_document[0].get("pages"):  # type: ignore
        documents.append(
            LangchainDocument(
                page_content=page.get("md"),  # type: ignore
                metadata={"page": page.get("page")},  # type: ignore
            )
        )
    return documents