import tempfile, os
from typing import List
from langchain_core.documents import Document as LangchainDocument
from llama_index import Document
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from llama_parse import LlamaParse, ResultType
def handle_pdf_files_from_serializer(files):
    """Persist each uploaded PDF to its own temporary file.

    Args:
        files: iterable of uploaded-file objects exposing ``seek`` and
            ``chunks()`` (Django ``UploadedFile``-style interface —
            TODO confirm against the serializer that supplies them).

    Returns:
        List of filesystem paths to the temporary ``.pdf`` files. The
        files are created with ``delete=False``, so callers own cleanup.
    """
    temp_paths = []
    for uploaded in files:
        # Rewind so the full upload is copied even if it was read before.
        uploaded.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as destination:
            # Stream the upload into the temp file chunk by chunk.
            for piece in uploaded.chunks():
                destination.write(piece)
            temp_paths.append(destination.name)
    print("listaPDFs: ", temp_paths)
    return temp_paths
def remove_pdf_temp_files(listaPDFs):
    """Best-effort cleanup of the temporary PDFs created for processing.

    Args:
        listaPDFs: list of filesystem paths (as returned by
            ``handle_pdf_files_from_serializer``) to delete.
    """
    for path in listaPDFs:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone (double cleanup / external removal) — cleanup
            # is best-effort, so a missing file is not an error.
            pass
async def return_document_list_with_llama_parser(file: str) -> List[LangchainDocument]:
    """Parse a PDF via the LlamaParse cloud API into LangChain documents.

    Args:
        file: path to the PDF file to parse.

    Returns:
        One ``LangchainDocument`` per parsed page, carrying the page's
        markdown as ``page_content`` and the page number in ``metadata``.

    Raises:
        ValueError: when ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
    """
    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
    if not llama_parser_api:
        # Guard clause: fail fast before touching the network.
        raise ValueError("Não foi possível obter a API_KEY do llama parser")

    parser = LlamaParse(
        api_key=llama_parser_api,
        result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
        language="pt",
        verbose=True,
    )

    documents: List[LangchainDocument] = []
    parsed_document = await parser.aget_json(file)
    # ``get("pages")`` may be None if the key is absent; ``or []`` avoids
    # a TypeError and yields an empty document list instead.
    for doc in parsed_document[0].get("pages") or []:  # type: ignore
        documents.append(
            LangchainDocument(
                page_content=doc.get("md"),  # type: ignore
                metadata={
                    "page": doc.get("page"),  # type: ignore
                },  # Include page number in metadata
            )
        )
    return documents