# NOTE(review): the three lines that were here ("Spaces:" / "Running" / "Running")
# were web-page scrape residue, not code; kept as this comment so the file parses.
import os
import tempfile
from typing import List

from langchain_core.documents import Document as LangchainDocument
from llama_index import Document
from llama_parse import LlamaParse, ResultType

from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
def handle_pdf_files_from_serializer(files):
    """Persist each uploaded PDF to its own temporary file on disk.

    Args:
        files: iterable of uploaded-file objects exposing ``seek`` and
            ``chunks()`` (Django ``UploadedFile``-style — TODO confirm caller).

    Returns:
        list[str]: paths of the ``.pdf`` temp files created. They are made
        with ``delete=False``, so the caller owns cleanup (see
        ``remove_pdf_temp_files``).
    """
    listaPDFs = []
    for uploaded in files:
        # Rewind in case the stream was already consumed upstream.
        uploaded.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # Stream chunk by chunk so large uploads never sit fully in memory.
            for piece in uploaded.chunks():
                tmp.write(piece)
            listaPDFs.append(tmp.name)
    print("listaPDFs: ", listaPDFs)
    return listaPDFs
def remove_pdf_temp_files(listaPDFs):
    """Delete the temp PDFs created by ``handle_pdf_files_from_serializer``.

    Args:
        listaPDFs: iterable of filesystem paths to remove.

    Raises:
        OSError: if any path is missing or cannot be removed.
    """
    for caminho in listaPDFs:
        os.remove(caminho)
async def return_document_list_with_llama_parser(file: str):
    """Parse *file* with LlamaParse and convert each page to a LangChain doc.

    Args:
        file: path to the PDF to parse.

    Returns:
        List[LangchainDocument]: one document per parsed page — the page's
        markdown as ``page_content`` and its page number under
        ``metadata["page"]``.

    Raises:
        ValueError: when the ``LLAMA_CLOUD_API_KEY`` env var is unset/empty.
    """
    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
    documents: List[LangchainDocument] = []
    # Guard clause: without an API key the remote parser cannot be used.
    if not llama_parser_api:
        raise ValueError("Não foi possível obter a API_KEY do llama parser")
    parser = LlamaParse(
        api_key=llama_parser_api,
        result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
        language="pt",
        verbose=True,
    )
    parsed_document = await parser.aget_json(file)
    # First element holds this file's parse result; iterate its pages.
    for page in parsed_document[0].get("pages"):  # type: ignore
        documents.append(
            LangchainDocument(
                page_content=page.get("md"),  # type: ignore
                metadata={"page": page.get("page")},  # type: ignore
            )
        )
    return documents