File size: 2,068 Bytes
12b0dd7
b374298
 
 
cb23311
b374298
cb23311
12b0dd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b374298
12b0dd7
 
 
b374298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import tempfile, os
from typing import List
from langchain_core.documents import Document as LangchainDocument
from llama_index import Document
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from llama_parse import LlamaParse, ResultType


def handle_pdf_files_from_serializer(files):
    """Persist each uploaded PDF to its own temporary file on disk.

    Args:
        files: iterable of uploaded file objects exposing ``seek`` and
            ``chunks()`` (Django ``UploadedFile``-style interface).

    Returns:
        list[str]: paths of the temporary ``.pdf`` files created. The files
        are intentionally not auto-deleted; callers clean them up later
        (e.g. via ``remove_pdf_temp_files``).
    """
    temp_paths = []
    for uploaded in files:
        # Rewind in case the stream was already partially consumed upstream.
        uploaded.seek(0)
        # delete=False keeps the file on disk after the context manager exits,
        # so the path stays valid for downstream processing.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            for chunk in uploaded.chunks():
                tmp.write(chunk)
            temp_paths.append(tmp.name)
    print("listaPDFs: ", temp_paths)
    return temp_paths


def remove_pdf_temp_files(listaPDFs):
    """Delete the given temporary PDF files from disk.

    Args:
        listaPDFs: iterable of filesystem paths previously produced by
            ``handle_pdf_files_from_serializer``.

    Files that are already gone are silently skipped so that cleanup is
    idempotent (calling this twice, or after a partial failure, is safe).
    """
    for path in listaPDFs:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Best-effort cleanup: the file may have been removed already.
            pass


async def return_document_list_with_llama_parser(file: str) -> List[LangchainDocument]:
    """Parse a PDF with LlamaParse and return one LangchainDocument per page.

    Args:
        file: path of the PDF to submit to LlamaParse.

    Returns:
        A list of ``LangchainDocument`` objects, one per parsed page, with
        the page's markdown in ``page_content`` and the page number under
        the ``"page"`` metadata key.

    Raises:
        ValueError: if the ``LLAMA_CLOUD_API_KEY`` environment variable is
            not set.
    """
    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
    documents: List[LangchainDocument] = []
    if llama_parser_api:
        parser = LlamaParse(
            api_key=llama_parser_api,
            result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
            language="pt",  # documents are in Portuguese
            verbose=True,
        )

        # NOTE(review): aget_json appears to return one result dict per
        # submitted file, hence [0] for our single file — confirm against
        # the llama_parse API docs. Will raise TypeError if "pages" is
        # absent (get() returns None).
        parsed_document = await parser.aget_json(file)
        for doc in parsed_document[0].get("pages"):  # type: ignore
            # documents.append(doc.to_langchain_format())

            langchain_document = LangchainDocument(
                page_content=doc.get("md"),  # type: ignore
                metadata={
                    "page": doc.get("page"),  # type: ignore
                    # **doc.get("metadata", {}),  # type: ignore
                },  # Include page number in metadata
            )

            documents.append(langchain_document)

        return documents
    else:
        raise ValueError("Não foi possível obter a API_KEY do llama parser")