luanpoppe committed · Commit d07865c · Parent(s): 39fc36b
feat: pequenas melhorias
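Despite the generic title, the change set has a clear shape: ContextualRetriever is decoupled from EnhancedDocumentSummarizer and built by the caller instead; the PDF-loading and auxiliary-summary helpers move from contextual_retriever.py to utils.py; and chunk contextualization is batched (one LLM request per list of 20 chunks) and fanned out with asyncio.TaskGroup.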
Files changed:
- _utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py  +1 -9
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py  +58 -206
- _utils/gerar_relatorio_modelo_usuario/utils.py  +55 -0
- _utils/resumo_completo_cursor.py  +23 -10
- tests/gerar_relatorio_modelo_usuario/test_contextual_retriever.py  +2 -0
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py  CHANGED

@@ -20,15 +20,12 @@ from _utils.models.gerar_relatorio import (
 )
 from modelos_usuarios.serializer import ModeloUsuarioSerializer
 from setup.environment import api_url
-from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
-    ContextualRetriever,
-)
+
 from asgiref.sync import sync_to_async
 
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
-    claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
 
     def __init__(
         self,
@@ -38,7 +35,6 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         chunk_overlap,
         num_k_rerank,
         model_cohere_rerank,
-        claude_context_model,
         prompt_auxiliar,
         gpt_model,
         gpt_temperature,
@@ -56,14 +52,10 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             model_cohere_rerank,
         )
         self.config = config
-        self.contextual_retriever = ContextualRetriever(
-            config, self.claude_api_key, claude_context_model
-        )
         self.logger = logging.getLogger(__name__)
         self.prompt_auxiliar = prompt_auxiliar
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
-        # self.id_modelo_do_usuario = id_modelo_do_usuario
         self.prompt_gerar_documento = prompt_gerar_documento
         self.reciprocal_rank_fusion = reciprocal_rank_fusion
         self.resumo_gerado = ""
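Net effect of this file's diff: EnhancedDocumentSummarizer no longer reads CLAUDE_API_KEY, no longer accepts claude_context_model, and no longer builds its own ContextualRetriever; the retriever is now constructed by the caller (see _utils/resumo_completo_cursor.py below), and the Claude credentials move into ContextualRetriever itself.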
    	
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py  CHANGED

@@ -1,33 +1,16 @@
 import os
-
-from _utils.LLMs.LLM_class import LLM
-from _utils.gerar_relatorio_modelo_usuario.prompts import (
-    prompt_auxiliar_do_contextual_prompt,
-    create_prompt_auxiliar_do_contextual_prompt,
-)
-from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
-from _utils.chains.Chain_class import Chain
 from _utils.gerar_relatorio_modelo_usuario.utils import (
+    get_response_from_auxiliar_contextual_prompt,
     validate_many_chunks_in_one_request,
 )
-from _utils.handle_files import return_document_list_with_llama_parser
-from _utils.prompts.Prompt_class import Prompt
-from _utils.splitters.Splitter_class import Splitter
-from setup.easy_imports import PyPDFLoader
-from langchain_openai import ChatOpenAI
 from typing import List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
 from llama_index import Document as Llama_Index_Document
 import asyncio
-from langchain.prompts import PromptTemplate
 from typing import List
-from multiprocessing import Process, Barrier, Queue
 from dataclasses import dataclass
-from langchain_core.messages import HumanMessage
-from asgiref.sync import sync_to_async
-from setup.easy_imports import ChatPromptTemplate, ChatOpenAI
 
 from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
@@ -36,161 +19,30 @@ from _utils.models.gerar_relatorio import (
     DocumentChunk,
     RetrievalConfig,
 )
-from _utils.prompts.Prompt_class import prompt as prompt_obj
 
 lista_contador = []
 
 
 class ContextualRetriever:
-    def __init__(
-        self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
-    ):
+
+    def __init__(self, config: RetrievalConfig, claude_context_model: str):
         self.config = config
-        # self.claude_client = Anthropic(api_key=claude_api_key)
-        self.claude_client = AsyncAnthropic(api_key=claude_api_key)
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
         self.claude_context_model = claude_context_model
 
-    async def contextualize_all_chunks(
-        self, full_text_as_array, chunks: List[DocumentChunk]
-    ) -> List[ContextualizedChunk]:
-        """Add context to all chunks"""
-        contextualized_chunks = []
-        full_text = ""
-        for x in full_text_as_array:
-            full_text += x
-
-        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
-
-        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
-
-        # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
-        # response_auxiliar_summary = await aclaude_answer(
-        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
-        # )
-
-        llms = LLM()
-        response_auxiliar_summary = await llms.googleGemini().ainvoke(
-            [HumanMessage(content=prompt_auxiliar_summary)]
-        )
-
-        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
-
-        lista_de_listas_cada_com_20_chunks = [
-            chunks[i : i + 20] for i in range(0, len(chunks), 20)
-        ]
-        print(
-            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
-        )
-
-        async with asyncio.TaskGroup() as tg:
-            tasks = [
-                tg.create_task(
-                    self.create_contextualized_chunk(
-                        chunk, full_text_as_array, response_auxiliar_summary.content
-                    )
-                )
-                # for chunk in chunks # ORIGINAL
-                for chunk in lista_de_listas_cada_com_20_chunks
-            ]
-
-        # contextualized_chunks = [task.result() for task in tasks]
-        contextualized_chunks = []
-        for task in tasks:
-            # print("\n\ntask", task)
-            # print("\n\ntask.result()", task.result())
-
-            contextualized_chunks = contextualized_chunks + task.result()
-
-        return contextualized_chunks
-
-    # ORIGINAL
-    # async def create_contextualized_chunk(
-    #     self, chunk, single_page_text, response_auxiliar_summary
-    # ):
-    #     lista_contador.append(0)
-    #     print("contador: ", len(lista_contador))
-    #     page_number = chunk.page_number - 1
-    #     page_content = single_page_text[page_number].page_content
-
-    #     context = await self.llm_generate_context(
-    #         page_content, chunk, response_auxiliar_summary
-    #     )
-    #     print("context: ", context)
-    #     return ContextualizedChunk(
-    #         content=chunk.content,
-    #         page_number=chunk.page_number,
-    #         chunk_id=chunk.chunk_id,
-    #         start_char=chunk.start_char,
-    #         end_char=chunk.end_char,
-    #         context=context,
-    #     )
-
-    async def create_contextualized_chunk(
-        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
-    ):
-
-        lista_contador.append(0)
-        print("contador: ", len(lista_contador))
-        # all_pages_contents = ""
-        # contador = 1
-        # for chunk in chunks:
-        #     page_number = chunk.page_number - 1
-        #     page_content = single_page_text[page_number].page_content
-
-        #     all_pages_contents += page_content
-        #     contador += 1
-
-        result = await self.llm_generate_context(chunks, response_auxiliar_summary)
-
-        lista_chunks = []
-        for index, chunk in enumerate(chunks):
-            lista_chunks.append(
-                ContextualizedChunk(
-                    contextual_summary=result[index][2],
-                    content=chunk.content,
-                    page_number=chunk.page_number,
-                    id_do_processo=int(result[index][0]),
-                    chunk_id=chunk.chunk_id,
-                    start_char=chunk.start_char,
-                    end_char=chunk.end_char,
-                    context=result[index][1],
-                )
-            )
-
-        return lista_chunks
-
-    # ORIGINAL
-    # async def llm_generate_context(
-    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
-    # ) -> str:
-    #     """Generate contextual description using ChatOpenAI"""
-    #     try:
-    #         print("COMEÇOU A REQUISIÇÃO")
-    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
-    #         # response = await aclaude_answer(
-    #         #     self.claude_client, self.claude_context_model, prompt
-    #         # )
-
-    #         # response = await agpt_answer(prompt)
-    #         llms = LLM()
-    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
-    #         return cast(str, response.content)
-    #     except Exception as e:
-    #         self.logger.error(
-    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-    #         )
-    #         return ""
+        self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
+        self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
+        # self.claude_client = Anthropic(api_key=claude_api_key)
 
-    async def llm_generate_context(
-        self, chunks: List[DocumentChunk], resumo_auxiliar
+    async def llm_call_uma_lista_de_chunks(
+        self, lista_com_20_chunks: List[DocumentChunk], resumo_auxiliar
     ) -> str:
         """Generate contextual description using ChatOpenAI"""
         contador = 1
         all_chunks_contents = ""
 
-        for chunk in chunks:
+        for chunk in lista_com_20_chunks:
             all_chunks_contents += chunk.content
             all_chunks_contents += f"\n\n CHUNK {contador}:\n"
             contador += 1
@@ -203,7 +55,9 @@ class ContextualRetriever:
             # )
 
             for attempt in range(4):
-                print(...)
+                print(
+                    f"\n\nTENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt}: {all_chunks_contents[0:500]}"
+                )
                 raw_response = await agpt_answer(prompt)
                 response = cast(str, raw_response)
                 # llms = LLM()
@@ -211,7 +65,6 @@ class ContextualRetriever:
                 # return cast(str, response.content)
 
                 matches = validate_many_chunks_in_one_request(response)
-                # Convert matches to the desired format
 
                 if matches:
                     result = [
@@ -224,62 +77,61 @@
             self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
             return ""
 
-async def ...(
-    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
-):
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
-            pages, all_PDFs_chunks
-        )
-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = all_PDFs_chunks
-        is_contextualized_chunk = False
-
-    return chunks_passados, is_contextualized_chunk
-
-
-async def get_full_text_and_all_PDFs_chunks(
-    listaPDFs: List[str],
-    splitterObject: Splitter,
-    should_use_llama_parse: bool,
-    isBubble: bool,
-):
-    all_PDFs_chunks = []
-
-    pages: List[str] = []
-
-    # Load and process document
-    for pdf_path in listaPDFs:
-        # if isBubble:
-        #     pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
-        # else:
-        #     if should_use_llama_parse:
-        #         pages = pages + await return_document_list_with_llama_parser(pdf_path)
-        #     else:
-        #         pages = pages + PyPDFLoader(pdf_path).load()
-
-        chunks, pages = await splitterObject.load_and_split_document(
-            pdf_path, should_use_llama_parse, isBubble
-        )
-        all_PDFs_chunks = all_PDFs_chunks + chunks
-
-    return all_PDFs_chunks, pages
+    async def contextualize_uma_lista_de_chunks(
+        self, lista_com_20_chunks: List[DocumentChunk], response_auxiliar_summary
+    ):
+        lista_contador.append(0)
+        print("contador: ", len(lista_contador))
+
+        result = await self.llm_call_uma_lista_de_chunks(
+            lista_com_20_chunks, response_auxiliar_summary
+        )
+
+        lista_chunks = []
+        for index, chunk in enumerate(lista_com_20_chunks):
+            lista_chunks.append(
+                ContextualizedChunk(
+                    contextual_summary=result[index][2],
+                    content=chunk.content,
+                    page_number=chunk.page_number,
+                    id_do_processo=int(result[index][0]),
+                    chunk_id=chunk.chunk_id,
+                    start_char=chunk.start_char,
+                    end_char=chunk.end_char,
+                    context=result[index][1],
+                )
+            )
+
+        return lista_chunks
+
+    async def contextualize_all_chunks(
+        self,
+        all_PDFs_chunks: List[DocumentChunk],
+        response_auxiliar_summary,
+    ) -> List[ContextualizedChunk]:
+        """Add context to all chunks"""
+
+        lista_de_listas_cada_com_20_chunks = [
+            all_PDFs_chunks[i : i + 20] for i in range(0, len(all_PDFs_chunks), 20)
+        ]
+
+        async with asyncio.TaskGroup() as tg:
+            tasks = [
+                tg.create_task(
+                    self.contextualize_uma_lista_de_chunks(
+                        lista_com_20_chunks,
+                        response_auxiliar_summary,
+                    )
+                )
+                for lista_com_20_chunks in lista_de_listas_cada_com_20_chunks
+            ]
+
+        # contextualized_chunks = [task.result() for task in tasks]
+        contextualized_chunks = []
+        for task in tasks:
+            contextualized_chunks = contextualized_chunks + task.result()
+
+        return contextualized_chunks
 
 
 # Código comentado abaixo é para ler as páginas ao redor da página atual do chunk
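The heart of the refactor is visible in the new contextualize_all_chunks: chunks are grouped into lists of 20, each list goes to the LLM in a single request, and asyncio.TaskGroup fans the batches out concurrently. Below is a minimal self-contained sketch of that pattern, with illustrative names and a stand-in for the LLM call (asyncio.TaskGroup requires Python 3.11+):

import asyncio
from dataclasses import dataclass
from typing import List

@dataclass
class Chunk:
    chunk_id: int
    content: str

async def fake_llm_call(batch: List[Chunk]) -> List[str]:
    # Stand-in for llm_call_uma_lista_de_chunks: one request covers a whole batch.
    await asyncio.sleep(0.01)
    return [f"context for chunk {c.chunk_id}" for c in batch]

async def contextualize_all(chunks: List[Chunk], batch_size: int = 20) -> List[str]:
    # Split the chunk list into batches of `batch_size`, as the diff does with 20.
    batches = [chunks[i : i + batch_size] for i in range(0, len(chunks), batch_size)]
    async with asyncio.TaskGroup() as tg:  # all tasks finish before the block exits
        tasks = [tg.create_task(fake_llm_call(batch)) for batch in batches]
    results: List[str] = []
    for task in tasks:
        results += task.result()  # flatten per-batch results back into one list
    return results

if __name__ == "__main__":
    chunks = [Chunk(i, f"text {i}") for i in range(45)]
    print(len(asyncio.run(contextualize_all(chunks))))  # 45

Batching trades per-chunk prompt isolation for far fewer requests, which is why llm_call_uma_lista_de_chunks retries up to four times and validates (via validate_many_chunks_in_one_request) that the response still contains one entry per chunk.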
    	
_utils/gerar_relatorio_modelo_usuario/utils.py  CHANGED

@@ -1,5 +1,12 @@
 from typing import List, Tuple
 from langchain_core.documents import Document
+from langchain_core.messages import HumanMessage
+
+from _utils.splitters.Splitter_class import Splitter
+from _utils.LLMs.LLM_class import LLM
+from _utils.gerar_relatorio_modelo_usuario.prompts import (
+    create_prompt_auxiliar_do_contextual_prompt,
+)
 
 
 def gerar_resposta_compilada(serializer):
@@ -69,3 +76,51 @@ def validate_many_chunks_in_one_request(response: str):
     if len(matches) == 0:
         return False
     return matches_as_list
+
+
+# Esta função gera a resposta que será usada em cada um das requisições de cada chunk
+async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
+    full_text = ""
+    for x in full_text_as_array:
+        full_text += x
+
+    prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+
+    print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+    # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
+    # response_auxiliar_summary = await aclaude_answer(
+    #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+    # )
+
+    llms = LLM()
+    response_auxiliar_summary = await llms.googleGemini().ainvoke(
+        [HumanMessage(content=prompt_auxiliar_summary)]
+    )
+
+    print(
+        "\n\n\n\nresponse_auxiliar_summary.content[0:500]: ",
+        response_auxiliar_summary.content[0:500],
+    )
+
+    return response_auxiliar_summary.content
+
+
+async def get_full_text_and_all_PDFs_chunks(
+    listaPDFs: List[str],
+    splitterObject: Splitter,
+    should_use_llama_parse: bool,
+    isBubble: bool,
+):
+    all_PDFs_chunks = []
+
+    pages: List[str] = []
+
+    # Load and process document
+    for pdf_path in listaPDFs:
+        chunks, pages = await splitterObject.load_and_split_document(
+            pdf_path, should_use_llama_parse, isBubble
+        )
+        all_PDFs_chunks = all_PDFs_chunks + chunks
+
+    return all_PDFs_chunks, pages
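utils.py now owns document loading as well. Here is a small runnable sketch of the accumulation pattern in get_full_text_and_all_PDFs_chunks, using a hypothetical stand-in for Splitter.load_and_split_document (the real method also takes should_use_llama_parse and isBubble):

import asyncio
from typing import List, Tuple

async def load_and_split_document(pdf_path: str) -> Tuple[List[str], List[str]]:
    # Hypothetical stand-in for Splitter.load_and_split_document.
    await asyncio.sleep(0)  # pretend to parse the PDF
    return [f"{pdf_path}#chunk{i}" for i in range(3)], [f"{pdf_path}#page1"]

async def get_chunks(lista_pdfs: List[str]) -> Tuple[List[str], List[str]]:
    all_chunks: List[str] = []
    pages: List[str] = []
    for pdf_path in lista_pdfs:
        # Chunks accumulate across PDFs; pages is reassigned each iteration,
        # so the returned pages reflect only the last PDF, as in the original.
        chunks, pages = await load_and_split_document(pdf_path)
        all_chunks = all_chunks + chunks
    return all_chunks, pages

if __name__ == "__main__":
    chunks, pages = asyncio.run(get_chunks(["a.pdf", "b.pdf"]))
    print(len(chunks), pages)  # 6 ['b.pdf#page1']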
    	
_utils/resumo_completo_cursor.py  CHANGED

@@ -4,10 +4,13 @@ from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
     EnhancedDocumentSummarizer,
 )
 from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
-    ...,
+    ContextualRetriever,
+)
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    gerar_resposta_compilada,
     get_full_text_and_all_PDFs_chunks,
+    get_response_from_auxiliar_contextual_prompt,
 )
-from _utils.gerar_relatorio_modelo_usuario.utils import gerar_resposta_compilada
 from _utils.models.gerar_relatorio import (
     RetrievalConfig,
 )
@@ -51,6 +54,10 @@ async def get_llm_summary_answer_by_cursor_complete(
         chunk_overlap=serializer["chunk_overlap"],
     )
 
+    contextual_retriever = ContextualRetriever(
+        config, serializer["claude_context_model"]
+    )
+
     # Initialize enhanced summarizer
     summarizer = EnhancedDocumentSummarizer(
         config=config,
@@ -59,29 +66,35 @@ async def get_llm_summary_answer_by_cursor_complete(
         chunk_size=serializer["chunk_size"],
         num_k_rerank=serializer["num_k_rerank"],
         model_cohere_rerank=serializer["model_cohere_rerank"],
-        claude_context_model=serializer["claude_context_model"],
         prompt_auxiliar=serializer["prompt_auxiliar"],
         gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
-        # id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
         prompt_gerar_documento=serializer["prompt_gerar_documento"],
         reciprocal_rank_fusion=reciprocal_rank_fusion,
     )
 
-    all_PDFs_chunks, pages = await get_full_text_and_all_PDFs_chunks(
+    all_PDFs_chunks, full_text_as_array = await get_full_text_and_all_PDFs_chunks(
         listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"], isBubble
     )
 
-    chunks_passados, is_contextualized_chunk = await ...(
-        serializer, summarizer.contextual_retriever, pages, all_PDFs_chunks
-    )
+    is_contextualized_chunk = serializer["should_have_contextual_chunks"]
+
+    if is_contextualized_chunk:
+        response_auxiliar_summary = await get_response_from_auxiliar_contextual_prompt(
+            full_text_as_array
+        )
+
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            all_PDFs_chunks, response_auxiliar_summary
+        )
+        chunks_processados = contextualized_chunks
+    else:
+        chunks_processados = all_PDFs_chunks
 
     # Create enhanced vector store and BM25 index
     vector_store, bm25, chunk_ids = (
         summarizer.vector_store.create_enhanced_vector_store(
-            chunks_passados, is_contextualized_chunk
+            chunks_processados, is_contextualized_chunk
         )
     )
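With these pieces in place, the orchestration in get_llm_summary_answer_by_cursor_complete becomes linear: build the RetrievalConfig, instantiate ContextualRetriever from the serializer's claude_context_model, load and split every PDF through the utils helper, and only when should_have_contextual_chunks is set run the auxiliary-summary prompt plus batch contextualization; chunks_processados and the is_contextualized_chunk flag then feed create_enhanced_vector_store.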
    	
tests/gerar_relatorio_modelo_usuario/test_contextual_retriever.py  ADDED

@@ -0,0 +1,2 @@
+class TestContextualRetriever:
+    pass