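"""Summarize PDF documents with source citations using LangChain.

Loads one or more PDFs, splits them into chunks with page and character
offsets, embeds the chunks into a Chroma vector store, and generates a
structured summary in which each point cites the page, source text, and
relevance score of its most relevant chunk.
"""
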
import os
from typing import List, Dict, Optional
from setup.easy_imports import (
    HuggingFaceEmbeddings,
    PyPDFLoader,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
    RecursiveCharacterTextSplitter,
)
from dataclasses import dataclass
import uuid
import json
from langchain_huggingface import HuggingFaceEndpoint
from setup.environment import default_model
from setup.environment import default_model

# Enable LangSmith tracing; LANGCHAIN_API_KEY must already be set in the
# environment for traces to be recorded.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


@dataclass
class DocumentChunk:
    content: str  # raw chunk text
    page_number: int  # 1-based page the chunk came from
    chunk_id: str  # UUID used to look the chunk's metadata back up
    start_char: int  # offset within the concatenated document text
    end_char: int  # exclusive end offset within the concatenated document text


class DocumentSummarizer:

    def __init__(
        self, openai_api_key: str, model, embedding, chunk_config, system_prompt
    ):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing

    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load PDF and split into chunks with metadata"""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0

        for page in pages:
            text = page.page_content
            # Split the page content
            page_chunks = self.text_splitter.split_text(text)

            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # find() returns -1 when the splitter has altered whitespace;
                # clamp to 0 so offsets stay non-negative.
                start_char = max(text.find(chunk), 0)
                end_char = start_char + len(chunk)

                doc_chunk = DocumentChunk(
                    content=chunk,
                    # Default to page 0 if the loader set no page metadata,
                    # then shift to 1-based page numbering.
                    page_number=page.metadata.get("page", 0) + 1,
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }

            char_count += len(text)

        return chunks

    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create vector store with metadata"""
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations, returning structured JSON data"""
        # Retrieve relevant chunks with metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)

        # Prepare context and track sources
        contexts = []
        sources = []

        for doc, score in relevant_docs:
            chunk_id = doc.metadata["chunk_id"]
            context = doc.page_content
            contexts.append(context)

            sources.append(
                {
                    "content": context,
                    "page": doc.metadata["page"],
                    "chunk_id": chunk_id,
                    "relevance_score": score,
                }
            )

        prompt = PromptTemplate(
            template=self.system_prompt, input_variables=["context"]
        )
        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            )

        raw_response = llm.invoke(prompt.format(context="\n\n".join(contexts)))
        # ChatOpenAI returns a message object while HuggingFaceEndpoint returns
        # a plain string, so only unwrap .content when it exists.
        response = (
            raw_response.content if hasattr(raw_response, "content") else raw_response
        )

        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        # Create structured output
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Pair each summary paragraph with a retrieved source; if there are
            # more paragraphs than sources, reuse the last one.
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )

        return structured_output

    def get_source_context(self, chunk_id: str, window: int = 100) -> Optional[Dict]:
        """Get location metadata for a specific chunk.

        Note: `window` is reserved for returning surrounding text and is
        currently unused.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
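
# Usage sketch (hypothetical values): driving DocumentSummarizer directly
# instead of through get_llm_summary_answer_by_cursor below. The embedding
# name, chunk sizes, prompt, and PDF path are placeholders, not project
# defaults; the prompt must contain a {context} variable.
#
#   summarizer = DocumentSummarizer(
#       openai_api_key=os.environ.get("OPENAI_API_KEY"),
#       model=default_model,
#       embedding="sentence-transformers/all-MiniLM-L6-v2",
#       chunk_config={"size": 1000, "overlap": 200},
#       system_prompt="Summarize the main points:\n\n{context}",
#   )
#   chunks = summarizer.load_and_split_document("example.pdf")
#   store = summarizer.create_vector_store(chunks)
#   summary = summarizer.generate_summary_with_sources(store)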


def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []

    # Initialize summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={
            "size": serializer["chunk_size"],
            "overlap": serializer["chunk_overlap"],
        },
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )

    # Load and split every document, accumulating chunks across PDFs
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks.extend(chunks)

    vector_store = summarizer.create_vector_store(allPdfsChunks)

    # Generate structured summary
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)

    # Log the JSON-serialized summaries; the structured list is what callers use
    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)
    return structured_summaries
    # If you need to send to frontend, you can just return structured_summaries
    # It will be in the format:
    # [
    #     {
    #         "content": "Summary point 1...",
    #         "source": {
    #             "page": 1,
    #             "text": "Source text...",
    #             "relevance_score": 0.95
    #         }
    #     },
    #     ...
    # ]


if __name__ == "__main__":
    # Minimal smoke test. The serializer values and PDF path below are
    # placeholders; substitute real settings before running.
    example_serializer = {
        "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "system_prompt": "Summarize the main points of this document:\n\n{context}",
        "model": default_model,
    }
    get_llm_summary_answer_by_cursor(example_serializer, ["path/to/document.pdf"])