# vella-backend/_utils/resumo_simples_cursor.py
import os
import uuid
import json
from dataclasses import dataclass
from typing import List, Dict, Optional

from setup.easy_imports import (
    HuggingFaceEmbeddings,
    PyPDFLoader,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
    RecursiveCharacterTextSplitter,
)
from langchain_huggingface import HuggingFaceEndpoint
from setup.environment import default_model

# LangSmith tracing configuration. LANGCHAIN_API_KEY only needs to be present
# in the environment; LangSmith reads it directly.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"
@dataclass
class DocumentChunk:
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int
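
# Note: start_char/end_char are offsets into the concatenated text of all
# pages (char_count accumulates across pages in load_and_split_document),
# so a chunk's span can be located in the full document, not just its page.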
class DocumentSummarizer:
    def __init__(
        self, openai_api_key: str, model, embedding, chunk_config, system_prompt
    ):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing
    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load PDF and split into chunks with metadata"""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0

        for page in pages:
            text = page.page_content
            # Split the page content
            page_chunks = self.text_splitter.split_text(text)

            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # Chunks produced by the splitter are substrings of the page
                # text, so find() locates the chunk's offset within the page.
                start_char = text.find(chunk)
                end_char = start_char + len(chunk)

                doc_chunk = DocumentChunk(
                    content=chunk,
                    # 1-based page numbering; default to 0 if the loader
                    # provides no page metadata.
                    page_number=page.metadata.get("page", 0) + 1,
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }

            char_count += len(text)

        return chunks
    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create vector store with metadata"""
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store
    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations, returning structured JSON data"""
        # Retrieve relevant chunks with metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)

        # Prepare context and track sources
        contexts = []
        sources = []

        for doc, score in relevant_docs:
            chunk_id = doc.metadata["chunk_id"]
            context = doc.page_content
            contexts.append(context)
            sources.append(
                {
                    "content": context,
                    "page": doc.metadata["page"],
                    "chunk_id": chunk_id,
                    "relevance_score": score,
                }
            )

        prompt = PromptTemplate(
            template=self.system_prompt, input_variables=["context"]
        )

        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            )

        response = llm.invoke(prompt.format(context="\n\n".join(contexts)))
        # ChatOpenAI returns a message object with .content, while
        # HuggingFaceEndpoint returns a plain string; normalize to a string.
        if hasattr(response, "content"):
            response = response.content

        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        # Create structured output
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Associate each summary with the most relevant source
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )
        return structured_output
    def get_source_context(self, chunk_id: str, window: int = 100) -> Optional[Dict]:
        """Get extended context around a specific chunk (window is currently unused)"""
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
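
# Minimal usage sketch for DocumentSummarizer (hypothetical values): the
# embedding model name, chunk sizes, and prompt below are illustrative
# placeholders, not project defaults. The prompt must contain {context},
# since PromptTemplate above is built with input_variables=["context"].
#
#     summarizer = DocumentSummarizer(
#         openai_api_key=os.environ.get("OPENAI_API_KEY"),
#         model=default_model,
#         embedding="sentence-transformers/all-MiniLM-L6-v2",
#         chunk_config={"size": 1000, "overlap": 200},
#         system_prompt="Summarize the following context:\n\n{context}",
#     )
#     chunks = summarizer.load_and_split_document("./example.pdf")
#     store = summarizer.create_vector_store(chunks)
#     summaries = summarizer.generate_summary_with_sources(store)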
def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []

    # Initialize summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={
            "size": serializer["chunk_size"],
            "overlap": serializer["chunk_overlap"],
        },
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )

    # Load and process each document, pooling all chunks into one vector store
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks.extend(chunks)
    vector_store = summarizer.create_vector_store(allPdfsChunks)

    # Generate structured summary
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)

    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)
    return structured_summaries

# If you need to send this to a frontend, you can just return
# structured_summaries. It will be in the format:
# [
#     {
#         "content": "Summary point 1...",
#         "source": {
#             "page": 1,
#             "text": "Source text...",
#             "relevance_score": 0.95
#         }
#     },
#     ...
# ]
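
# A hedged sketch of the serializer payload get_llm_summary_answer_by_cursor
# expects: only the keys are taken from the code above; the values are
# illustrative guesses, not confirmed defaults.
#
#     serializer = {
#         "model": default_model,
#         "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "system_prompt": "Summarize the following context:\n\n{context}",
#     }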
if __name__ == "__main__":
    # Requires a serializer dict and a list of PDF paths, e.g.:
    # get_llm_summary_answer_by_cursor(serializer, ["./example.pdf"])
    pass