import os
import json
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional

from setup.easy_imports import (
    HuggingFaceEmbeddings,
    PyPDFLoader,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
    RecursiveCharacterTextSplitter,
)
from langchain_huggingface import HuggingFaceEndpoint
from setup.environment import default_model
| os.environ["LANGCHAIN_TRACING_V2"] = "true" | |
| os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" | |
| os.environ.get("LANGCHAIN_API_KEY") | |
| os.environ["LANGCHAIN_PROJECT"] = "VELLA" | |


@dataclass
class DocumentChunk:
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int
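
    # Illustrative example of a populated chunk (all values invented):
    #   DocumentChunk(content="Lorem ipsum...", page_number=3,
    #                 chunk_id="9b2d...", start_char=1450, end_char=2450)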


class DocumentSummarizer:
    def __init__(
        self, openai_api_key: str, model, embedding, chunk_config, system_prompt
    ):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing

    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load PDF and split into chunks with metadata"""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0
        for page in pages:
            text = page.page_content
            # Split the page content
            page_chunks = self.text_splitter.split_text(text)
            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # The splitter may trim whitespace, so the chunk is not
                # guaranteed to appear verbatim; fall back to 0 in that case.
                start_char = max(text.find(chunk), 0)
                end_char = start_char + len(chunk)
                doc_chunk = DocumentChunk(
                    content=chunk,
                    # PyPDFLoader pages are 0-indexed; convert to 1-based numbering
                    page_number=page.metadata.get("page", 0) + 1,
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)
                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }
            char_count += len(text)
        return chunks

    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create vector store with metadata"""
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]
        # Note: without a persist_directory, this collection lives in memory
        # for the duration of the process only.
        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations, returning structured JSON data"""
        # Retrieve relevant chunks with metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
        # Prepare context and track sources
        contexts = []
        sources = []
        for doc, score in relevant_docs:
            chunk_id = doc.metadata["chunk_id"]
            context = doc.page_content
            contexts.append(context)
            sources.append(
                {
                    "content": context,
                    "page": doc.metadata["page"],
                    "chunk_id": chunk_id,
                    "relevance_score": score,
                }
            )
        prompt = PromptTemplate(
            template=self.system_prompt, input_variables=["context"]
        )
        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            )
        response = llm.invoke(prompt.format(context="\n\n".join(contexts)))
        # ChatOpenAI returns a message object while HuggingFaceEndpoint returns
        # a plain string, so only unwrap .content when it is present.
        if hasattr(response, "content"):
            response = response.content
        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
        # Create structured output
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Associate each summary with the most relevant source
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )
        return structured_output

    def get_source_context(self, chunk_id: str, window: int = 100) -> Optional[Dict]:
        """Get the stored location metadata for a specific chunk.

        The window parameter is reserved for expanding the surrounding
        context and is not used yet.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
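

# A minimal usage sketch of DocumentSummarizer on its own, outside the
# serializer-driven flow below. The embedding model name, system prompt, and
# PDF path are placeholder values, not ones taken from this project.
def example_direct_usage():
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        model=default_model,
        embedding="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
        chunk_config={"size": 1000, "overlap": 200},
        system_prompt="Summarize the following document:\n\n{context}",
    )
    chunks = summarizer.load_and_split_document("example.pdf")  # placeholder path
    vector_store = summarizer.create_vector_store(chunks)
    return summarizer.generate_summary_with_sources(vector_store)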


def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []
    # Initialize summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={
            "size": serializer["chunk_size"],
            "overlap": serializer["chunk_overlap"],
        },
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )
    # Load every document and accumulate the chunks into one list
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks.extend(chunks)
    vector_store = summarizer.create_vector_store(allPdfsChunks)
    # Generate structured summary
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
    # Print the JSON form for inspection, then return the structured data
    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)
    return structured_summaries
    # If you need to send this to a frontend, you can just return
    # structured_summaries. It will be in the format:
    # [
    #     {
    #         "content": "Summary point 1...",
    #         "source": {
    #             "page": 1,
    #             "text": "Source text...",
    #             "relevance_score": 0.95
    #         }
    #     },
    #     ...
    # ]
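
    # A hypothetical sketch of returning this payload from a web handler
    # (assumes Django REST Framework, which this module does not import):
    #
    #     from rest_framework.response import Response
    #     return Response(get_llm_summary_answer_by_cursor(request.data, pdf_paths))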


if __name__ == "__main__":
    # The original call passed no arguments, which would raise a TypeError;
    # run the placeholder sketch above instead.
    print(json.dumps(example_direct_usage()))