import os
import uuid
import json
from dataclasses import dataclass
from typing import List, Dict, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

from setup.environment import default_model

# Enable LangSmith tracing; LANGCHAIN_API_KEY must already be set in the environment.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"
@dataclass
class DocumentChunk:
    """One chunk of a source PDF, with enough metadata to trace it back."""
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int
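# e.g. (hypothetical values):
#   DocumentChunk(content="Lorem ipsum...", page_number=1,
#                 chunk_id=str(uuid.uuid4()), start_char=0, end_char=512)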
class DocumentSummarizer:
    def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"],
            chunk_overlap=chunk_config["overlap"],
        )
        self.chunk_metadata = {}  # chunk_id -> source metadata, for tracing
    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load a PDF and split it into chunks, keeping page/character metadata."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0  # running character offset across the whole document
        for page in pages:
            text = page.page_content
            # Split the page content
            page_chunks = self.text_splitter.split_text(text)
            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # find() can return -1 if the splitter trimmed whitespace; clamp to 0
                start_char = max(text.find(chunk), 0)
                end_char = start_char + len(chunk)
                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page.metadata.get('page', 0) + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)
                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    'page': doc_chunk.page_number,
                    'start_char': doc_chunk.start_char,
                    'end_char': doc_chunk.end_char,
                }
            char_count += len(text)
        return chunks
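    # Example (sketch, hypothetical file): for a PDF whose first page yields two
    # chunks at chunk_size=1000, this returns DocumentChunk objects like
    #   DocumentChunk(content="...", page_number=1, chunk_id="<uuid>",
    #                 start_char=0, end_char=1000)
    # where start_char/end_char are offsets into the concatenated document text.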
    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create a Chroma vector store whose entries carry the chunk metadata."""
        texts = [chunk.content for chunk in chunks]
        metadatas = [{
            'chunk_id': chunk.chunk_id,
            'page': chunk.page_number,
            'start_char': chunk.start_char,
            'end_char': chunk.end_char,
        } for chunk in chunks]
        return Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings,
        )
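    # Sanity-check sketch (the query string is hypothetical): the store can be
    # searched directly, and each hit carries the metadata recorded above:
    #   docs = store.similarity_search("termination clause", k=3)
    #   docs[0].metadata  # -> {'chunk_id': ..., 'page': ..., 'start_char': ..., 'end_char': ...}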
    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate a summary with source citations, returning structured JSON data."""
        # Retrieve the most relevant chunks together with their metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
        # Prepare the context and track sources
        contexts = []
        sources = []
        for doc, score in relevant_docs:
            contexts.append(doc.page_content)
            sources.append({
                'content': doc.page_content,
                'page': doc.metadata['page'],
                'chunk_id': doc.metadata['chunk_id'],
                'relevance_score': score,
            })
        prompt = PromptTemplate(
            template=self.system_prompt,
            input_variables=["context"],
        )
        # Route to OpenAI for the default model, otherwise to a Hugging Face endpoint
        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0,
                model_name="gpt-4o-mini",
                api_key=self.openai_api_key,
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            )
        response = llm.predict(prompt.format(context="\n\n".join(contexts)))
        # Treat each paragraph of the response as one summary point
        summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
        # Pair each summary point with the most relevant retrieved source
        structured_output = []
        for idx, summary in enumerate(summaries):
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append({
                "content": summary,
                "source": {
                    "page": source['page'],
                    "text": source['content'][:200] + "...",
                    "relevance_score": source['relevance_score'],
                },
            })
        return structured_output
    def get_source_context(self, chunk_id: str, window: int = 100) -> Optional[Dict]:
        """Return the stored source metadata for a specific chunk.

        `window` is accepted for a future extended-context lookup but is not
        used yet; only the recorded page and character span are returned.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            'page': metadata['page'],
            'start_char': metadata['start_char'],
            'end_char': metadata['end_char'],
        }
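# A minimal end-to-end sketch of the class above (hypothetical helper: the
# embedding model, prompt, and PDF path are placeholder values, not settings
# taken from this project):
def summarize_single_pdf_example(pdf_path: str) -> List[Dict]:
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        model=default_model,
        embedding="sentence-transformers/all-MiniLM-L6-v2",
        chunk_config={"size": 1000, "overlap": 200},
        system_prompt="Summarize the following context:\n\n{context}",
    )
    chunks = summarizer.load_and_split_document(pdf_path)
    vector_store = summarizer.create_vector_store(chunks)
    return summarizer.generate_summary_with_sources(vector_store)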
def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []
    # Initialize the summarizer from the serializer settings
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )
    # Load and chunk every PDF, then build a single vector store over all chunks
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks.extend(chunks)
    vector_store = summarizer.create_vector_store(allPdfsChunks)
    # Generate the structured summary and print it as JSON
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
    print("\n\n")
    print(json.dumps(structured_summaries))
    return structured_summaries
# If you need to send this to the frontend, just return structured_summaries.
# It will be in the format:
# [
#     {
#         "content": "Summary point 1...",
#         "source": {
#             "page": 1,
#             "text": "Source text...",
#             "relevance_score": 0.95
#         }
#     },
#     ...
# ]
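# A minimal sketch of the inputs the entry point expects (hypothetical values:
# the embedding model, chunk sizes, prompt, and PDF path below are placeholders,
# not settings from this project):
EXAMPLE_SERIALIZER = {
    "model": default_model,
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "system_prompt": "Summarize the following context:\n\n{context}",
}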
if __name__ == "__main__":
    # The entry point needs a serializer dict and a list of PDF paths.
    get_llm_summary_answer_by_cursor(EXAMPLE_SERIALIZER, ["example.pdf"])