import os

import chromadb
import gradio as gr
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from llama_index.core.schema import TextNode
from PyPDF2 import PdfReader

# SECURITY: the original file hard-coded a live Google API key on this line.
# Never commit secrets to source control -- supply the key via the environment
# instead (e.g. `export GOOGLE_API_KEY=...`) and revoke the leaked key.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running.")

# ChromaDB client and the collection holding the embedded document chunks.
# get_or_create_collection avoids the error create_collection raises when the
# collection already exists (e.g. on a Gradio hot-reload in the same process).
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    PyPDF2's ``extract_text`` returns ``None`` for pages with no extractable
    text (e.g. scanned images); each result is coerced to "" so the join
    never raises ``TypeError``.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text, max_length=2500):
    """Split *text* into consecutive chunks of at most *max_length* characters."""
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]


# Embedding model shared by document ingestion and query embedding.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


def process_documents(pdf_files):
    """Extract, chunk, embed and store each uploaded PDF in ChromaDB.

    Args:
        pdf_files: file objects from the Gradio ``File`` component
            (each has a ``.name`` path attribute).

    Returns:
        A human-readable status message for the UI textbox.
    """
    if not pdf_files:
        return "No files were uploaded."

    for pdf_file in pdf_files:
        pdf_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(pdf_text)
        if not chunks:
            # Nothing extractable (e.g. image-only PDF): skip instead of
            # sending an empty batch to the embedding API.
            continue

        filename = os.path.basename(pdf_file.name)
        nodes = [
            TextNode(
                text=chunk,
                metadata={
                    "filename": filename,
                    "chunk_index": i,
                    "length": len(chunk),
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        # One batched embedding call per file is far cheaper than one call
        # per chunk.
        embeddings_batch = embeddings.embed_documents(chunks)

        for i, node in enumerate(nodes):
            node.embedding = embeddings_batch[i]
            chroma_collection.add(
                documents=[node.text],
                embeddings=[node.embedding],
                metadatas=[node.metadata],
                # NOTE(review): ids collide if two uploads share a filename --
                # the newer chunks overwrite the older ones. Confirm intended.
                ids=[f"{filename}_{i}"],
            )

    return "Files have been successfully processed and embedded!"
def query_documents(user_query):
    """Embed *user_query* and return the top-3 matching chunks from ChromaDB.

    Args:
        user_query: free-text question typed into the Gradio textbox.

    Returns:
        A formatted string listing each hit's source filename, chunk index
        and text, or an explanatory notice when the query is blank or the
        store has no matching documents.
    """
    if not user_query or not user_query.strip():
        return "Please enter a query."

    query_embedding = embeddings.embed_query(user_query)

    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3,  # return the top 3 most relevant chunks
    )

    # Chroma returns one inner list per query embedding; guard against an
    # empty store so we don't format an empty answer (or index into nothing).
    documents = results.get("documents") or [[]]
    metadatas = results.get("metadatas") or [[]]
    if not documents[0]:
        return "No matching documents found. Upload and process PDFs first."

    # join() instead of repeated += keeps the build linear.
    return "".join(
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(documents[0], metadatas[0])
    )


# Gradio interface combining document upload and query features.
with gr.Blocks() as demo:
    pdf_input = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    process_btn = gr.Button("Process PDFs")
    process_output = gr.Textbox(label="wait before success message for the document process")

    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    query_btn = gr.Button("Query Documents")
    query_output = gr.Textbox(label="retrieved documents")

    process_btn.click(process_documents, inputs=[pdf_input], outputs=[process_output])
    query_btn.click(query_documents, inputs=[query_input], outputs=[query_output])

demo.launch()