"""Streamlit RAG app: upload a PDF, index its text in FAISS, answer queries via Groq."""

import os

import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from groq import Groq

# Download the punkt tokenizer data at runtime (no-op if already present).
nltk.download('punkt', quiet=True)

# SECURITY FIX: read the API key from the environment instead of hard-coding it.
# The previously committed key is leaked and must be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Initialize Groq client once at module load.
client = Groq(api_key=GROQ_API_KEY)

# NOTE(review): the original fired a billed "test" chat completion and print()
# at import time — in Streamlit that runs on every rerun. Removed as debug residue.

# Sentence-embedding model; all-MiniLM-L6-v2 produces 384-dim vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

dimension = 384  # embedding size of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    extract_text() can return None for image-only pages, so each page's
    result is coerced to "" before joining (the original crashed on None).
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def chunk_and_tokenize(text):
    """Split *text* into chunks of 5 consecutive sentences each."""
    sentences = sent_tokenize(text)
    return [' '.join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]


def create_embeddings(chunks):
    """Encode text chunks into dense vectors with the sentence transformer."""
    return model.encode(chunks)


def query_groq(prompt):
    """Send *prompt* to the Groq chat API and return the reply text."""
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content


# ---- Streamlit frontend ----
st.title("RAG-based PDF Query App")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    st.write("Extracted Text:")
    st.write(text[:500])  # Display first 500 characters as a preview

    chunks = chunk_and_tokenize(text)
    st.write(f"Text divided into {len(chunks)} chunks.")

    embeddings = create_embeddings(chunks)
    index.add(embeddings)
    st.write("Embeddings created and stored in FAISS database.")

    query = st.text_input("Enter your query:")
    if query:
        # Retrieve the single most similar chunk to the query.
        query_embedding = model.encode([query])
        _, indices = index.search(query_embedding, 1)
        relevant_chunk = chunks[indices[0][0]]
        # BUG FIX: the original sent only the retrieved chunk to the LLM and
        # never the user's question. Build a proper RAG prompt instead.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{relevant_chunk}\n\n"
            f"Question: {query}"
        )
        response = query_groq(prompt)
        st.write("Response from Groq:")
        st.write(response)