# RAG / app.py
# Author: EngrNarmeen (commit 0f7414c, "Update app.py")
import os
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from groq import Groq
# Download the punkt sentence tokenizer at runtime (in case it wasn't
# downloaded during build). quiet=True keeps repeated Streamlit reruns
# from printing download progress to the console on every execution.
nltk.download('punkt', quiet=True)
# Read the Groq API key from the environment instead of hard-coding it.
# NOTE(security): the original file embedded a live API key in source —
# a committed secret must be rotated and supplied via a secret manager
# (e.g. Hugging Face Spaces secrets / environment variables).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "configure it in your deployment secrets."
    )
# Initialize the Groq client; it is reused by query_groq() for every request.
# The original also fired a throwaway "test" completion at import time —
# removed, since Streamlit re-executes this module on every interaction,
# which would spend an API call and print to stdout on each rerun.
client = Groq(api_key=GROQ_API_KEY)
# Load Sentence Transformer Model — shared encoder used for both document
# chunks (create_embeddings) and user queries (the frontend search below).
model = SentenceTransformer('all-MiniLM-L6-v2')
# Initialize FAISS Index — flat L2 (exact nearest-neighbor) index; vectors
# are added per uploaded document in the Streamlit section below.
dimension = 384 # Dimension of the embeddings produced by all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)
# Function to Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract all text from an uploaded PDF file.

    Args:
        pdf_file: A file-like object (or path) containing PDF data,
            as accepted by ``PyPDF2.PdfReader``.

    Returns:
        The concatenated text of all pages as a single string. Pages with
        no extractable text (e.g. scanned images) contribute nothing
        instead of raising.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() may return None for image-only pages; the original
    # `text += page.extract_text()` would raise TypeError in that case.
    # Coalesce to "" and join once instead of quadratic `+=` concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to Chunk and Tokenize Text
def chunk_and_tokenize(text, sentences_per_chunk=5):
    """Split text into chunks of consecutive sentences.

    Args:
        text: The raw document text to split.
        sentences_per_chunk: How many sentences to group into one chunk.
            Defaults to 5, matching the original hard-coded behavior.

    Returns:
        A list of chunk strings (each chunk is ``sentences_per_chunk``
        sentences joined by spaces); an empty list for empty input.
    """
    sentences = sent_tokenize(text)
    return [
        ' '.join(sentences[i:i + sentences_per_chunk])
        for i in range(0, len(sentences), sentences_per_chunk)
    ]
# Function to Create Embeddings
def create_embeddings(chunks):
    """Encode text chunks into dense vectors.

    Args:
        chunks: A list of text strings to embed.

    Returns:
        The embeddings produced by the module-level SentenceTransformer
        (dimension matching the FAISS index's 384).
    """
    # Delegate directly to the shared encoder instance.
    return model.encode(chunks)
# Function to Query Groq
def query_groq(prompt):
    """Send a single-turn prompt to the Groq chat API and return the reply.

    Args:
        prompt: The user message to send as a single chat turn.

    Returns:
        The assistant's response content as a string.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content
# --- Streamlit frontend ---
st.title("RAG-based PDF Query App")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    st.write("Extracted Text:")
    st.write(text[:500])  # preview only the first 500 characters

    chunks = chunk_and_tokenize(text)
    if not chunks:
        # e.g. an image-only/scanned PDF with no extractable sentences;
        # without this guard, encoding an empty list and adding it to the
        # index serves no purpose and searching it returns garbage.
        st.warning("No text could be extracted from this PDF.")
    else:
        st.write(f"Text divided into {len(chunks)} chunks.")

        embeddings = create_embeddings(chunks)
        index.add(embeddings)
        st.write("Embeddings created and stored in FAISS database.")

        query = st.text_input("Enter your query:")
        if query:
            # Retrieve the single most relevant chunk for the query.
            query_embedding = model.encode([query])
            _, indices = index.search(query_embedding, 1)
            relevant_chunk = chunks[indices[0][0]]

            # BUG FIX: the original passed only the retrieved chunk to the
            # LLM and never included the user's question, so the model had
            # nothing to answer. Build a proper RAG prompt with both the
            # context and the query.
            prompt = (
                "Answer the question using only the context below.\n\n"
                f"Context:\n{relevant_chunk}\n\n"
                f"Question: {query}"
            )
            response = query_groq(prompt)
            st.write("Response from Groq:")
            st.write(response)