import os
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from groq import Groq
# Download the punkt resources at runtime (in case they weren't downloaded during build);
# newer NLTK releases also need 'punkt_tab' for sent_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
# Read the Groq API key from the environment rather than hardcoding it in the source
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Initialize Groq Client
client = Groq(api_key=GROQ_API_KEY)
# Test the client
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Test query to verify Groq API"}],
    model="llama3-8b-8192",
)
print(response.choices[0].message.content)
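# Note: Streamlit re-executes this script on every interaction, so this test call
# hits the Groq API on every rerun; it can be removed once the setup is verified.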
# Load Sentence Transformer Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS Index
dimension = 384  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)
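# IndexFlatL2 performs exact (brute-force) L2-distance search; the dimension must
# match the embedding size of the chosen model (384 for all-MiniLM-L6-v2).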
# Function to Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ""
    return text
# Function to Chunk and Tokenize Text
def chunk_and_tokenize(text):
    sentences = sent_tokenize(text)
    # Group every 5 consecutive sentences into one chunk
    chunks = [' '.join(sentences[i:i+5]) for i in range(0, len(sentences), 5)]
    return chunks
# Function to Create Embeddings
def create_embeddings(chunks):
    embeddings = model.encode(chunks)
    return embeddings
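# model.encode returns a float32 NumPy array of shape (num_chunks, 384) by default,
# which faiss.IndexFlatL2.add() can consume directly.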
# Function to Query Groq
def query_groq(prompt):
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# Streamlit Frontend
st.title("RAG-based PDF Query App")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    st.write("Extracted Text:")
    st.write(text[:500])  # Display first 500 characters

    chunks = chunk_and_tokenize(text)
    st.write(f"Text divided into {len(chunks)} chunks.")

    embeddings = create_embeddings(chunks)
    index.add(embeddings)
    st.write("Embeddings created and stored in the FAISS index.")
    query = st.text_input("Enter your query:")

    if query:
        # Find the most relevant chunk
        query_embedding = model.encode([query])
        _, indices = index.search(query_embedding, 1)
        relevant_chunk = chunks[indices[0][0]]

        # Query Groq, passing the retrieved chunk as context along with the question
        prompt = (
            "Answer the question using the context below.\n\n"
            f"Context:\n{relevant_chunk}\n\n"
            f"Question: {query}"
        )
        response = query_groq(prompt)
        st.write("Response from Groq:")
        st.write(response)
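
# To run locally (the script name here is an assumption): streamlit run app.py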