AreebaAliAsghar committed on
Commit
e9edc62
·
verified ·
1 Parent(s): 8c76a79

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +77 -0
  2. apt.txt +1 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py
# RAG PDF chatbot: embed PDF text chunks, retrieve by similarity, answer.

from pypdf import PdfReader
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load embedding model once at import time (reused for documents and queries).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Global state to persist embeddings and chunks across Gradio calls.
# index: FAISS index built from the most recent upload (None until first use).
index = None
# chunks: the text chunks backing the vectors in `index`, in insertion order.
chunks = []
16
# Step 1: Extract text from uploaded PDFs
def extract_text_from_pdfs(files):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        files: iterable of uploaded file objects (with a ``.name`` path
            attribute) or plain path strings; ``None`` is treated as empty.

    Returns:
        str: all page text joined with newlines; "" when there are no files
        or no extractable text (e.g. scanned, image-only PDFs).
    """
    if not files:  # guard: Gradio passes None when nothing was uploaded
        return ""
    parts = []
    for file in files:
        # Some Gradio versions hand back plain path strings instead of
        # tempfile wrappers; accept both.
        path = getattr(file, "name", file)
        reader = PdfReader(path)
        for page in reader.pages:
            text = page.extract_text()
            if text:  # extract_text() may return None/"" for image-only pages
                parts.append(text)
    # join + trailing newline reproduces the original `text + "\n"` layout
    return "".join(p + "\n" for p in parts)
# Step 2: Chunk text
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into word-based chunks that overlap at the boundaries.

    Args:
        text: raw document text.
        chunk_size: maximum number of words per chunk.
        overlap: number of words shared between consecutive chunks; must be
            strictly smaller than chunk_size.

    Returns:
        list[str]: the chunks; empty text yields [].

    Raises:
        ValueError: if overlap >= chunk_size — the stride would be <= 0, so
            the original loop either crashed (step 0) or silently returned []
            (negative step).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
# Step 3: Embed and store chunks
def create_index(text_chunks):
    """Embed *text_chunks* and (re)build the global FAISS index.

    Side effects: overwrites the module-level ``index`` and ``chunks``.

    Args:
        text_chunks: non-empty list of strings to embed.

    Raises:
        ValueError: if text_chunks is empty — there is nothing to index and
            the embedding dimensionality could not be determined (the
            original code raised a bare IndexError here).
    """
    global index, chunks
    if not text_chunks:
        raise ValueError("text_chunks must not be empty")
    chunks = text_chunks
    # FAISS expects float32 contiguous arrays; cast explicitly rather than
    # relying on the encoder's output dtype.
    embeddings = np.asarray(model.encode(chunks), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
# Step 4: Retrieve top relevant chunks
def get_top_chunks(query, k=3):
    """Return up to *k* chunks whose embeddings are closest (L2) to the query.

    Args:
        query: the user question.
        k: maximum number of chunks to return; capped at the number of
            indexed chunks.

    Returns:
        list[str]: up to k chunks, most similar first.

    Raises:
        RuntimeError: if create_index() has not been called yet.
    """
    if index is None:
        raise RuntimeError("create_index() must be called before querying")
    query_vec = np.asarray(model.encode([query]), dtype="float32")
    # Cap k: asking FAISS for more results than stored vectors pads the
    # result with -1, which chunks[-1] would silently map to duplicates.
    k = min(k, len(chunks))
    D, I = index.search(query_vec, k)
    return [chunks[i] for i in I[0] if i >= 0]
# Step 5: Fake LLM response (replace with real API call if needed)
def call_llm(context, question):
    """Placeholder "LLM": echo the retrieved context and the question back.

    Swap this out for a real completion API call to get genuine answers.
    """
    simulated = (
        "Answer (simulated): Based on context:\n\n"
        f"{context}\n\n"
        f"Question: {question}"
    )
    return simulated
# Step 6: Gradio main function
def rag_pipeline(files, question):
    """End-to-end RAG pass: extract -> chunk -> index -> retrieve -> answer.

    Args:
        files: uploaded PDF files from the Gradio File component (may be
            None when nothing was uploaded).
        question: the user's question.

    Returns:
        str: the (simulated) answer, or a human-readable message when the
        input is unusable — instead of an unhandled traceback in the UI.
    """
    if not files:
        return "Please upload at least one PDF."
    if not question or not question.strip():
        return "Please enter a question."
    text = extract_text_from_pdfs(files)
    text_chunks = chunk_text(text)
    if not text_chunks:
        # e.g. scanned/image-only PDFs with no extractable text
        return "No readable text was found in the uploaded PDF(s)."
    create_index(text_chunks)
    top_chunks = get_top_chunks(question)
    context = "\n".join(top_chunks)
    return call_llm(context, question)
# Step 7: Gradio UI
# Components declared up front, then wired into the Interface.
pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
question_input = gr.Textbox(lines=2, label="Ask a question")

demo = gr.Interface(
    fn=rag_pipeline,
    inputs=[pdf_input, question_input],
    outputs="text",
    title="RAG PDF Chatbot",
    description="Upload PDFs and ask questions based on their content",
)

# Standard script-entry guard: launch only when executed directly.
if __name__ == "__main__":
    demo.launch()
apt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pypdf
3
+ sentence-transformers
4
+ faiss-cpu
5
+ numpy