Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -31,12 +31,18 @@ class PDFChatbot:
|
|
31 |
pdf_directory = "data"
|
32 |
|
33 |
# Duyệt qua các file trong thư mục và đọc từng file PDF
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
chunks = []
|
41 |
current_chunk = []
|
42 |
current_length = 0
|
|
|
31 |
pdf_directory = "data"
|
32 |
|
33 |
# Duyệt qua các file trong thư mục và đọc từng file PDF
|
34 |
+
for filename in os.listdir(pdf_directory):
|
35 |
+
if filename.lower().endswith(".pdf"):
|
36 |
+
pdf_path = os.path.join(pdf_directory, filename)
|
37 |
+
with open(pdf_path, "rb") as pdf_file:
|
38 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
39 |
+
text = ""
|
40 |
+
for page_num in range(len(pdf_reader.pages)):
|
41 |
+
page = pdf_reader.pages[page_num]
|
42 |
+
text += page.extract_text() + "\n"
|
43 |
+
|
44 |
+
# Optional: split into words
|
45 |
+
words = text.split()
|
46 |
chunks = []
|
47 |
current_chunk = []
|
48 |
current_length = 0
|