bsenst committed
Commit e4322e9 · 1 Parent(s): dc8b376

add app and requirements

Files changed (3)
  1. .gitignore +1 -0
  2. app.py +186 -0
  3. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1 @@
+.venv
app.py ADDED
@@ -0,0 +1,186 @@
+import streamlit as st
+import os
+import zipfile
+from io import BytesIO
+from PyPDF2 import PdfReader
+from keybert import KeyBERT
+from sentence_transformers import SentenceTransformer, util
+
+# Initialize KeyBERT and the sentence-transformer model
+kw_model = KeyBERT()
+semantic_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+def main():
+    st.title("PDF Topic Grouping App")
+
+    # Step 1: Upload PDFs
+    uploaded_files = st.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
+
+    if not uploaded_files:
+        st.info("Please upload PDFs to continue.")
+        return
+
+    # Step 2: Extract text and keywords from the PDFs
+    pdf_texts = {}
+    keywords_set = set()
+
+    # Initialize a progress bar
+    progress = st.progress(0)
+    total_files = len(uploaded_files)
+    processed_files = 0
+
+    st.info("Extracting keywords from PDFs...")
+    for uploaded_file in uploaded_files:
+        pdf_name = uploaded_file.name
+        try:
+            # Read PDF content; extract_text() can return None for pages without text
+            reader = PdfReader(uploaded_file)
+            text = ""
+            for page in reader.pages:
+                text += page.extract_text() or ""
+            pdf_texts[pdf_name] = text.lower()
+
+            # Extract keywords using KeyBERT
+            extracted_keywords = kw_model.extract_keywords(text, top_n=5)
+            for kw, _ in extracted_keywords:
+                keywords_set.add(kw.lower())
+
+        except Exception as e:
+            st.error(f"Failed to process {pdf_name}: {e}")
+            continue
+
+        # Update progress
+        processed_files += 1
+        progress.progress(processed_files / total_files)
+
+    # Mark progress as complete
+    progress.progress(1.0)
+
+    # Step 3: Display extracted keywords and let the user select topics
+    st.write("Extracted Keywords:")
+    selected_keywords = st.multiselect(
+        "Select at least two keywords/topics for grouping:",
+        sorted(keywords_set),
+        default=sorted(keywords_set)[:2]
+    )
+
+    # Add a confirmation button to proceed; default the flag to False so an
+    # invalid selection cannot fall through with the flag undefined
+    proceed_with_keywords = False
+    if st.button("Confirm Keyword Selection"):
+        if len(selected_keywords) < 2:
+            st.error("Please select at least two keywords.")
+        else:
+            st.success("Keyword selection confirmed. Processing PDFs...")
+            proceed_with_keywords = True
+
+    # Ensure the process does not continue until confirmation
+    if not proceed_with_keywords:
+        st.stop()
+
+    # Initialize a progress bar for embedding precomputation
+    st.info("Precomputing embeddings for all PDFs...")
+    progress = st.progress(0)
+    total_pdfs = len(pdf_texts)
+    processed_pdfs = 0
+
+    pdf_embeddings = {}
+    for pdf_name, text in pdf_texts.items():
+        try:
+            # Compute an embedding for the full PDF text
+            pdf_embeddings[pdf_name] = semantic_model.encode(text, convert_to_tensor=True)
+        except Exception as e:
+            st.error(f"Failed to compute embedding for {pdf_name}: {e}")
+            continue
+
+        # Update progress
+        processed_pdfs += 1
+        progress.progress(processed_pdfs / total_pdfs)
+
+    # Mark progress as complete
+    progress.progress(1.0)
+
+    # Initialize a progress bar for keyword embedding precomputation
+    st.info("Precomputing embeddings for selected keywords...")
+    progress = st.progress(0)
+    total_keywords = len(selected_keywords)
+    processed_keywords = 0
+
+    keyword_embeddings = {}
+    for keyword in selected_keywords:
+        try:
+            # Compute an embedding for the keyword
+            keyword_embeddings[keyword] = semantic_model.encode(keyword, convert_to_tensor=True)
+        except Exception as e:
+            st.error(f"Failed to compute embedding for keyword '{keyword}': {e}")
+            continue
+
+        # Update progress
+        processed_keywords += 1
+        progress.progress(processed_keywords / total_keywords)
+
+    # Mark progress as complete
+    progress.progress(1.0)
+
+    # Group each PDF under its most relevant topic
+    pdf_groups = {keyword: [] for keyword in selected_keywords}
+    st.info("Assigning PDFs to the most relevant topic...")
+
+    for pdf_name, text_embedding in pdf_embeddings.items():
+        max_similarity = -1
+        best_keyword = None
+
+        # Find the most similar keyword for this PDF via cosine similarity
+        for keyword, keyword_embedding in keyword_embeddings.items():
+            similarity = util.pytorch_cos_sim(text_embedding, keyword_embedding).item()
+            if similarity > max_similarity:
+                max_similarity = similarity
+                best_keyword = keyword
+
+        # Assign the PDF to the best-matching keyword
+        if best_keyword:
+            pdf_groups[best_keyword].append(pdf_name)
+
+    # Save grouped PDFs into per-keyword folders
+    output_folder = "grouped_pdfs"
+    os.makedirs(output_folder, exist_ok=True)
+    for keyword, pdf_names in pdf_groups.items():
+        keyword_folder = os.path.join(output_folder, keyword)
+        os.makedirs(keyword_folder, exist_ok=True)
+        for pdf_name in pdf_names:
+            try:
+                matched_file = next(f for f in uploaded_files if f.name == pdf_name)
+                with open(os.path.join(keyword_folder, pdf_name), "wb") as f:
+                    f.write(matched_file.getvalue())
+            except StopIteration:
+                st.error(f"File {pdf_name} not found in uploaded files.")
+                continue
+
+    # Zip the folders
+    zip_buffer = BytesIO()
+    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+        for root, _, files in os.walk(output_folder):
+            for file in files:
+                file_path = os.path.join(root, file)
+                zip_file.write(file_path, os.path.relpath(file_path, output_folder))
+    zip_buffer.seek(0)
+
+    # Clean up the temporary folders
+    for root, dirs, files in os.walk(output_folder, topdown=False):
+        for file in files:
+            os.remove(os.path.join(root, file))
+        for dir in dirs:
+            os.rmdir(os.path.join(root, dir))
+    os.rmdir(output_folder)
+
+    # Step 4: Download the zipped file
+    st.success("PDFs processed and grouped successfully!")
+    st.download_button(
+        label="Download Grouped PDFs",
+        data=zip_buffer,
+        file_name="grouped_pdfs.zip",
+        mime="application/zip"
+    )
+
+if __name__ == "__main__":
+    main()
requirements.txt ADDED
@@ -0,0 +1,4 @@
+streamlit
+PyPDF2
+keybert
+sentence-transformers