bsenst committed
Commit 1971dc9 · 1 Parent(s): e4322e9

improve workflow

Files changed (1)
  1. app.py +68 -81
app.py CHANGED
```diff
@@ -20,141 +20,128 @@ def main():
         st.info("Please upload PDFs to continue.")
         return
 
-    # Extract text and keywords from PDFs
-    pdf_texts = {}
-    keywords_set = set()
-
-    # Initialize a progress bar
-    progress = st.progress(0)
-    total_files = len(uploaded_files)
-    processed_files = 0
-
-    st.info("Extracting keywords from PDFs...")
-    for uploaded_file in uploaded_files:
-        pdf_name = uploaded_file.name
-        try:
-            # Read PDF content
-            reader = PdfReader(uploaded_file)
-            text = ""
-            for page in reader.pages:
-                text += page.extract_text()
-            pdf_texts[pdf_name] = text.lower()
-
-            # Extract keywords using KeyBERT
-            extracted_keywords = kw_model.extract_keywords(text, top_n=5)
-            for kw, _ in extracted_keywords:
-                keywords_set.add(kw.lower())
-
-        except Exception as e:
-            st.error(f"Failed to process {pdf_name}: {e}")
-            continue
-
-        # Update progress
-        processed_files += 1
-        progress.progress(processed_files / total_files)
-
-    # Mark progress as complete
-    progress.progress(1.0)
+    # Check if uploaded files have changed
+    uploaded_file_names = [f.name for f in uploaded_files]
+    if "uploaded_files" not in st.session_state or st.session_state.uploaded_files != uploaded_file_names:
+        st.session_state.uploaded_files = uploaded_file_names
+        st.session_state.keywords_set = None
+
+    # Extract text and keywords from PDFs if not already done
+    if st.session_state.keywords_set is None:
+        st.info("Extracting keywords from PDFs...")
+        pdf_texts = {}
+        keywords_set = set()
+
+        progress1 = st.progress(0)
+        total_files = len(uploaded_files)
+
+        for i, uploaded_file in enumerate(uploaded_files):
+            pdf_name = uploaded_file.name
+            try:
+                reader = PdfReader(uploaded_file)
+                text = "".join(page.extract_text() for page in reader.pages)
+                pdf_texts[pdf_name] = text.lower()
+
+                extracted_keywords = kw_model.extract_keywords(text, top_n=5)
+                for kw, _ in extracted_keywords:
+                    keywords_set.add(kw.lower())
+            except Exception as e:
+                st.error(f"Failed to process {pdf_name}: {e}")
+            finally:
+                progress1.progress((i + 1) / total_files)
+
+        if not pdf_texts:
+            st.error("No PDFs could be processed.")
+            return
+
+        progress1.progress(1.0)
+        st.session_state.pdf_texts = pdf_texts
+        st.session_state.keywords_set = keywords_set
 
     # Display extracted keywords and let the user select topics
-    st.write("Extracted Keywords:")
     selected_keywords = st.multiselect(
         "Select at least two keywords/topics for grouping:",
-        list(keywords_set),
-        default=list(keywords_set)[:2]
+        list(st.session_state.keywords_set),
+        default=list(st.session_state.keywords_set)[:2]
     )
 
-    # Add a confirmation button to proceed
     if st.button("Confirm Keyword Selection"):
        if len(selected_keywords) < 2:
-            st.error("Please select at least two keywords.")
+            st.error("Please select at least two keywords to continue.")
         else:
-            st.success("Keyword selection confirmed. Processing PDFs...")
-            proceed_with_keywords = True
+            st.session_state.selected_keywords = selected_keywords
+            st.session_state.keywords_confirmed = True
     else:
-        proceed_with_keywords = False
+        st.session_state.keywords_confirmed = False
 
-    # Ensure the process does not continue until confirmation
-    if not proceed_with_keywords:
+    if not st.session_state.get("keywords_confirmed", False):
         st.stop()
 
-    # Initialize a progress bar for embedding precomputation
-    st.info("Precomputing embeddings for all PDFs...")
-    progress = st.progress(0)
-    total_pdfs = len(pdf_texts)
-    processed_pdfs = 0
+    st.success("Keyword selection confirmed. Processing PDFs...")
 
+    # Precompute embeddings for PDFs
+    st.info("Precomputing embeddings for PDFs...")
+    progress2 = st.progress(0)
     pdf_embeddings = {}
-    for pdf_name, text in pdf_texts.items():
+    pdf_texts = st.session_state.pdf_texts
+    total_pdfs = len(pdf_texts)
+
+    for i, (pdf_name, text) in enumerate(pdf_texts.items()):
         try:
-            # Compute embedding for the PDF
             pdf_embeddings[pdf_name] = semantic_model.encode(text, convert_to_tensor=True)
         except Exception as e:
             st.error(f"Failed to compute embedding for {pdf_name}: {e}")
-            continue
+        finally:
+            progress2.progress((i + 1) / total_pdfs)
 
-        # Update progress
-        processed_pdfs += 1
-        progress.progress(processed_pdfs / total_pdfs)
+    progress2.progress(1.0)
 
-    # Mark progress as complete
-    progress.progress(1.0)
-
-    # Initialize a progress bar for keyword embedding precomputation
+    # Precompute embeddings for selected keywords
     st.info("Precomputing embeddings for selected keywords...")
-    progress = st.progress(0)
+    progress3 = st.progress(0)
+    selected_keywords = st.session_state.selected_keywords
+    keyword_embeddings = {}
     total_keywords = len(selected_keywords)
-    processed_keywords = 0
 
-    keyword_embeddings = {}
-    for keyword in selected_keywords:
+    for i, keyword in enumerate(selected_keywords):
         try:
-            # Compute embedding for the keyword
             keyword_embeddings[keyword] = semantic_model.encode(keyword, convert_to_tensor=True)
         except Exception as e:
             st.error(f"Failed to compute embedding for keyword '{keyword}': {e}")
-            continue
-
-        # Update progress
-        processed_keywords += 1
-        progress.progress(processed_keywords / total_keywords)
+        finally:
+            progress3.progress((i + 1) / total_keywords)
 
-    # Mark progress as complete
-    progress.progress(1.0)
+    progress3.progress(1.0)
 
     # Group PDFs by the most relevant topic
+    st.info("Assigning PDFs to the most relevant topics...")
     pdf_groups = {keyword: [] for keyword in selected_keywords}
-    st.info("Assigning PDFs to the most relevant topic...")
 
     for pdf_name, text_embedding in pdf_embeddings.items():
-        max_similarity = -1
         best_keyword = None
+        max_similarity = -1
 
-        # Find the most similar keyword for this PDF
         for keyword, keyword_embedding in keyword_embeddings.items():
             similarity = util.pytorch_cos_sim(text_embedding, keyword_embedding).item()
             if similarity > max_similarity:
                 max_similarity = similarity
                 best_keyword = keyword
 
-        # Assign the PDF to the best matching keyword
         if best_keyword:
             pdf_groups[best_keyword].append(pdf_name)
 
     # Save grouped PDFs into folders
     output_folder = "grouped_pdfs"
     os.makedirs(output_folder, exist_ok=True)
+
     for keyword, pdf_names in pdf_groups.items():
         keyword_folder = os.path.join(output_folder, keyword)
         os.makedirs(keyword_folder, exist_ok=True)
         for pdf_name in pdf_names:
-            try:
-                matched_file = next(f for f in uploaded_files if f.name == pdf_name)
+            matched_file = next((f for f in uploaded_files if f.name == pdf_name), None)
+            if matched_file:
                 with open(os.path.join(keyword_folder, pdf_name), "wb") as f:
                     f.write(matched_file.getvalue())
-            except StopIteration:
-                st.error(f"File {pdf_name} not found in uploaded files.")
-                continue
 
     # Zip the folders
     zip_buffer = BytesIO()
```
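
The substance of this commit is a caching and gating workflow: keyword extraction now runs only when the set of uploaded files changes, its results live in `st.session_state`, and the embedding/grouping work is held back until the user confirms a keyword selection. Below is a minimal, self-contained sketch of that pattern, not the app itself; `extract_keywords_stub` is a hypothetical stand-in for the PdfReader/KeyBERT loop in `app.py`.

```python
import streamlit as st


def extract_keywords_stub(files):
    # Hypothetical placeholder for the real PdfReader + KeyBERT extraction.
    return {part.lower() for f in files for part in f.name.rsplit(".", 1)[0].split("_") if part}


uploaded_files = st.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
if not uploaded_files:
    st.info("Please upload PDFs to continue.")
    st.stop()

# Invalidate the cached keywords only when the set of uploaded files changes.
uploaded_file_names = [f.name for f in uploaded_files]
if st.session_state.get("uploaded_files") != uploaded_file_names:
    st.session_state.uploaded_files = uploaded_file_names
    st.session_state.keywords_set = None

# Run the expensive extraction once and keep the result across reruns.
if st.session_state.get("keywords_set") is None:
    st.session_state.keywords_set = extract_keywords_stub(uploaded_files)

selected_keywords = st.multiselect(
    "Select at least two keywords/topics for grouping:",
    sorted(st.session_state.keywords_set),
)

# st.button is True only on the rerun triggered by the click, so the decision
# is persisted in session_state; any other interaction resets the flag.
if st.button("Confirm Keyword Selection"):
    if len(selected_keywords) < 2:
        st.error("Please select at least two keywords to continue.")
    else:
        st.session_state.selected_keywords = selected_keywords
        st.session_state.keywords_confirmed = True
else:
    st.session_state.keywords_confirmed = False

if not st.session_state.get("keywords_confirmed", False):
    st.stop()  # halt this script run; nothing below executes until confirmation

st.success("Keyword selection confirmed. Downstream processing would start here.")
```

Because Streamlit reruns the whole script on every widget interaction, stashing `pdf_texts` and `keywords_set` in `st.session_state` is what keeps the PDFs from being re-parsed each time the multiselect or the confirm button changes.

The grouping step itself is unchanged by the diff: each PDF embedding is assigned to the selected keyword with the highest cosine similarity. A toy sketch with sentence-transformers, where the model name and sample inputs are illustrative assumptions rather than values taken from `app.py`:

```python
from sentence_transformers import SentenceTransformer, util

semantic_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
selected_keywords = ["finance", "medicine"]               # illustrative keywords
pdf_texts = {"report.pdf": "quarterly revenue and cash flow analysis"}

# Embed each keyword once, then pick the best-matching keyword per document.
keyword_embeddings = {k: semantic_model.encode(k, convert_to_tensor=True) for k in selected_keywords}
for pdf_name, text in pdf_texts.items():
    text_embedding = semantic_model.encode(text, convert_to_tensor=True)
    best_keyword = max(
        keyword_embeddings,
        key=lambda k: util.pytorch_cos_sim(text_embedding, keyword_embeddings[k]).item(),
    )
    print(pdf_name, "->", best_keyword)
```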