spagestic commited on
Commit
56a57a9
·
1 Parent(s): 065887d

removed old code

Browse files
_app.py DELETED
@@ -1,191 +0,0 @@
1
- import gradio as gr
2
- from gradio_pdf import PDF
3
- from pdf_text_extractor import PDFTextExtractor
4
- from dotenv import load_dotenv
5
-
6
- load_dotenv()
7
-
8
- def main():
9
- """Main function to create and launch the interface."""
10
- def process_pdf(pdf_file):
11
- """Process PDF and extract text, then explanations, then audio, updating UI at each step."""
12
- if pdf_file is None:
13
- yield "", "No PDF uploaded", "", None, gr.update(visible=False)
14
- return
15
-
16
- try:
17
- extractor = PDFTextExtractor()
18
-
19
- # Step 1: Extract text
20
- # Show "Extracting text..." message
21
- yield "", gr.update(value="Extracting text..."), "", None, gr.update(visible=False)
22
- extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)
23
-
24
- if not extracted_text or extracted_text.strip() == "":
25
- yield extracted_text, status, "No text available to explain.", None, gr.update(visible=False)
26
- return
27
-
28
- # Show extracted text immediately, explanations/audio loading
29
- yield extracted_text, status, gr.update(value="Generating explanations..."), None, gr.update(visible=False)
30
-
31
- # Step 2: Generate explanations
32
- try:
33
- explanations = extractor.generate_explanations(extracted_text)
34
-
35
- # Show explanations immediately, update status for audio loading
36
- yield extracted_text, gr.update(value="Generating audio..."), explanations, None, gr.update(visible=False)
37
-
38
- # Step 3: Generate audio
39
- try:
40
- from ui.chatterbox.generate_tts_audio import generate_tts_audio
41
-
42
- # Clean up the text for better TTS
43
- clean_text = explanations.strip()
44
-
45
- # Limit text length for TTS (assuming 1000 character limit)
46
- if len(clean_text) > 1000:
47
- sentences = clean_text[:950].split('.')
48
- if len(sentences) > 1:
49
- clean_text = '.'.join(sentences[:-1]) + '.'
50
- else:
51
- clean_text = clean_text[:950]
52
- clean_text += " [Text has been truncated for audio generation]"
53
-
54
- audio_result = generate_tts_audio(clean_text, None)
55
-
56
- # Show everything, update status to complete
57
- yield extracted_text, gr.update(value="All steps complete!"), explanations, audio_result, gr.update(visible=True)
58
-
59
- except Exception as audio_error:
60
- # Show explanations, update status with audio error
61
- yield extracted_text, gr.update(value=f"Audio generation failed: {str(audio_error)}"), explanations, None, gr.update(visible=False)
62
-
63
- except Exception as explanation_error:
64
- # Show extracted text, but indicate explanation error
65
- yield extracted_text, status, f"Error generating explanations: {str(explanation_error)}", None, gr.update(visible=False)
66
-
67
- except Exception as e:
68
- yield "", f"Error processing PDF: {str(e)}", "", None, gr.update(visible=False)
69
-
70
- def generate_explanations(extracted_text):
71
- """Generate explanations for extracted text"""
72
- if not extracted_text or extracted_text.strip() == "":
73
- return "No text available to explain. Please extract text from a PDF first."
74
-
75
- try:
76
- # Initialize extractor
77
- extractor = PDFTextExtractor()
78
-
79
- # Generate explanations
80
- explanations = extractor.generate_explanations(extracted_text)
81
- return explanations
82
-
83
- except Exception as e:
84
- return f"Error generating explanations: {str(e)}"
85
-
86
- def generate_audio(explanation_text):
87
- """Generate TTS audio for explanations"""
88
- if not explanation_text or explanation_text.strip() == "":
89
- raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")
90
-
91
- try:
92
- # Import the TTS function
93
- from ui.chatterbox.generate_tts_audio import generate_tts_audio
94
-
95
- # Clean up the text for better TTS
96
- clean_text = explanation_text.strip()
97
-
98
- # Limit text length for TTS (assuming 1000 character limit)
99
- if len(clean_text) > 1000:
100
- # Truncate at sentence boundary if possible
101
- sentences = clean_text[:950].split('.')
102
- if len(sentences) > 1:
103
- clean_text = '.'.join(sentences[:-1]) + '.'
104
- else:
105
- clean_text = clean_text[:950]
106
- clean_text += " [Text has been truncated for audio generation]"
107
-
108
- # Generate audio and make it visible
109
- audio_result = generate_tts_audio(clean_text, None)
110
- return audio_result, gr.update(visible=True)
111
-
112
- except Exception as e:
113
- raise gr.Error(f"Error generating audio: {str(e)}")
114
- # Create the interface with side-by-side layout
115
- with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as demo:
116
- # Inject fullscreen CSS
117
- gr.HTML("""
118
- <style>
119
- html, body, #root, .gradio-container {
120
- height: 100% !important;
121
- width: 100% !important;
122
- margin: 0 !important;
123
- padding: 0 !important;
124
- }
125
- .gradio-container {
126
- max-width: 100vw !important;
127
- min-height: 100vh !important;
128
- box-sizing: border-box;
129
- }
130
- </style>
131
- """)
132
-
133
- gr.Markdown("# 🔍 PDF Text Extractor")
134
- gr.Markdown("Upload a PDF on the left to automatically extract and view text on the right.")
135
-
136
- with gr.Row(equal_height=True):
137
- # Left column - PDF Display
138
- with gr.Column(scale=1):
139
- gr.Markdown("### 📄 PDF Document")
140
- pdf_input = PDF(
141
- label="Upload and View PDF",
142
- height=600,
143
- interactive=True
144
- )
145
-
146
- status_output = gr.Textbox(
147
- label="Status",
148
- lines=2,
149
- placeholder="Upload a PDF to see status...",
150
- interactive=False
151
- )
152
- # Right column - Extracted Content with Tabs
153
- with gr.Column(scale=1):
154
- gr.Markdown("### 📝 Extracted Content")
155
-
156
- with gr.Tabs():
157
- with gr.TabItem("Extracted Text"):
158
- text_output = gr.Textbox(
159
- label="Extracted Text",
160
- lines=20,
161
- placeholder="Upload a PDF to automatically extract text...",
162
- show_copy_button=True,
163
- interactive=False
164
- )
165
- with gr.TabItem("Explanation Script"):
166
- explanation_output = gr.Textbox(
167
- label="Generated Explanation Script",
168
- lines=15,
169
- placeholder="Explanations will be automatically generated after text extraction...",
170
- show_copy_button=True,
171
- interactive=False
172
- )
173
-
174
- # Audio generation section (below tabs)
175
- gr.Markdown("### 🔊 Audio Generation")
176
- audio_output = gr.Audio(
177
- label="Generated Explanation Audio",
178
- interactive=False,
179
- visible=False
180
- ) # Set up automatic processing on PDF upload (now handles all steps)
181
- pdf_input.upload(
182
- fn=process_pdf,
183
- inputs=[pdf_input],
184
- outputs=[text_output, status_output, explanation_output, audio_output, audio_output]
185
- )
186
-
187
- return demo
188
-
189
- if __name__ == "__main__":
190
- demo = main()
191
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
api.py DELETED
@@ -1,48 +0,0 @@
1
- """
2
- API for testing the TextExplainer using FastAPI.
3
- """
4
-
5
- from fastapi import FastAPI, HTTPException, Form
6
- from pydantic import BaseModel
7
- from utils.text_explainer import TextExplainer
8
- import os
9
- from dotenv import load_dotenv
10
-
11
- # Load environment variables
12
- load_dotenv()
13
-
14
- app = FastAPI(title="Text Explainer API")
15
-
16
- class ExplainRequest(BaseModel):
17
- text: str
18
-
19
- class ExplainResponseSection(BaseModel):
20
- heading: str
21
- content: str
22
- explanation: str
23
- level: int
24
-
25
- class ExplainResponse(BaseModel):
26
- sections: list[ExplainResponseSection]
27
- chat_history: list[dict] # Add chat history to the response
28
-
29
- @app.post("/explain-text", response_model=ExplainResponse)
30
- def explain_text(request: ExplainRequest = None, text: str = Form(None)):
31
- # Accept either JSON or form data
32
- input_text = None
33
- if request and request.text:
34
- input_text = request.text
35
- elif text:
36
- input_text = text
37
- if not input_text or not input_text.strip():
38
- raise HTTPException(status_code=400, detail="Text is required.")
39
- try:
40
- explainer = TextExplainer()
41
- explained_sections = explainer.explain_all_sections(input_text)
42
- chat_history = explainer.get_chat_history() # Get chat history
43
- return {
44
- "sections": explained_sections,
45
- "chat_history": chat_history
46
- }
47
- except Exception as e:
48
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,35 +1,17 @@
1
- """
2
- PDF Text Extractor Application
3
- Main entry point for the PDF Text Extractor application.
4
- """
5
 
6
- import os
 
 
7
  from dotenv import load_dotenv
8
- from ui import create_interface
9
- from utils.config import check_api_key, get_app_config
10
 
11
  def main():
12
- """Main function to launch the application."""
13
-
14
- # Load environment variables from .env file
15
- load_dotenv()
16
-
17
- # Check for API key
18
- check_api_key()
19
-
20
- # Create and launch the interface
21
- interface = create_interface()
22
-
23
- # Get application configuration
24
- app_config = get_app_config()
25
-
26
- # Launch with appropriate settings
27
- interface.launch(
28
- # server_port=app_config["server_port"],
29
- debug=app_config["debug"],
30
- quiet=app_config["quiet"],
31
- max_file_size=app_config["max_file_size"]
32
- )
33
 
34
  if __name__ == "__main__":
35
- main()
 
 
1
+ """Main entry point for the PDF Explainer app."""
 
 
 
2
 
3
+ import gradio as gr
4
+ from src.processors.pdf_processor import PDFProcessor
5
+ from src.ui_components.interface import build_interface
6
  from dotenv import load_dotenv
7
+
8
+ load_dotenv()
9
 
10
  def main():
11
+ pdf_processor = PDFProcessor()
12
+ demo = build_interface(pdf_processor.process_pdf)
13
+ return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  if __name__ == "__main__":
16
+ demo = main()
17
+ demo.launch()
pdf_text_extractor.py DELETED
@@ -1,287 +0,0 @@
1
- import base64
2
- import os
3
- from typing import Optional, Tuple, List, Dict, Any
4
- from mistralai import Mistral
5
- from utils.text_explainer import TextExplainer
6
-
7
- class PDFTextExtractor:
8
- """PDF text extraction using Mistral AI OCR."""
9
-
10
- def __init__(self):
11
- """Initialize the PDF text extractor with Mistral AI client."""
12
- self.api_key = os.environ.get("MISTRAL_API_KEY")
13
- if not self.api_key:
14
- raise ValueError("MISTRAL_API_KEY environment variable is required")
15
- self.client = Mistral(api_key=self.api_key)
16
- self.text_explainer = TextExplainer()
17
-
18
- def encode_pdf(self, pdf_path: str) -> Optional[str]:
19
- """
20
- Encode the PDF file to base64.
21
-
22
- Args:
23
- pdf_path: Path to the PDF file
24
-
25
- Returns:
26
- Base64 encoded string or None if error
27
- """
28
- try:
29
- with open(pdf_path, "rb") as pdf_file:
30
- return base64.b64encode(pdf_file.read()).decode('utf-8')
31
- except FileNotFoundError:
32
- print(f"Error: The file {pdf_path} was not found.")
33
- return None
34
- except Exception as e:
35
- print(f"Error encoding PDF: {e}")
36
- return None
37
-
38
- def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
39
- """
40
- Extract text and images from uploaded PDF using Mistral AI OCR.
41
-
42
- Args:
43
- pdf_file: Gradio file object
44
-
45
- Returns:
46
- Tuple of (extracted_text, status_message, images_data)
47
- """
48
- if pdf_file is None:
49
- return "", "Please upload a PDF file.", []
50
-
51
- try:
52
- # Get the file path from Gradio file object
53
- pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
54
-
55
- # Encode PDF to base64
56
- base64_pdf = self.encode_pdf(pdf_path)
57
- if base64_pdf is None:
58
- return "", "Failed to encode PDF file.", []
59
-
60
- # Process with Mistral OCR
61
- print(f"🔄 Processing PDF with Mistral OCR...")
62
- ocr_response = self.client.ocr.process(
63
- model="mistral-ocr-latest",
64
- document={
65
- "type": "document_url",
66
- "document_url": f"data:application/pdf;base64,{base64_pdf}"
67
- },
68
- include_image_base64=True
69
- )
70
-
71
- # Enhanced debugging and response parsing
72
- print("🔍 Analyzing OCR Response Structure...")
73
- print(f" Type: {type(ocr_response)}")
74
- print(f" String representation: {str(ocr_response)[:500]}...")
75
-
76
- # Check if it's a simple object with attributes
77
- if hasattr(ocr_response, '__dict__'):
78
- print(f" Object attributes: {list(ocr_response.__dict__.keys())}")
79
- for key, value in ocr_response.__dict__.items():
80
- print(f" {key}: {type(value)} = {str(value)[:100]}...")
81
-
82
- # Check if it has commonly expected attributes
83
- common_attrs = ['text', 'content', 'result', 'data', 'output', 'extracted_text', 'ocr_text', 'choices', 'message']
84
- for attr in common_attrs:
85
- if hasattr(ocr_response, attr):
86
- value = getattr(ocr_response, attr)
87
- print(f" Has '{attr}': {type(value)} = {str(value)[:100]}...")
88
-
89
- # Check if it's iterable but not a string
90
- try:
91
- if hasattr(ocr_response, '__iter__') and not isinstance(ocr_response, str):
92
- print(f" Iterable with {len(list(ocr_response))} items")
93
- for i, item in enumerate(ocr_response):
94
- if i < 3: # Show first 3 items
95
- print(f" Item {i}: {type(item)} = {str(item)[:100]}...")
96
- except Exception as e:
97
- print(f" Error checking iteration: {e}")
98
-
99
- # Advanced text extraction with multiple strategies
100
- extracted_text = ""
101
- extraction_method = "none"
102
- extracted_images = []
103
-
104
- # Strategy 1: Mistral OCR specific - pages with markdown content and images
105
- if hasattr(ocr_response, 'pages') and ocr_response.pages:
106
- pages = ocr_response.pages
107
- if isinstance(pages, list) and len(pages) > 0:
108
- page_texts = []
109
-
110
- for i, page in enumerate(pages):
111
- # Extract text
112
- if hasattr(page, 'markdown') and page.markdown:
113
- page_texts.append(page.markdown)
114
- print(f"✅ Found text in page {i} markdown: {len(page.markdown)} characters")
115
-
116
- # Extract images
117
- if hasattr(page, 'images') and page.images:
118
- for j, img in enumerate(page.images):
119
- image_data = {
120
- 'page': i,
121
- 'image_id': f"img-{i}-{j}",
122
- 'top_left_x': getattr(img, 'top_left_x', 0),
123
- 'top_left_y': getattr(img, 'top_left_y', 0),
124
- 'bottom_right_x': getattr(img, 'bottom_right_x', 0),
125
- 'bottom_right_y': getattr(img, 'bottom_right_y', 0),
126
- 'base64': getattr(img, 'image_base64', '')
127
- }
128
- extracted_images.append(image_data)
129
- print(f"✅ Found image in page {i}, image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
130
-
131
- if page_texts:
132
- extracted_text = "\n\n".join(page_texts)
133
- extraction_method = f"pages_markdown_{len(page_texts)}_pages"
134
-
135
- # Try to extract images from other response structures if no images found yet
136
- if not extracted_images:
137
- # Check if response has images attribute directly
138
- if hasattr(ocr_response, 'images') and ocr_response.images:
139
- for j, img in enumerate(ocr_response.images):
140
- image_data = {
141
- 'page': 0,
142
- 'image_id': getattr(img, 'id', f"img-{j}"),
143
- 'top_left_x': getattr(img, 'top_left_x', 0),
144
- 'top_left_y': getattr(img, 'top_left_y', 0),
145
- 'bottom_right_x': getattr(img, 'bottom_right_x', 0),
146
- 'bottom_right_y': getattr(img, 'bottom_right_y', 0),
147
- 'base64': getattr(img, 'image_base64', '')
148
- }
149
- extracted_images.append(image_data)
150
- print(f"✅ Found image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
151
-
152
- # Continue with fallback strategies for text extraction
153
- if not extracted_text:
154
- # Strategy 2: Direct text attribute (fallback)
155
- if hasattr(ocr_response, 'text') and ocr_response.text:
156
- extracted_text = str(ocr_response.text)
157
- extraction_method = "direct_text_attribute"
158
-
159
- # Strategy 3: Content attribute (fallback)
160
- elif hasattr(ocr_response, 'content') and ocr_response.content:
161
- content = ocr_response.content
162
- if isinstance(content, str):
163
- extracted_text = content
164
- extraction_method = "content_attribute_string"
165
- elif hasattr(content, 'text'):
166
- extracted_text = str(content.text)
167
- extraction_method = "content_text_attribute"
168
- else:
169
- extracted_text = str(content)
170
- extraction_method = "content_attribute_converted"
171
-
172
- # Strategy 4: Result attribute (fallback)
173
- elif hasattr(ocr_response, 'result'):
174
- result = ocr_response.result
175
- if isinstance(result, str):
176
- extracted_text = result
177
- extraction_method = "result_string"
178
- elif hasattr(result, 'text'):
179
- extracted_text = str(result.text)
180
- extraction_method = "result_text_attribute"
181
- elif isinstance(result, dict) and 'text' in result:
182
- extracted_text = str(result['text'])
183
- extraction_method = "result_dict_text"
184
- else:
185
- extracted_text = str(result)
186
- extraction_method = "result_converted"
187
-
188
- # Strategy 5: Choices attribute (ChatGPT-style response - fallback)
189
- elif hasattr(ocr_response, 'choices') and ocr_response.choices:
190
- choices = ocr_response.choices
191
- if isinstance(choices, list) and len(choices) > 0:
192
- choice = choices[0]
193
- if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
194
- extracted_text = str(choice.message.content)
195
- extraction_method = "choices_message_content"
196
- elif hasattr(choice, 'text'):
197
- extracted_text = str(choice.text)
198
- extraction_method = "choices_text"
199
- else:
200
- extracted_text = str(choice)
201
- extraction_method = "choices_converted"
202
-
203
- # Strategy 6: Dict-like access (fallback)
204
- elif hasattr(ocr_response, 'get') or isinstance(ocr_response, dict):
205
- for key in ['text', 'content', 'result', 'extracted_text', 'ocr_text', 'output']:
206
- if hasattr(ocr_response, 'get'):
207
- value = ocr_response.get(key)
208
- else:
209
- value = ocr_response.get(key) if isinstance(ocr_response, dict) else None
210
-
211
- if value:
212
- extracted_text = str(value)
213
- extraction_method = f"dict_key_{key}"
214
- break
215
-
216
- # Strategy 7: Inspect all attributes for string-like content (fallback)
217
- elif hasattr(ocr_response, '__dict__'):
218
- for key, value in ocr_response.__dict__.items():
219
- if isinstance(value, str) and len(value) > 20: # Likely text content
220
- extracted_text = value
221
- extraction_method = f"attribute_{key}"
222
- break
223
- elif hasattr(value, 'text') and isinstance(value.text, str):
224
- extracted_text = str(value.text)
225
- extraction_method = f"nested_text_in_{key}"
226
- break
227
-
228
- # Strategy 8: Convert entire response to string if it seems to contain text (fallback)
229
- if not extracted_text:
230
- response_str = str(ocr_response)
231
- if len(response_str) > 50 and not response_str.startswith('<'): # Not an object reference
232
- extracted_text = response_str
233
- extraction_method = "full_response_string"
234
-
235
- print(f"🎯 Extraction method used: {extraction_method}")
236
- print(f"📏 Extracted text length: {len(extracted_text)} characters")
237
- print(f"🖼️ Extracted images: {len(extracted_images)}")
238
-
239
- if extracted_text:
240
- status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
241
- if extracted_images:
242
- status += f" and {len(extracted_images)} image(s)"
243
- else:
244
- extracted_text = "No text could be extracted from this PDF."
245
- status = "⚠️ OCR completed but no text was found in response."
246
- if extracted_images:
247
- status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
248
- print(f"❌ No extractable text found in OCR response")
249
-
250
- return extracted_text, status, extracted_images
251
-
252
- except Exception as e:
253
- error_msg = f"Error processing PDF: {str(e)}"
254
- print(error_msg)
255
- return "", f"❌ {error_msg}", []
256
-
257
- def generate_explanations(self, extracted_text: str) -> str:
258
- """
259
- Generate explanations for the extracted text sections.
260
-
261
- Args:
262
- extracted_text: The extracted text from PDF
263
-
264
- Returns:
265
- Formatted explanations for all sections
266
- """
267
- try:
268
- if not extracted_text or extracted_text.strip() == "":
269
- return "No text available to explain."
270
-
271
- if extracted_text.startswith("No text could be extracted"):
272
- return "Cannot generate explanations - no text was extracted from the PDF."
273
-
274
- print("🤖 Generating explanations for extracted text...")
275
- explained_sections = self.text_explainer.explain_all_sections(extracted_text)
276
-
277
- if not explained_sections:
278
- return "No sections found to explain in the extracted text."
279
-
280
- formatted_explanations = self.text_explainer.format_explanations_for_display(explained_sections)
281
- return formatted_explanations
282
-
283
- except Exception as e:
284
- error_msg = f"Error generating explanations: {str(e)}"
285
- print(error_msg)
286
- return f"❌ {error_msg}"
287
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sample_text.md DELETED
@@ -1,15 +0,0 @@
1
- # Sporting Exchanges between China and the United States, 1980-1984: Inevitable Politics and Excessive Political Strings
2
-
3
- Y. Andrew Hao and Thomas M. Hunt<br>Department of Kinesiology and Health Education, The University of Texas at Austin, Austin, TX, USA
4
-
5
- #### Abstract
6
-
7
- Sino-US sporting exchanges between 1980 and 1984 largely paralleled the patterns of the larger bilateral relations between the two nations. The over-politicization of sports by the two governments - and especially by the PRC - created the parallelism. Curiously, scholars of sport and international relations have paid little attention to Sino-US athletic interactions in this period, an oversight that needs to be remedied in light of the reciprocal correlations between international sport and international politics. Indeed, Sino-US athletic exchanges in the context of their bilateral relations underscores the mutual connections between sport and diplomacy.
8
-
9
- On January 1, 1979 - eight years after the initiation of the 'Ping-pong diplomacy' and seven after then-US President Richard Nixon's visit to China - the United States diplomatically recognized the People's Republic of China (PRC) with its capital in Beijing and rescinded its recognition of the Republic of China (ROC). At the end of the same year, the International Olympic Committee (IOC) welcomed the Chinese Olympic Committee back to the Olympic Movement. ${ }^{1}$ The Republic of China Olympic Committee, which previously monopolized the seat of China but only governed the sporting affairs of Taiwan and surrounding breakaway islands, was forced to change its name, flag and anthem.
10
-
11
- The two incidents' proximity in time was more a coincidence than not - the US recognition did not directly cause China's reinstatement into the Olympics. Rather, both were trophies that Beijing garnered thanks to its rising power and strategic advantage in world politics. The Sino-US rapprochement resulted from changing power dynamics within the Sino-USSR-US strategic triangle in the 1970s: having parted way with the Soviet Union, China befriended the United States, which, under the 'Nixon doctrine', had offered an olive branch; the ensuing deterioration of SovietUS relations drove Beijing and Washington to enter into a closer relationship after 1978. ${ }^{2}$ The Olympic reinstatement, however, took place directly as the result of
12
-
13
- [^0]
14
- [^0]: CONTACT Thomas M. Hunt (1) [email protected] (2) Department of Kinesiology and Health Education, The University of Texas at Austin, 2109 San Jacinto Blvd, Stop D3700, Austin, TX 78712-1415, USA
15
- (C) 2019 Informa UK Limited, trading as Taylor \& Francis Group
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/app.py DELETED
@@ -1,17 +0,0 @@
1
- """Main entry point for the PDF Explainer app."""
2
-
3
- import gradio as gr
4
- from processors.pdf_processor import PDFProcessor
5
- from ui_components.interface import build_interface
6
- from dotenv import load_dotenv
7
-
8
- load_dotenv()
9
-
10
- def main():
11
- pdf_processor = PDFProcessor()
12
- demo = build_interface(pdf_processor.process_pdf)
13
- return demo
14
-
15
- if __name__ == "__main__":
16
- demo = main()
17
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ui_components/interface.py CHANGED
@@ -46,7 +46,7 @@ def build_interface(process_pdf_fn):
46
  audio_output = gr.Audio(
47
  label="Generated Explanation Audio",
48
  interactive=False,
49
- visible=False
50
  )
51
 
52
  pdf_input.upload(
 
46
  audio_output = gr.Audio(
47
  label="Generated Explanation Audio",
48
  interactive=False,
49
+ visible=False,
50
  )
51
 
52
  pdf_input.upload(
tests/test_ocr_direct.py DELETED
@@ -1,234 +0,0 @@
1
- """
2
- Quick OCR Test Script
3
- Tests the Mistral AI OCR functionality directly without the Gradio interface.
4
- """
5
-
6
- import base64
7
- import os
8
- import tempfile
9
- from mistralai import Mistral
10
- from dotenv import load_dotenv
11
-
12
- # Load environment variables
13
- load_dotenv()
14
-
15
- def create_simple_pdf_content():
16
- """Create a minimal PDF in memory for testing."""
17
- # Simple PDF content (this is a basic PDF structure)
18
- pdf_content = """%PDF-1.4
19
- 1 0 obj
20
- <<
21
- /Type /Catalog
22
- /Pages 2 0 R
23
- >>
24
- endobj
25
-
26
- 2 0 obj
27
- <<
28
- /Type /Pages
29
- /Kids [3 0 R]
30
- /Count 1
31
- >>
32
- endobj
33
-
34
- 3 0 obj
35
- <<
36
- /Type /Page
37
- /Parent 2 0 R
38
- /MediaBox [0 0 612 792]
39
- /Contents 4 0 R
40
- /Resources <<
41
- /Font <<
42
- /F1 5 0 R
43
- >>
44
- >>
45
- >>
46
- endobj
47
-
48
- 4 0 obj
49
- <<
50
- /Length 44
51
- >>
52
- stream
53
- BT
54
- /F1 12 Tf
55
- 72 720 Td
56
- (Hello World! Test OCR) Tj
57
- ET
58
- endstream
59
- endobj
60
-
61
- 5 0 obj
62
- <<
63
- /Type /Font
64
- /Subtype /Type1
65
- /BaseFont /Helvetica
66
- >>
67
- endobj
68
-
69
- xref
70
- 0 6
71
- 0000000000 65535 f
72
- 0000000010 00000 n
73
- 0000000079 00000 n
74
- 0000000173 00000 n
75
- 0000000301 00000 n
76
- 0000000380 00000 n
77
- trailer
78
- <<
79
- /Size 6
80
- /Root 1 0 R
81
- >>
82
- startxref
83
- 456
84
- %%EOF"""
85
-
86
- return pdf_content.encode('utf-8')
87
-
88
- def test_mistral_ocr():
89
- """Test the Mistral OCR functionality directly."""
90
-
91
- print("🧪 Starting Mistral OCR Test...")
92
-
93
- # Check API key
94
- api_key = os.environ.get("MISTRAL_API_KEY")
95
- if not api_key:
96
- print("❌ MISTRAL_API_KEY environment variable not found")
97
- print(" Please set it in your .env file or environment")
98
- return False
99
-
100
- print(f"✅ API key found: {api_key[:8]}...")
101
-
102
- try:
103
- # Initialize Mistral client
104
- client = Mistral(api_key=api_key)
105
- print("✅ Mistral client initialized")
106
-
107
- # Create a simple test PDF
108
- pdf_content = create_simple_pdf_content()
109
- base64_pdf = base64.b64encode(pdf_content).decode('utf-8')
110
- print(f"✅ Test PDF created ({len(pdf_content)} bytes)")
111
-
112
- # Test the OCR endpoint
113
- print("🔄 Sending OCR request to Mistral...")
114
-
115
- response = client.ocr.process(
116
- model="mistral-ocr-latest",
117
- document={
118
- "type": "document_url",
119
- "document_url": f"data:application/pdf;base64,{base64_pdf}"
120
- },
121
- include_image_base64=True
122
- )
123
-
124
- print("✅ OCR request completed")
125
-
126
- # Analyze the response
127
- print("\n🔍 RESPONSE ANALYSIS:")
128
- print(f"Response type: {type(response)}")
129
- print(f"Response string: {str(response)[:200]}...")
130
-
131
- if hasattr(response, '__dict__'):
132
- print(f"Response attributes: {list(response.__dict__.keys())}")
133
- for key, value in response.__dict__.items():
134
- print(f" {key}: {type(value)} = {str(value)[:100]}...")
135
- # Test all possible text extraction methods
136
- print("\n🎯 TESTING TEXT EXTRACTION METHODS:")
137
-
138
- methods = [
139
- ("response.pages[].markdown", lambda r: "\n".join([page.markdown for page in r.pages]) if hasattr(r, 'pages') and r.pages and all(hasattr(p, 'markdown') for p in r.pages) else None),
140
- ("response.text", lambda r: getattr(r, 'text', None)),
141
- ("response.content", lambda r: getattr(r, 'content', None)),
142
- ("response.result", lambda r: getattr(r, 'result', None)),
143
- ("response.data", lambda r: getattr(r, 'data', None)),
144
- ("response['text']", lambda r: r.get('text') if hasattr(r, 'get') else None),
145
- ("response['content']", lambda r: r.get('content') if hasattr(r, 'get') else None),
146
- ]
147
-
148
- extracted_text = None
149
- successful_method = None
150
-
151
- for method_name, method_func in methods:
152
- try:
153
- result = method_func(response)
154
- if result:
155
- print(f"✅ {method_name}: Found content ({len(str(result))} chars)")
156
- print(f" Content: {str(result)[:100]}...")
157
- if not extracted_text: # Use the first successful method
158
- extracted_text = str(result)
159
- successful_method = method_name
160
- else:
161
- print(f"❌ {method_name}: No content found")
162
- except Exception as e:
163
- print(f"❌ {method_name}: Error - {e}")
164
-
165
- if extracted_text:
166
- print(f"\n🎉 SUCCESSFULLY EXTRACTED TEXT using {successful_method}:")
167
- print(f"📝 Full extracted text: '{extracted_text}'")
168
- else:
169
- print(f"\n❌ NO TEXT EXTRACTED from any method")
170
-
171
- return True
172
-
173
- except Exception as e:
174
- print(f"❌ OCR test failed: {e}")
175
- print(f" Error type: {type(e)}")
176
-
177
- # If it's a 401 error, the API key might be invalid
178
- if "401" in str(e) or "unauthorized" in str(e).lower():
179
- print(" This might be an API key issue. Please check your MISTRAL_API_KEY")
180
-
181
- return False
182
-
183
- def test_api_connectivity():
184
- """Test basic connectivity to Mistral API."""
185
-
186
- print("🌐 Testing API connectivity...")
187
-
188
- api_key = os.environ.get("MISTRAL_API_KEY")
189
- if not api_key:
190
- print("❌ No API key found")
191
- return False
192
-
193
- try:
194
- client = Mistral(api_key=api_key)
195
-
196
- # Try a simple API call (if available)
197
- # Note: This might fail if the endpoint doesn't exist, but it tests connectivity
198
- print("🔄 Testing API connection...")
199
-
200
- # The exact method to test connectivity may vary based on Mistral's API
201
- # For now, we'll just try to initialize and catch any immediate errors
202
- print("✅ Mistral client appears to be working")
203
- return True
204
-
205
- except Exception as e:
206
- print(f"❌ API connectivity test failed: {e}")
207
- return False
208
-
209
- def main():
210
- """Main test function."""
211
-
212
- print("🚀 Mistral OCR Quick Test")
213
- print("=" * 40)
214
-
215
- # Test API connectivity first
216
- if not test_api_connectivity():
217
- print("\n❌ Basic connectivity test failed")
218
- return
219
-
220
- print("\n" + "="*40)
221
-
222
- # Test OCR functionality
223
- if test_mistral_ocr():
224
- print("\n✅ OCR test completed - check the response analysis above")
225
- else:
226
- print("\n❌ OCR test failed")
227
-
228
- print("\n💡 Next steps:")
229
- print(" 1. If the test worked, run: python main.py")
230
- print(" 2. If there were errors, check the API key and try again")
231
- print(" 3. Use the response analysis to improve text extraction")
232
-
233
- if __name__ == "__main__":
234
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_setup.py DELETED
@@ -1,62 +0,0 @@
1
- """
2
- Test script for PDF Extractor setup validation
3
- """
4
-
5
- import sys
6
- import os
7
- from dotenv import load_dotenv
8
-
9
- def test_imports():
10
- """Test if all required packages are importable."""
11
- try:
12
- import gradio as gr
13
- print("✅ Gradio imported successfully")
14
-
15
- import mistralai
16
- print("✅ Mistral AI imported successfully")
17
-
18
- from dotenv import load_dotenv
19
- print("✅ python-dotenv imported successfully")
20
-
21
- return True
22
- except ImportError as e:
23
- print(f"❌ Import error: {e}")
24
- return False
25
-
26
- def test_environment():
27
- """Test environment variable setup."""
28
- load_dotenv()
29
-
30
- api_key = os.environ.get("MISTRAL_API_KEY")
31
- if api_key:
32
- # Don't print the actual key, just confirm it exists
33
- print("✅ MISTRAL_API_KEY environment variable is set")
34
- return True
35
- else:
36
- print("⚠️ MISTRAL_API_KEY not found in environment")
37
- print(" Please copy .env.example to .env and add your API key")
38
- return False
39
-
40
- def main():
41
- """Run all tests."""
42
- print("🔍 PDF Extractor Setup Validation")
43
- print("=" * 40)
44
-
45
- import_success = test_imports()
46
- env_success = test_environment()
47
-
48
- print("\n" + "=" * 40)
49
- if import_success:
50
- print("✅ All packages are properly installed")
51
- if env_success:
52
- print("✅ Environment is configured correctly")
53
- print("🚀 Ready to run: python main.py")
54
- else:
55
- print("⚠️ Environment needs configuration")
56
- print("📝 Next step: Set up your .env file")
57
- else:
58
- print("❌ Package installation incomplete")
59
- print("📝 Next step: pip install -r requirements.txt")
60
-
61
- if __name__ == "__main__":
62
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/__init__.py DELETED
@@ -1,15 +0,0 @@
1
- """UI components for PDF Text Extractor."""
2
- from ui.interface import create_interface
3
- from ui.handlers import copy_text, download_text, process_images_for_display
4
- from ui.components import (
5
- create_header, create_upload_section, create_action_button,
6
- create_text_display, create_action_buttons, create_image_gallery,
7
- apply_custom_css
8
- )
9
-
10
- __all__ = [
11
- "create_interface", "copy_text", "download_text", "process_images_for_display",
12
- "create_header", "create_upload_section", "create_action_button",
13
- "create_text_display", "create_action_buttons", "create_image_gallery",
14
- "apply_custom_css"
15
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/chatterbox/check_api_health.py DELETED
@@ -1,13 +0,0 @@
1
- def check_api_health():
2
- import requests
3
- import os
4
- HEALTH_ENDPOINT = os.getenv("HEALTH_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/health")
5
- try:
6
- response = requests.get(HEALTH_ENDPOINT, timeout=10)
7
- if response.status_code == 200:
8
- data = response.json()
9
- return f"✅ API Status: {data.get('status', 'Unknown')} | Model Loaded: {data.get('model_loaded', False)}"
10
- else:
11
- return f"⚠️ API returned status code: {response.status_code}"
12
- except Exception as e:
13
- return f"❌ API Health Check Failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/chatterbox/custom_css.py DELETED
@@ -1,9 +0,0 @@
1
- custom_css = """
2
- .gradio-container {
3
- max-width: 1200px !important;
4
- }
5
- .status-box {
6
- padding: 10px;
7
- border-radius: 5px;
8
- }
9
- """
 
 
 
 
 
 
 
 
 
 
ui/chatterbox/generate_sample_text.py DELETED
@@ -1,10 +0,0 @@
1
- def generate_sample_text():
2
- import random
3
- samples = [
4
- "Hello! This is a test of the Chatterbox TTS system running on Modal.",
5
- "The quick brown fox jumps over the lazy dog.",
6
- "Welcome to the future of text-to-speech technology.",
7
- "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse.",
8
- "This is an example of voice cloning using artificial intelligence.",
9
- ]
10
- return random.choice(samples)
 
 
 
 
 
 
 
 
 
 
 
ui/chatterbox/generate_tts_audio.py DELETED
@@ -1,113 +0,0 @@
1
- def generate_tts_audio(text_input: str, audio_prompt_input, progress=None):
2
- import os
3
- import requests
4
- import tempfile
5
- import soundfile as sf
6
- import numpy as np
7
- import gradio as gr
8
-
9
- GENERATE_AUDIO_ENDPOINT = os.getenv("GENERATE_AUDIO_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/generate_audio")
10
- GENERATE_WITH_FILE_ENDPOINT = os.getenv("GENERATE_WITH_FILE_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/generate_with_file")
11
-
12
- if not text_input or len(text_input.strip()) == 0:
13
- raise gr.Error("Please enter some text to synthesize.")
14
- if len(text_input) > 1000:
15
- raise gr.Error("Text is too long. Maximum 1000 characters allowed.")
16
-
17
- if progress: progress(0.1, desc="Preparing request...")
18
-
19
- try:
20
- if audio_prompt_input is None:
21
- if progress: progress(0.3, desc="Sending request to API...")
22
- payload = {"text": text_input}
23
- response = requests.post(
24
- GENERATE_AUDIO_ENDPOINT,
25
- json=payload,
26
- headers={"Content-Type": "application/json"},
27
- timeout=120,
28
- stream=True
29
- )
30
- if response.status_code != 200:
31
- raise gr.Error(f"API Error: {response.status_code} - {response.text}")
32
-
33
- if progress: progress(0.6, desc="Streaming audio response...")
34
-
35
- # Get content length if available for progress tracking
36
- content_length = response.headers.get('content-length')
37
- if content_length:
38
- content_length = int(content_length)
39
-
40
- bytes_downloaded = 0
41
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
42
- for chunk in response.iter_content(chunk_size=8192):
43
- if chunk:
44
- temp_file.write(chunk)
45
- bytes_downloaded += len(chunk)
46
-
47
- # Update progress based on bytes downloaded
48
- if content_length and progress:
49
- download_progress = min(0.3, (bytes_downloaded / content_length) * 0.3)
50
- progress(0.6 + download_progress, desc=f"Downloading audio... ({bytes_downloaded // 1024}KB)")
51
- elif progress:
52
- # If no content length, just show bytes downloaded
53
- progress(0.6, desc=f"Downloading audio... ({bytes_downloaded // 1024}KB)")
54
-
55
- temp_path = temp_file.name
56
-
57
- if progress: progress(0.9, desc="Processing audio...")
58
- audio_data, sample_rate = sf.read(temp_path)
59
- os.unlink(temp_path)
60
- if progress: progress(1.0, desc="Complete!")
61
- return (sample_rate, audio_data)
62
-
63
- else:
64
- if progress: progress(0.3, desc="Preparing voice prompt...")
65
- files = {'text': (None, text_input)}
66
- with open(audio_prompt_input, 'rb') as f:
67
- audio_content = f.read()
68
- files['voice_prompt'] = ('voice_prompt.wav', audio_content, 'audio/wav')
69
-
70
- if progress: progress(0.5, desc="Sending request with voice cloning...")
71
- response = requests.post(
72
- GENERATE_WITH_FILE_ENDPOINT,
73
- files=files,
74
- timeout=180,
75
- stream=True
76
- )
77
- if response.status_code != 200:
78
- raise gr.Error(f"API Error: {response.status_code} - {response.text}")
79
-
80
- if progress: progress(0.8, desc="Streaming cloned voice response...")
81
-
82
- # Get content length if available for progress tracking
83
- content_length = response.headers.get('content-length')
84
- if content_length:
85
- content_length = int(content_length)
86
-
87
- bytes_downloaded = 0
88
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
89
- for chunk in response.iter_content(chunk_size=8192):
90
- if chunk:
91
- temp_file.write(chunk)
92
- bytes_downloaded += len(chunk)
93
-
94
- # Update progress based on bytes downloaded for voice cloning
95
- if content_length and progress:
96
- download_progress = min(0.15, (bytes_downloaded / content_length) * 0.15)
97
- progress(0.8 + download_progress, desc=f"Downloading cloned audio... ({bytes_downloaded // 1024}KB)")
98
- elif progress:
99
- progress(0.8, desc=f"Downloading cloned audio... ({bytes_downloaded // 1024}KB)")
100
-
101
- temp_path = temp_file.name
102
-
103
- audio_data, sample_rate = sf.read(temp_path)
104
- os.unlink(temp_path)
105
- if progress: progress(1.0, desc="Voice cloning complete!")
106
- return (sample_rate, audio_data)
107
-
108
- except requests.exceptions.Timeout:
109
- raise gr.Error("Request timed out. The API might be under heavy load. Please try again.")
110
- except requests.exceptions.ConnectionError:
111
- raise gr.Error("Unable to connect to the API. Please check if the endpoint URL is correct.")
112
- except Exception as e:
113
- raise gr.Error(f"Error generating audio: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/chatterbox/update_char_count.py DELETED
@@ -1,3 +0,0 @@
1
- def update_char_count(text):
2
- count = len(text) if text else 0
3
- return f"{count}/1000"
 
 
 
 
ui/components/apply_custom_css.py DELETED
@@ -1,23 +0,0 @@
1
- import gradio as gr
2
-
3
- def apply_custom_css() -> gr.HTML:
4
- """
5
- Apply custom CSS styling.
6
-
7
- Returns:
8
- gr.HTML: HTML component with CSS styles
9
- """
10
- return gr.HTML("""
11
- <style>
12
- .gradio-container {
13
- max-width: 900px !important;
14
- }
15
- .output-markdown {
16
- font-family: 'Courier New', monospace;
17
- }
18
- .image-gallery-caption {
19
- text-align: center;
20
- font-size: 0.9em;
21
- }
22
- </style>
23
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/components/create_action_button.py DELETED
@@ -1,10 +0,0 @@
1
- import gradio as gr
2
-
3
- def create_action_button() -> gr.Button:
4
- """
5
- Create the extract text action button.
6
-
7
- Returns:
8
- gr.Button: Action button component
9
- """
10
- return gr.Button("Extract Text & Images", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
ui/components/create_action_buttons.py DELETED
@@ -1,14 +0,0 @@
1
- import gradio as gr
2
- from typing import Tuple
3
-
4
- def create_action_buttons() -> Tuple[gr.Button, gr.Button]:
5
- """
6
- Create copy and download action buttons.
7
-
8
- Returns:
9
- Tuple[gr.Button, gr.Button]: Copy and download button components
10
- """
11
- copy_btn = gr.Button("📋 Copy to Clipboard")
12
- download_btn = gr.Button("📥 Download as Text File")
13
-
14
- return copy_btn, download_btn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/components/create_header.py DELETED
@@ -1,25 +0,0 @@
1
-
2
- import gradio as gr
3
-
4
- def create_header() -> gr.Markdown:
5
- """
6
- Create the application header.
7
-
8
- Returns:
9
- gr.Markdown: Header component
10
- """
11
- return gr.Markdown("""
12
- # 🔍 PDF Text Extractor with AI Explanations
13
-
14
- Extract text and images from PDF files using Mistral AI's OCR technology, then get simple explanations for each section.
15
-
16
- **Instructions:**
17
- 1. Upload a PDF file using the file selector below
18
- 2. Wait for processing to complete
19
- 3. View the extracted text and images
20
- 4. Click "Generate Explanations" to get AI-powered explanations of each section
21
- 5. Use the Copy or Download buttons to save the extracted text or explanations
22
-
23
- **Supported:** PDF files up to 10MB
24
- """)
25
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/components/create_image_gallery.py DELETED
@@ -1,19 +0,0 @@
1
- import gradio as gr
2
-
3
- def create_image_gallery() -> gr.Gallery:
4
- """
5
- Create the image gallery component.
6
-
7
- Returns:
8
- gr.Gallery: Image gallery component
9
- """
10
- return gr.Gallery(
11
- label="Extracted Images",
12
- columns=3,
13
- rows=2,
14
- object_fit="contain",
15
- height="auto",
16
- visible=True,
17
- show_label=True,
18
- elem_id="image_gallery"
19
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/components/create_text_display.py DELETED
@@ -1,25 +0,0 @@
1
- import gradio as gr
2
- from typing import Tuple
3
-
4
- def create_text_display() -> Tuple[gr.Textbox, gr.Textbox]:
5
- """
6
- Create the text output and status display components.
7
-
8
- Returns:
9
- Tuple[gr.Textbox, gr.Textbox]: Text output and status components
10
- """
11
- text_output = gr.Textbox(
12
- label="Extracted Text",
13
- lines=10,
14
- max_lines=20,
15
- placeholder="Extracted text will appear here...",
16
- show_copy_button=True
17
- )
18
-
19
- status_output = gr.Textbox(
20
- label="Status",
21
- lines=2,
22
- placeholder="Upload a PDF to see status..."
23
- )
24
-
25
- return text_output, status_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/components/create_upload_section.py DELETED
@@ -1,14 +0,0 @@
1
- import gradio as gr
2
-
3
- def create_upload_section() -> gr.File:
4
- """
5
- Create the file upload component.
6
-
7
- Returns:
8
- gr.File: File upload component
9
- """
10
- return gr.File(
11
- label="Upload PDF File",
12
- file_types=[".pdf"],
13
- file_count="single"
14
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/handlers.py DELETED
@@ -1,104 +0,0 @@
1
- """
2
- Event handlers for UI components.
3
- Contains functions that handle user interactions with the interface.
4
- """
5
-
6
- import os
7
- import tempfile
8
- from typing import Optional, List, Dict, Any
9
- from utils.pdf_image_extractor import PDFImageExtractor
10
-
11
- def copy_text(text: str) -> str:
12
- """
13
- Handle Copy button click.
14
-
15
- Args:
16
- text: Text to copy to clipboard
17
-
18
- Returns:
19
- str: The input text (unchanged)
20
- """
21
- return text
22
-
23
- def download_text(text: str) -> Optional[str]:
24
- """
25
- Handle Download button click.
26
-
27
- Args:
28
- text: Text to download
29
-
30
- Returns:
31
- Optional[str]: Path to the created text file or None if text is empty
32
- """
33
- import tempfile
34
- import os
35
-
36
- if not text:
37
- return None
38
-
39
- # Create a temporary file to hold the text
40
- temp_dir = tempfile.gettempdir()
41
- filename = "extracted_text.txt"
42
- file_path = os.path.join(temp_dir, filename)
43
-
44
- # Write the text to the file
45
- with open(file_path, "w", encoding="utf-8") as f:
46
- f.write(text)
47
-
48
- return file_path
49
-
50
- def process_images_for_display(images_data: List[Dict[str, Any]], pdf_path: str = None) -> List:
51
- """
52
- Process images for display in the Gradio gallery.
53
-
54
- Args:
55
- images_data: List of image data dictionaries from OCR response
56
- pdf_path: Path to the original PDF file for image extraction
57
-
58
- Returns:
59
- List: List of image paths for gallery display
60
- """
61
- if not images_data:
62
- return []
63
-
64
- # If we have PDF path and bounding box data, extract images from PDF
65
- if pdf_path and os.path.exists(pdf_path):
66
- print("🖼️ Extracting images from PDF using bounding box coordinates...")
67
- extracted_paths = PDFImageExtractor.extract_images_from_pdf(pdf_path, images_data)
68
- if extracted_paths:
69
- return extracted_paths
70
-
71
- # Fallback: extract all images from PDF if bounding box extraction failed
72
- print("🔄 Fallback: Extracting all images from PDF...")
73
- extracted_paths = PDFImageExtractor.extract_all_images_from_pdf(pdf_path)
74
- if extracted_paths:
75
- return extracted_paths[:len(images_data)] # Limit to expected number of images
76
-
77
- # Fallback: use base64 data from OCR response
78
- print("🔄 Using base64 image data from OCR response...")
79
- gallery_images = []
80
- temp_dir = tempfile.gettempdir()
81
-
82
- for index, img_data in enumerate(images_data):
83
- try:
84
- # Get image base64 data
85
- base64_data = img_data.get('base64', '')
86
- if not base64_data:
87
- continue
88
-
89
- # Create a temporary file to save the image
90
- img_filename = f"extracted_image_fallback_{index}.jpg"
91
- img_path = os.path.join(temp_dir, img_filename)
92
-
93
- # Convert base64 to image file
94
- import base64
95
- with open(img_path, "wb") as img_file:
96
- img_file.write(base64.b64decode(base64_data))
97
-
98
- # Add path to gallery list (Gradio Gallery expects a list of paths)
99
- gallery_images.append(img_path)
100
-
101
- except Exception as e:
102
- print(f"Error processing image {index}: {str(e)}")
103
-
104
- return gallery_images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/interface.py DELETED
@@ -1,223 +0,0 @@
1
- """
2
- Interface creation module for PDF Text Extractor.
3
- Defines the Gradio interface components and layout.
4
- """
5
-
6
- import gradio as gr
7
- from gradio_pdf import PDF
8
- from pdf_text_extractor import PDFTextExtractor
9
- from ui.handlers import process_images_for_display
10
- from .components.create_header import create_header
11
- from .components.create_upload_section import create_upload_section
12
- from .components.create_action_button import create_action_button
13
- from .components.create_image_gallery import create_image_gallery
14
- from .components.apply_custom_css import apply_custom_css
15
- from .chatterbox.generate_tts_audio import generate_tts_audio
16
-
17
- def create_dummy_interface() -> gr.Blocks:
18
- """
19
- Create a simple interface for when the API key is not configured.
20
-
21
- Returns:
22
- gr.Blocks: Gradio interface with disabled functionality
23
- """
24
- with gr.Blocks(title="PDF Text Extractor") as interface:
25
- gr.Markdown("""
26
- # 🔍 PDF Text Extractor
27
-
28
- ⚠️ **API key not configured.** Please set MISTRAL_API_KEY environment variable and restart the application.
29
- """)
30
-
31
- # Create layout similar to main interface but disabled
32
- with gr.Row(equal_height=True):
33
- # Left column - PDF Display
34
- with gr.Column(scale=1):
35
- gr.Markdown("### 📄 PDF Document")
36
- PDF(
37
- label="Upload and View PDF (Disabled)",
38
- height=700,
39
- interactive=False
40
- )
41
-
42
- gr.Textbox(
43
- label="Status",
44
- lines=2,
45
- value="❌ MISTRAL_API_KEY environment variable is not set. Please set it and restart the application.",
46
- interactive=False
47
- )
48
-
49
- # Right column - Extracted Content
50
- with gr.Column(scale=1):
51
- gr.Markdown("### 📝 Extracted Content")
52
-
53
- gr.Textbox(
54
- label="Extracted Text",
55
- lines=25,
56
- value="API key not configured. Text extraction is unavailable.",
57
- interactive=False
58
- )
59
-
60
- return interface
61
-
62
- def create_main_interface(extractor: PDFTextExtractor) -> gr.Blocks:
63
- """
64
- Create the main application interface.
65
-
66
- Args:
67
- extractor: PDFTextExtractor instance
68
-
69
- Returns:
70
- gr.Blocks: Gradio interface with full functionality
71
- """
72
-
73
- def process_pdf_wrapper(pdf_file):
74
- """Process PDF with the extractor from closure"""
75
- extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)
76
- # Get PDF file path for image extraction
77
- pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file if pdf_file else None
78
- gallery_images = process_images_for_display(images_data, pdf_path)
79
- return extracted_text, status, gallery_images
80
-
81
- def generate_explanations_wrapper(extracted_text):
82
- """Generate explanations for extracted text"""
83
- if not extracted_text or extracted_text.strip() == "":
84
- return "No text available to explain. Please extract text from a PDF first."
85
-
86
- explanations = extractor.generate_explanations(extracted_text)
87
- # The explanation_status is now implicitly handled by the content of 'explanations'
88
- return explanations
89
-
90
- def generate_explanation_audio_wrapper(explanations_text):
91
- """Generate TTS audio for explanations using Chatterbox API"""
92
- if not explanations_text or explanations_text.strip() == "":
93
- raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")
94
-
95
- # Clean up the text for better TTS
96
- clean_text = explanations_text.strip()
97
-
98
- # Limit text length for TTS (Chatterbox has a 1000 character limit)
99
- if len(clean_text) > 1000:
100
- # Truncate at sentence boundary if possible
101
-             sentences = clean_text[:950].split('.')
-             if len(sentences) > 1:
-                 clean_text = '.'.join(sentences[:-1]) + '.'
-             else:
-                 clean_text = clean_text[:950]
-             clean_text += " [Text has been truncated for audio generation]"
-
-         # Call the TTS function directly - it already handles gr.Error exceptions properly
-         return generate_tts_audio(clean_text, None)
-     def tts_click_handler(explanations_text):
-         """Handle TTS button click with proper output handling"""
-         try:
-             audio_result = generate_explanation_audio_wrapper(explanations_text)
-             return audio_result, gr.update(visible=True)
-         except gr.Error:
-             # Re-raise Gradio errors as-is (they're already properly formatted)
-             raise
-         except Exception as e:
-             # Only wrap non-Gradio exceptions
-             raise gr.Error(f"Unexpected error generating audio: {str(e)}")
-
-     with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as interface:
-         # Add the header
-         create_header()
-
-         # Create main layout with PDF on left and content on right
-         with gr.Row(equal_height=True):
-             # Left column - PDF Display
-             with gr.Column(scale=1):
-                 gr.Markdown("### 📄 PDF Document")
-                 pdf_input = PDF(
-                     label="Upload and View PDF",
-                     height=700,
-                     interactive=True
-                 )
-
-                 # Status display below PDF
-                 status_output = gr.Textbox(
-                     label="Status",
-                     lines=2,
-                     placeholder="Upload a PDF to see status...",
-                     interactive=False
-                 )
-
-             # Right column - Extracted Content
-             with gr.Column(scale=1):
-                 gr.Markdown("### 📝 Extracted Content")
-
-                 # Create tabs for text, explanations, and images
-                 with gr.Tabs():
-                     with gr.TabItem("Extracted Text"):
-                         text_output = gr.Textbox(
-                             label="Extracted Text",
-                             lines=25,
-                             max_lines=30,
-                             placeholder="Upload a PDF to automatically extract text...",
-                             show_copy_button=True
-                         )
-
-                     with gr.TabItem("📚 Explanations"):
-                         with gr.Row():
-                             explain_btn = gr.Button("🤖 Generate Explanations", variant="secondary", size="lg")
-                             tts_btn = gr.Button("🔊 Generate Audio", variant="secondary", size="lg")
-
-                         explanations_output = gr.Textbox(
-                             label="Text Explanations",
-                             lines=20,
-                             max_lines=25,
-                             placeholder="Click 'Generate Explanations' after extracting text to get simple explanations of each section...",
-                             show_copy_button=True
-                         )
-
-                         # Add audio output for explanations
-                         explanation_audio_output = gr.Audio(
-                             label="Explanation Audio",
-                             interactive=False,
-                             visible=False
-                         )
-
-                     with gr.TabItem("Extracted Images"):
-                         image_gallery = create_image_gallery()
-                         image_info = gr.Markdown("Images extracted from the PDF will appear here.")
-         # Set up automatic PDF processing on upload
-         pdf_input.upload(
-             fn=process_pdf_wrapper,
-             inputs=[pdf_input],
-             outputs=[text_output, status_output, image_gallery]
-         )
-
-         # Handle explanation generation
-         explain_btn.click(
-             fn=generate_explanations_wrapper,
-             inputs=[text_output],
-             outputs=[explanations_output], # Removed explanation_status from outputs
-             show_progress=True
-         )
-
-         # Handle TTS generation for explanations
-         tts_btn.click(
-             fn=tts_click_handler,
-             inputs=[explanations_output],
-             outputs=[explanation_audio_output, explanation_audio_output],
-             show_progress=True
-         )
-     # Apply custom CSS styling
-     apply_custom_css()
-
-     return interface
-
- def create_interface() -> gr.Blocks:
-     """
-     Create and configure the Gradio interface.
-
-     Returns:
-         gr.Blocks: Configured Gradio interface
-     """
-     # Initialize the PDF extractor
-     try:
-         extractor = PDFTextExtractor()
-         return create_main_interface(extractor)
-     except ValueError as e:
-         # Create a dummy interface if API key is missing
-         return create_dummy_interface()
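Read together with utils/config.py further down, the deleted create_interface() was presumably wired into a small launcher along these lines. The snippet below is a hedged sketch, not part of the original commit: the __main__ guard is an assumption, and it assumes the keys returned by get_app_config() map onto gr.Blocks.launch() keyword arguments. Note also that tts_btn.click lists explanation_audio_output twice so that the handler's two return values (the audio and the visibility update) both target the same component.

# Hypothetical launcher for the deleted _app.py (illustrative only, not in the diff).
from utils.config import check_api_key, get_app_config

if __name__ == "__main__":
    check_api_key()                  # warn early if MISTRAL_API_KEY is missing
    demo = create_interface()        # falls back to the dummy interface without a key
    demo.launch(**get_app_config())  # assumes launch() accepts server_port, debug, quiet, max_file_size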
utils/__init__.py DELETED
@@ -1,5 +0,0 @@
- """Utility functions for PDF Text Extractor."""
- from utils.config import check_api_key, get_app_config
- from utils.text_explainer import TextExplainer
-
- __all__ = ["check_api_key", "get_app_config", "TextExplainer"]
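Given that __all__, the deleted package surface was importable in one line, e.g.:

from utils import TextExplainer, check_api_key, get_app_config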
utils/config.py DELETED
@@ -1,40 +0,0 @@
- """
- Configuration utilities for PDF Text Extractor.
- Contains functions for handling environment variables and app configuration.
- """
-
- import os
- from typing import Dict, Any
-
- def check_api_key() -> bool:
-     """
-     Check if the Mistral API key is set in environment variables.
-
-     Returns:
-         bool: True if API key is set, False otherwise
-     """
-     api_key = os.environ.get("MISTRAL_API_KEY")
-     if not api_key:
-         print("⚠️ Warning: MISTRAL_API_KEY environment variable is not set.")
-         print(" Please set it before using the PDF extraction functionality.")
-         print(" Example: export MISTRAL_API_KEY='your-api-key-here'")
-         print()
-         return False
-     return True
-
- def get_app_config() -> Dict[str, Any]:
-     """
-     Get application configuration settings.
-
-     Returns:
-         Dict[str, Any]: Application configuration settings
-     """
-     return {
-         "server_port": 7862, # Use a different port to avoid conflicts
-         "debug": True, # Enable debug mode for development
-         "quiet": False, # Show startup messages
-         "max_file_size": "10mb" # Limit PDF file size
-         # Uncomment the following to enable external access and public link sharing:
-         # "server_name": "0.0.0.0", # Allow external access
-         # "share": True, # Create public link
-     }
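A quick, hypothetical way to exercise the two deleted helpers in isolation; the dummy key value is made up, and the asserted values come straight from the code above:

import os
from utils.config import check_api_key, get_app_config

os.environ.pop("MISTRAL_API_KEY", None)      # simulate a missing key
assert check_api_key() is False              # prints the warning and returns False

os.environ["MISTRAL_API_KEY"] = "dummy-key"  # placeholder value, not a real key
assert check_api_key() is True

cfg = get_app_config()
assert cfg["server_port"] == 7862 and cfg["max_file_size"] == "10mb"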
utils/pdf_image_extractor.py DELETED
@@ -1,155 +0,0 @@
- """
- PDF Image Extraction utilities.
- Extracts images from PDF using bounding box coordinates.
- """
-
- import os
- import tempfile
- from typing import List, Dict, Any, Optional
- import fitz # PyMuPDF
- from PIL import Image
- import base64
- import io
-
-
- class PDFImageExtractor:
-     """Extract images from PDF using bounding box coordinates."""
-
-     @staticmethod
-     def extract_images_from_pdf(pdf_path: str, images_data: List[Dict[str, Any]]) -> List[str]:
-         """
-         Extract images from PDF using bounding box coordinates.
-
-         Args:
-             pdf_path: Path to the PDF file
-             images_data: List of image data with bounding box coordinates
-
-         Returns:
-             List[str]: List of paths to extracted image files
-         """
-         if not images_data:
-             return []
-
-         try:
-             # Open the PDF document
-             pdf_doc = fitz.open(pdf_path)
-             extracted_image_paths = []
-             temp_dir = tempfile.gettempdir()
-
-             for index, img_data in enumerate(images_data):
-                 try:
-                     page_num = img_data.get('page', 0)
-
-                     # Ensure page number is valid
-                     if page_num >= len(pdf_doc):
-                         print(f"Warning: Page {page_num} not found in PDF (max: {len(pdf_doc)-1})")
-                         continue
-
-                     # Get the page
-                     page = pdf_doc[page_num]
-
-                     # Get bounding box coordinates
-                     top_left_x = img_data.get('top_left_x', 0)
-                     top_left_y = img_data.get('top_left_y', 0)
-                     bottom_right_x = img_data.get('bottom_right_x', 0)
-                     bottom_right_y = img_data.get('bottom_right_y', 0)
-
-                     # Create a rectangle for the bounding box
-                     # PyMuPDF uses (x0, y0, x1, y1) format
-                     bbox = fitz.Rect(top_left_x, top_left_y, bottom_right_x, bottom_right_y)
-
-                     # Render the page as a pixmap with high resolution
-                     mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality
-                     pix = page.get_pixmap(matrix=mat, clip=bbox)
-
-                     # Convert pixmap to PIL Image
-                     img_data_bytes = pix.tobytes("png")
-                     img = Image.open(io.BytesIO(img_data_bytes))
-
-                     # Save the image to a temporary file
-                     img_filename = f"extracted_image_page{page_num}_{index}.png"
-                     img_path = os.path.join(temp_dir, img_filename)
-                     img.save(img_path, "PNG")
-
-                     extracted_image_paths.append(img_path)
-                     print(f"✅ Extracted image {index} from page {page_num}: {img_path}")
-
-                 except Exception as e:
-                     print(f"Error extracting image {index}: {str(e)}")
-
-                     # Fallback: try to use base64 data if available
-                     base64_data = img_data.get('base64', '')
-                     if base64_data:
-                         try:
-                             img_filename = f"extracted_image_base64_{index}.jpg"
-                             img_path = os.path.join(temp_dir, img_filename)
-
-                             with open(img_path, "wb") as img_file:
-                                 img_file.write(base64.b64decode(base64_data))
-
-                             extracted_image_paths.append(img_path)
-                             print(f"✅ Used base64 data for image {index}: {img_path}")
-                         except Exception as e2:
-                             print(f"Error using base64 data for image {index}: {str(e2)}")
-
-             pdf_doc.close()
-             return extracted_image_paths
-
-         except Exception as e:
-             print(f"Error opening PDF file: {str(e)}")
-             return []
-
-     @staticmethod
-     def extract_all_images_from_pdf(pdf_path: str) -> List[str]:
-         """
-         Extract all images from PDF without using bounding boxes.
-         This is a fallback method when no bounding box data is available.
-
-         Args:
-             pdf_path: Path to the PDF file
-
-         Returns:
-             List[str]: List of paths to extracted image files
-         """
-         try:
-             pdf_doc = fitz.open(pdf_path)
-             extracted_image_paths = []
-             temp_dir = tempfile.gettempdir()
-
-             for page_num in range(len(pdf_doc)):
-                 page = pdf_doc[page_num]
-                 image_list = page.get_images()
-
-                 for img_index, img in enumerate(image_list):
-                     try:
-                         # Get image data
-                         xref = img[0]
-                         pix = fitz.Pixmap(pdf_doc, xref)
-
-                         # Convert to PNG if CMYK
-                         if pix.n - pix.alpha < 4: # GRAY or RGB
-                             img_data = pix.tobytes("png")
-                         else: # CMYK: convert to RGB first
-                             pix1 = fitz.Pixmap(fitz.csRGB, pix)
-                             img_data = pix1.tobytes("png")
-                             pix1 = None
-
-                         # Save image
-                         img_filename = f"all_images_page{page_num}_img{img_index}.png"
-                         img_path = os.path.join(temp_dir, img_filename)
-
-                         with open(img_path, "wb") as f:
-                             f.write(img_data)
-
-                         extracted_image_paths.append(img_path)
-                         pix = None
-
-                     except Exception as e:
-                         print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")
-
-             pdf_doc.close()
-             return extracted_image_paths
-
-         except Exception as e:
-             print(f"Error extracting all images from PDF: {str(e)}")
-             return []
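For reference, a minimal sketch of how this deleted extractor was meant to be driven. The PDF path and bounding box values are made-up placeholders, and falling back to extract_all_images_from_pdf() when nothing comes back is one reasonable pattern rather than anything prescribed by the code:

from utils.pdf_image_extractor import PDFImageExtractor

# Hypothetical OCR output: one figure on page 0, located by its bounding box.
images_data = [
    {"page": 0, "top_left_x": 72, "top_left_y": 100,
     "bottom_right_x": 300, "bottom_right_y": 260},
]

paths = PDFImageExtractor.extract_images_from_pdf("sample.pdf", images_data)
if not paths:
    # No bounding boxes usable: pull every embedded image instead.
    paths = PDFImageExtractor.extract_all_images_from_pdf("sample.pdf")
print(paths)  # temp-file PNGs, e.g. .../extracted_image_page0_0.png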
utils/text_explainer.py DELETED
@@ -1,341 +0,0 @@
- """
- Text Explanation utilities using Mistral AI.
- Splits text by markdown headings and generates contextual explanations for each section.
- Maintains chat history to provide coherent explanations that build upon previous sections.
- """
-
- import os
- import re
- from typing import List, Dict, Tuple, Optional
- from mistralai import Mistral
-
-
- class TextExplainer:
-     """Generate explanations for text sections using Mistral AI."""
-
-     def __init__(self):
-         """Initialize the text explainer with Mistral AI client."""
-         self.api_key = os.environ.get("MISTRAL_API_KEY")
-         if not self.api_key:
-             raise ValueError("MISTRAL_API_KEY environment variable is required")
-         self.client = Mistral(api_key=self.api_key)
-         self.chat_history = []
-
-     def get_topic(self, text: str) -> Optional[str]:
-         """
-         Extract the main topic from the text using Mistral AI with structured output.
-
-         Args:
-             text: Input text to analyze
-
-         Returns:
-             Main topic as a string or None if not found
-         """
-         try:
-             # Define the JSON schema for structured output
-             topic_schema = {
-                 "type": "json_schema",
-                 "json_schema": {
-                     "schema": {
-                         "type": "object",
-                         "properties": {
-                             "main_topic": {
-                                 "type": "string",
-                                 "title": "Main Topic",
-                                 "description": "The primary / general topic or subject of the text"
-                             },
-                         },
-                         "required": ["main_topic"],
-                         "additionalProperties": False
-                     },
-                     "name": "topic_extraction",
-                     "strict": True
-                 }
-             }
-
-             response = self.client.chat.complete(
-                 model="ministral-8b-2410", # Using a more recent model that supports structured output
-                 messages=[
-                     {
-                         "role": "system",
-                         "content": "You are an expert in summarizing texts. Extract the main topic from the provided text."
-                     },
-                     {
-                         "role": "user",
-                         "content": f"Analyze this text and extract the main topic:\n\n{text[:2000]}..." # Limit to first 2000 characters for performance
-                     }
-                 ],
-                 temperature=0.3, # Lower temperature for more consistent structured output
-                 max_tokens=200,
-                 response_format=topic_schema
-             )
-
-             if hasattr(response, 'choices') and response.choices:
-                 # Parse the structured JSON response
-                 import json
-                 try:
-                     topic_data = json.loads(response.choices[0].message.content)
-                     main_topic = topic_data.get("main_topic", "").strip()
-                     confidence = topic_data.get("confidence", 0.0)
-                     secondary_topics = topic_data.get("secondary_topics", [])
-
-                     # Log the structured output for debugging
-                     print(f"📊 Topic extraction - Main: '{main_topic}', Confidence: {confidence:.2f}")
-                     if secondary_topics:
-                         print(f"🔍 Secondary topics: {', '.join(secondary_topics)}")
-
-                     return main_topic if main_topic else None
-                 except json.JSONDecodeError as json_err:
-                     print(f"Error parsing JSON response: {json_err}")
-                     # Fallback to raw content if JSON parsing fails
-                     return response.choices[0].message.content.strip()
-             return None
-         except Exception as e:
-             print(f"Error extracting topic: {str(e)}")
-             return None
-
-     def split_text_by_headings(self, text: str) -> List[Dict[str, str]]:
-         """
-         Split text into sections based on markdown headings.
-
-         Args:
-             text: Input text with markdown headings
-
-         Returns:
-             List of dictionaries with 'heading' and 'content' keys
-         """
-         if not text:
-             return []
-
-         # Split by markdown headings (# ## ### etc.)
-         sections = []
-
-         # Regex to find headings and their content
-         # Matches: # Heading, ## Heading, ### Heading, etc.
-         heading_pattern = r'^(#{1,6})\s+(.+?)$'
-
-         lines = text.split('\n')
-         current_heading = None
-         current_content = []
-         current_level = 0
-
-         for line in lines:
-             heading_match = re.match(heading_pattern, line.strip())
-
-             if heading_match:
-                 # Save previous section if it exists
-                 if current_heading and current_content:
-                     content_text = '\n'.join(current_content).strip()
-                     if content_text: # Only add if there's actual content
-                         sections.append({
-                             'heading': current_heading,
-                             'content': content_text,
-                             'level': current_level
-                         })
-
-                 # Start new section
-                 level = len(heading_match.group(1)) # Count the # characters
-                 current_heading = heading_match.group(2).strip()
-                 current_level = level
-                 current_content = []
-             else:
-                 # Add line to current content if we have a heading
-                 if current_heading is not None:
-                     current_content.append(line)
-
-         # Don't forget the last section
-         if current_heading and current_content:
-             content_text = '\n'.join(current_content).strip()
-             if content_text:
-                 sections.append({
-                     'heading': current_heading,
-                     'content': content_text,
-                     'level': current_level
-                 })
-
-         # If no headings found, treat entire text as one section
-         if not sections and text.strip():
-             sections.append({
-                 'heading': 'Document Content',
-                 'content': text.strip(),
-                 'level': 1
-             })
-         return sections
-
-     def generate_explanation(self, topic: str, heading: str, content: str, section_number: int = 1, total_sections: int = 1) -> str:
-         """
-         Generate an explanation for a text section using Mistral AI with chat history context.
-
-         Args:
-             topic: General topic of the document
-             heading: Section heading
-             content: Section content
-             section_number: Current section number (for context)
-             total_sections: Total number of sections (for context)
-
-         Returns:
-             Generated explanation in simple terms
-         """
-         try:
-             # Build the current user message
-             prompt = f"""
- **Section {section_number} of {total_sections}**
- **Section Heading:** {heading}
-
- **Section Content:**
- {content}
-
- **Your Explanation:**"""
-
-             # If this is the first section, initialize with system prompt
-             if section_number == 1:
-                 system_prompt = f"""You are an expert teacher who explains complex topics in simple, easy-to-understand terms.
-
- I will give you sections of text with their headings on the topic of "{topic}", and I want you to explain what each section is about in simple language, by breaking down any complex concepts or terminology. You should also explain why this information might be important or useful, use examples or analogies when helpful, and keep the explanation engaging and educational.
-
- Make your explanation clear enough for someone without prior knowledge of the topic to understand. As you explain each section, consider how it relates to the previous sections you've already explained to provide coherent, contextual explanations throughout the document.
-
- Do not mention anything far irrelevant from the topic of "{topic}". Do not repeat information unnecessarily, but build on previous explanations to create a comprehensive understanding of the topic. Avoid using the term 'section' and use the actual section heading instead. No need to mention the section number in your explanation.
- """
-
-                 # Initialize chat history with system message
-                 self.chat_history = [
-                     {
-                         "role": "system",
-                         "content": system_prompt
-                     }
-                 ]
-
-             # Check if content is too small (less than 200 characters)
-             if len(content) < 200:
-                 print(f"📋 Skipping API call for short content in '{heading}' ({len(content)} chars < 200)")
-                 # Add the user prompt to chat history for context in subsequent queries
-                 self.chat_history.append({
-                     "role": "user",
-                     "content": prompt
-                 })
-                 # Return a simple message indicating the content was too short
-                 return f"This section contains minimal content ({len(content)} characters). The information has been noted for context in subsequent explanations."
-
-             # Add the current user message to chat history
-             self.chat_history.append({
-                 "role": "user",
-                 "content": prompt
-             })
-
-             # Call Mistral AI for explanation with full chat history
-             response = self.client.chat.complete(
-                 model="mistral-small-2503",
-                 messages=self.chat_history,
-                 temperature=0.7, # Some creativity but still focused
-                 # max_tokens=1000 # Reasonable explanation length
-             )
-
-             # Extract the explanation from response
-             if hasattr(response, 'choices') and response.choices:
-                 explanation = response.choices[0].message.content
-
-                 # Add the assistant's response to chat history
-                 self.chat_history.append({
-                     "role": "assistant",
-                     "content": explanation
-                 })
-
-                 return explanation.strip()
-             else:
-                 return f"Could not generate explanation for section: {heading}"
-
-         except Exception as e:
-             print(f"Error generating explanation for '{heading}': {str(e)}")
-             return f"Error generating explanation for this section: {str(e)}"
-
-     def explain_all_sections(self, text: str) -> List[Dict[str, str]]:
-         """
-         Split text by headings and generate explanations for all sections with chat history context.
-
-         Args:
-             text: Input text with markdown headings
-
-         Returns:
-             List of dictionaries with 'heading', 'content', 'explanation', and 'level' keys
-         """
-         sections = self.split_text_by_headings(text)
-
-         if not sections:
-             return []
-
-         print(f"🔍 Found {len(sections)} sections to explain...")
-
-         # Extract the main topic from the text
-         print("🎯 Extracting main topic...")
-         topic = self.get_topic(text)
-         if topic:
-             print(f"📋 Main topic identified: {topic}")
-         else:
-             topic = "General Content" # Fallback topic
-             print("⚠️ Could not identify main topic, using fallback")
-
-         # Reset chat history for new document
-         self.chat_history = []
-
-         explained_sections = []
-
-         for i, section in enumerate(sections, 1):
-             print(f"📝 Generating explanation for section {i}/{len(sections)}: {section['heading'][:50]}...")
-
-             # Pass topic, section content, and context information
-             explanation = self.generate_explanation(
-                 topic,
-                 section['heading'],
-                 section['content'],
-                 section_number=i,
-                 total_sections=len(sections)
-             )
-
-             explained_sections.append({
-                 'heading': section['heading'],
-                 'content': section['content'],
-                 'explanation': explanation,
-                 'level': section['level']
-             })
-
-         print(f"✅ Generated explanations for all {len(explained_sections)} sections")
-         return explained_sections
-
-     def reset_chat_history(self):
-         """Reset the chat history for a new document or conversation."""
-         self.chat_history = []
-
-     def get_chat_history(self) -> List[Dict[str, str]]:
-         """Get the current chat history for debugging purposes."""
-         return self.chat_history.copy()
-
-     def get_chat_history_summary(self) -> str:
-         """Get a summary of the current chat history."""
-         if not self.chat_history:
-             return "No chat history available."
-
-         summary = f"Chat history contains {len(self.chat_history)} messages:\n"
-         for i, message in enumerate(self.chat_history, 1):
-             role = message['role']
-             content_preview = message['content'][:100] + "..." if len(message['content']) > 100 else message['content']
-             summary += f"{i}. {role.upper()}: {content_preview}\n"
-
-         return summary
-
-     def format_explanations_for_display(self, explained_sections: List[Dict[str, str]]) -> str:
-         """
-         Concatenate only the explanations from all sections for display, filtering out placeholder explanations for minimal content.
-         Args:
-             explained_sections: List of sections with explanations
-         Returns:
-             Concatenated explanations as a single string
-         """
-         if not explained_sections:
-             return "No sections found to explain."
-         skip_phrase = "This section contains minimal content"
-         return "\n\n".join(
-             section['explanation']
-             for section in explained_sections
-             if section.get('explanation') and not section['explanation'].strip().startswith(skip_phrase)
-         )
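One detail worth flagging: get_topic() reads confidence and secondary_topics out of the parsed JSON, but the declared schema only defines main_topic (with additionalProperties set to False), so those two fields always fall back to their defaults. For reference, a minimal sketch of the intended call flow against this deleted class; the sample markdown is invented, and a valid MISTRAL_API_KEY is assumed to be exported:

from utils.text_explainer import TextExplainer

# Requires MISTRAL_API_KEY in the environment; TextExplainer() raises ValueError otherwise.
markdown_text = """# Photosynthesis
Plants convert light into chemical energy through a series of reactions.

## Light reactions
Chlorophyll absorbs photons and drives the production of ATP and NADPH.
"""

explainer = TextExplainer()
sections = explainer.explain_all_sections(markdown_text)    # topic -> split by headings -> per-section chat
print(explainer.format_explanations_for_display(sections))  # explanations only; very short sections are skipped
explainer.reset_chat_history()                              # start clean before the next document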