removed old code
- _app.py +0 -191
- api.py +0 -48
- app.py +11 -29
- pdf_text_extractor.py +0 -287
- sample_text.md +0 -15
- src/app.py +0 -17
- src/ui_components/interface.py +1 -1
- tests/test_ocr_direct.py +0 -234
- tests/test_setup.py +0 -62
- ui/__init__.py +0 -15
- ui/chatterbox/check_api_health.py +0 -13
- ui/chatterbox/custom_css.py +0 -9
- ui/chatterbox/generate_sample_text.py +0 -10
- ui/chatterbox/generate_tts_audio.py +0 -113
- ui/chatterbox/update_char_count.py +0 -3
- ui/components/apply_custom_css.py +0 -23
- ui/components/create_action_button.py +0 -10
- ui/components/create_action_buttons.py +0 -14
- ui/components/create_header.py +0 -25
- ui/components/create_image_gallery.py +0 -19
- ui/components/create_text_display.py +0 -25
- ui/components/create_upload_section.py +0 -14
- ui/handlers.py +0 -104
- ui/interface.py +0 -223
- utils/__init__.py +0 -5
- utils/config.py +0 -40
- utils/pdf_image_extractor.py +0 -155
- utils/text_explainer.py +0 -341
_app.py
DELETED
@@ -1,191 +0,0 @@
-import gradio as gr
-from gradio_pdf import PDF
-from pdf_text_extractor import PDFTextExtractor
-from dotenv import load_dotenv
-
-load_dotenv()
-
-def main():
-    """Main function to create and launch the interface."""
-    def process_pdf(pdf_file):
-        """Process PDF and extract text, then explanations, then audio, updating UI at each step."""
-        if pdf_file is None:
-            yield "", "No PDF uploaded", "", None, gr.update(visible=False)
-            return
-
-        try:
-            extractor = PDFTextExtractor()
-
-            # Step 1: Extract text
-            # Show "Extracting text..." message
-            yield "", gr.update(value="Extracting text..."), "", None, gr.update(visible=False)
-            extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)
-
-            if not extracted_text or extracted_text.strip() == "":
-                yield extracted_text, status, "No text available to explain.", None, gr.update(visible=False)
-                return
-
-            # Show extracted text immediately, explanations/audio loading
-            yield extracted_text, status, gr.update(value="Generating explanations..."), None, gr.update(visible=False)
-
-            # Step 2: Generate explanations
-            try:
-                explanations = extractor.generate_explanations(extracted_text)
-
-                # Show explanations immediately, update status for audio loading
-                yield extracted_text, gr.update(value="Generating audio..."), explanations, None, gr.update(visible=False)
-
-                # Step 3: Generate audio
-                try:
-                    from ui.chatterbox.generate_tts_audio import generate_tts_audio
-
-                    # Clean up the text for better TTS
-                    clean_text = explanations.strip()
-
-                    # Limit text length for TTS (assuming 1000 character limit)
-                    if len(clean_text) > 1000:
-                        sentences = clean_text[:950].split('.')
-                        if len(sentences) > 1:
-                            clean_text = '.'.join(sentences[:-1]) + '.'
-                        else:
-                            clean_text = clean_text[:950]
-                        clean_text += " [Text has been truncated for audio generation]"
-
-                    audio_result = generate_tts_audio(clean_text, None)
-
-                    # Show everything, update status to complete
-                    yield extracted_text, gr.update(value="All steps complete!"), explanations, audio_result, gr.update(visible=True)
-
-                except Exception as audio_error:
-                    # Show explanations, update status with audio error
-                    yield extracted_text, gr.update(value=f"Audio generation failed: {str(audio_error)}"), explanations, None, gr.update(visible=False)
-
-            except Exception as explanation_error:
-                # Show extracted text, but indicate explanation error
-                yield extracted_text, status, f"Error generating explanations: {str(explanation_error)}", None, gr.update(visible=False)
-
-        except Exception as e:
-            yield "", f"Error processing PDF: {str(e)}", "", None, gr.update(visible=False)
-
-    def generate_explanations(extracted_text):
-        """Generate explanations for extracted text"""
-        if not extracted_text or extracted_text.strip() == "":
-            return "No text available to explain. Please extract text from a PDF first."
-
-        try:
-            # Initialize extractor
-            extractor = PDFTextExtractor()
-
-            # Generate explanations
-            explanations = extractor.generate_explanations(extracted_text)
-            return explanations
-
-        except Exception as e:
-            return f"Error generating explanations: {str(e)}"
-
-    def generate_audio(explanation_text):
-        """Generate TTS audio for explanations"""
-        if not explanation_text or explanation_text.strip() == "":
-            raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")
-
-        try:
-            # Import the TTS function
-            from ui.chatterbox.generate_tts_audio import generate_tts_audio
-
-            # Clean up the text for better TTS
-            clean_text = explanation_text.strip()
-
-            # Limit text length for TTS (assuming 1000 character limit)
-            if len(clean_text) > 1000:
-                # Truncate at sentence boundary if possible
-                sentences = clean_text[:950].split('.')
-                if len(sentences) > 1:
-                    clean_text = '.'.join(sentences[:-1]) + '.'
-                else:
-                    clean_text = clean_text[:950]
-                clean_text += " [Text has been truncated for audio generation]"
-
-            # Generate audio and make it visible
-            audio_result = generate_tts_audio(clean_text, None)
-            return audio_result, gr.update(visible=True)
-
-        except Exception as e:
-            raise gr.Error(f"Error generating audio: {str(e)}")
-    # Create the interface with side-by-side layout
-    with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as demo:
-        # Inject fullscreen CSS
-        gr.HTML("""
-<style>
-html, body, #root, .gradio-container {
-height: 100% !important;
-width: 100% !important;
-margin: 0 !important;
-padding: 0 !important;
-}
-.gradio-container {
-max-width: 100vw !important;
-min-height: 100vh !important;
-box-sizing: border-box;
-}
-</style>
-""")
-
-        gr.Markdown("# 🔍 PDF Text Extractor")
-        gr.Markdown("Upload a PDF on the left to automatically extract and view text on the right.")
-
-        with gr.Row(equal_height=True):
-            # Left column - PDF Display
-            with gr.Column(scale=1):
-                gr.Markdown("### 📄 PDF Document")
-                pdf_input = PDF(
-                    label="Upload and View PDF",
-                    height=600,
-                    interactive=True
-                )
-
-                status_output = gr.Textbox(
-                    label="Status",
-                    lines=2,
-                    placeholder="Upload a PDF to see status...",
-                    interactive=False
-                )
-            # Right column - Extracted Content with Tabs
-            with gr.Column(scale=1):
-                gr.Markdown("### 📝 Extracted Content")
-
-                with gr.Tabs():
-                    with gr.TabItem("Extracted Text"):
-                        text_output = gr.Textbox(
-                            label="Extracted Text",
-                            lines=20,
-                            placeholder="Upload a PDF to automatically extract text...",
-                            show_copy_button=True,
-                            interactive=False
-                        )
-                    with gr.TabItem("Explanation Script"):
-                        explanation_output = gr.Textbox(
-                            label="Generated Explanation Script",
-                            lines=15,
-                            placeholder="Explanations will be automatically generated after text extraction...",
-                            show_copy_button=True,
-                            interactive=False
-                        )
-
-                # Audio generation section (below tabs)
-                gr.Markdown("### 🔊 Audio Generation")
-                audio_output = gr.Audio(
-                    label="Generated Explanation Audio",
-                    interactive=False,
-                    visible=False
-                )  # Set up automatic processing on PDF upload (now handles all steps)
-        pdf_input.upload(
-            fn=process_pdf,
-            inputs=[pdf_input],
-            outputs=[text_output, status_output, explanation_output, audio_output, audio_output]
-        )
-
-    return demo
-
-if __name__ == "__main__":
-    demo = main()
-    demo.launch()
api.py
DELETED
@@ -1,48 +0,0 @@
-"""
-API for testing the TextExplainer using FastAPI.
-"""
-
-from fastapi import FastAPI, HTTPException, Form
-from pydantic import BaseModel
-from utils.text_explainer import TextExplainer
-import os
-from dotenv import load_dotenv
-
-# Load environment variables
-load_dotenv()
-
-app = FastAPI(title="Text Explainer API")
-
-class ExplainRequest(BaseModel):
-    text: str
-
-class ExplainResponseSection(BaseModel):
-    heading: str
-    content: str
-    explanation: str
-    level: int
-
-class ExplainResponse(BaseModel):
-    sections: list[ExplainResponseSection]
-    chat_history: list[dict]  # Add chat history to the response
-
-@app.post("/explain-text", response_model=ExplainResponse)
-def explain_text(request: ExplainRequest = None, text: str = Form(None)):
-    # Accept either JSON or form data
-    input_text = None
-    if request and request.text:
-        input_text = request.text
-    elif text:
-        input_text = text
-    if not input_text or not input_text.strip():
-        raise HTTPException(status_code=400, detail="Text is required.")
-    try:
-        explainer = TextExplainer()
-        explained_sections = explainer.explain_all_sections(input_text)
-        chat_history = explainer.get_chat_history()  # Get chat history
-        return {
-            "sections": explained_sections,
-            "chat_history": chat_history
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
app.py
CHANGED
@@ -1,35 +1,17 @@
-"""
-PDF Text Extractor Application
-Main entry point for the PDF Text Extractor application.
-"""
+"""Main entry point for the PDF Explainer app."""
 
-import
+import gradio as gr
+from src.processors.pdf_processor import PDFProcessor
+from src.ui_components.interface import build_interface
 from dotenv import load_dotenv
-
-
+
+load_dotenv()
 
 def main():
-
-
-
-    load_dotenv()
-
-    # Check for API key
-    check_api_key()
-
-    # Create and launch the interface
-    interface = create_interface()
-
-    # Get application configuration
-    app_config = get_app_config()
-
-    # Launch with appropriate settings
-    interface.launch(
-        # server_port=app_config["server_port"],
-        debug=app_config["debug"],
-        quiet=app_config["quiet"],
-        max_file_size=app_config["max_file_size"]
-    )
+    pdf_processor = PDFProcessor()
+    demo = build_interface(pdf_processor.process_pdf)
+    return demo
 
 if __name__ == "__main__":
-    main()
+    demo = main()
+    demo.launch()
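For readability, here is `app.py` as it reads after this commit, assembled from the unchanged and added lines of the hunk above. The four-space indentation is assumed, since the diff view drops leading whitespace.

```python
"""Main entry point for the PDF Explainer app."""

import gradio as gr
from src.processors.pdf_processor import PDFProcessor
from src.ui_components.interface import build_interface
from dotenv import load_dotenv

load_dotenv()

def main():
    # Build the Gradio app around a single PDFProcessor instance.
    pdf_processor = PDFProcessor()
    demo = build_interface(pdf_processor.process_pdf)
    return demo

if __name__ == "__main__":
    demo = main()
    demo.launch()
```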
pdf_text_extractor.py
DELETED
@@ -1,287 +0,0 @@
-import base64
-import os
-from typing import Optional, Tuple, List, Dict, Any
-from mistralai import Mistral
-from utils.text_explainer import TextExplainer
-
-class PDFTextExtractor:
-    """PDF text extraction using Mistral AI OCR."""
-
-    def __init__(self):
-        """Initialize the PDF text extractor with Mistral AI client."""
-        self.api_key = os.environ.get("MISTRAL_API_KEY")
-        if not self.api_key:
-            raise ValueError("MISTRAL_API_KEY environment variable is required")
-        self.client = Mistral(api_key=self.api_key)
-        self.text_explainer = TextExplainer()
-
-    def encode_pdf(self, pdf_path: str) -> Optional[str]:
-        """
-        Encode the PDF file to base64.
-
-        Args:
-            pdf_path: Path to the PDF file
-
-        Returns:
-            Base64 encoded string or None if error
-        """
-        try:
-            with open(pdf_path, "rb") as pdf_file:
-                return base64.b64encode(pdf_file.read()).decode('utf-8')
-        except FileNotFoundError:
-            print(f"Error: The file {pdf_path} was not found.")
-            return None
-        except Exception as e:
-            print(f"Error encoding PDF: {e}")
-            return None
-
-    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
-        """
-        Extract text and images from uploaded PDF using Mistral AI OCR.
-
-        Args:
-            pdf_file: Gradio file object
-
-        Returns:
-            Tuple of (extracted_text, status_message, images_data)
-        """
-        if pdf_file is None:
-            return "", "Please upload a PDF file.", []
-
-        try:
-            # Get the file path from Gradio file object
-            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
-
-            # Encode PDF to base64
-            base64_pdf = self.encode_pdf(pdf_path)
-            if base64_pdf is None:
-                return "", "Failed to encode PDF file.", []
-
-            # Process with Mistral OCR
-            print(f"🔄 Processing PDF with Mistral OCR...")
-            ocr_response = self.client.ocr.process(
-                model="mistral-ocr-latest",
-                document={
-                    "type": "document_url",
-                    "document_url": f"data:application/pdf;base64,{base64_pdf}"
-                },
-                include_image_base64=True
-            )
-
-            # Enhanced debugging and response parsing
-            print("🔍 Analyzing OCR Response Structure...")
-            print(f" Type: {type(ocr_response)}")
-            print(f" String representation: {str(ocr_response)[:500]}...")
-
-            # Check if it's a simple object with attributes
-            if hasattr(ocr_response, '__dict__'):
-                print(f" Object attributes: {list(ocr_response.__dict__.keys())}")
-                for key, value in ocr_response.__dict__.items():
-                    print(f" {key}: {type(value)} = {str(value)[:100]}...")
-
-            # Check if it has commonly expected attributes
-            common_attrs = ['text', 'content', 'result', 'data', 'output', 'extracted_text', 'ocr_text', 'choices', 'message']
-            for attr in common_attrs:
-                if hasattr(ocr_response, attr):
-                    value = getattr(ocr_response, attr)
-                    print(f" Has '{attr}': {type(value)} = {str(value)[:100]}...")
-
-            # Check if it's iterable but not a string
-            try:
-                if hasattr(ocr_response, '__iter__') and not isinstance(ocr_response, str):
-                    print(f" Iterable with {len(list(ocr_response))} items")
-                    for i, item in enumerate(ocr_response):
-                        if i < 3:  # Show first 3 items
-                            print(f" Item {i}: {type(item)} = {str(item)[:100]}...")
-            except Exception as e:
-                print(f" Error checking iteration: {e}")
-
-            # Advanced text extraction with multiple strategies
-            extracted_text = ""
-            extraction_method = "none"
-            extracted_images = []
-
-            # Strategy 1: Mistral OCR specific - pages with markdown content and images
-            if hasattr(ocr_response, 'pages') and ocr_response.pages:
-                pages = ocr_response.pages
-                if isinstance(pages, list) and len(pages) > 0:
-                    page_texts = []
-
-                    for i, page in enumerate(pages):
-                        # Extract text
-                        if hasattr(page, 'markdown') and page.markdown:
-                            page_texts.append(page.markdown)
-                            print(f"✅ Found text in page {i} markdown: {len(page.markdown)} characters")
-
-                        # Extract images
-                        if hasattr(page, 'images') and page.images:
-                            for j, img in enumerate(page.images):
-                                image_data = {
-                                    'page': i,
-                                    'image_id': f"img-{i}-{j}",
-                                    'top_left_x': getattr(img, 'top_left_x', 0),
-                                    'top_left_y': getattr(img, 'top_left_y', 0),
-                                    'bottom_right_x': getattr(img, 'bottom_right_x', 0),
-                                    'bottom_right_y': getattr(img, 'bottom_right_y', 0),
-                                    'base64': getattr(img, 'image_base64', '')
-                                }
-                                extracted_images.append(image_data)
-                                print(f"✅ Found image in page {i}, image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
-
-                    if page_texts:
-                        extracted_text = "\n\n".join(page_texts)
-                        extraction_method = f"pages_markdown_{len(page_texts)}_pages"
-
-            # Try to extract images from other response structures if no images found yet
-            if not extracted_images:
-                # Check if response has images attribute directly
-                if hasattr(ocr_response, 'images') and ocr_response.images:
-                    for j, img in enumerate(ocr_response.images):
-                        image_data = {
-                            'page': 0,
-                            'image_id': getattr(img, 'id', f"img-{j}"),
-                            'top_left_x': getattr(img, 'top_left_x', 0),
-                            'top_left_y': getattr(img, 'top_left_y', 0),
-                            'bottom_right_x': getattr(img, 'bottom_right_x', 0),
-                            'bottom_right_y': getattr(img, 'bottom_right_y', 0),
-                            'base64': getattr(img, 'image_base64', '')
-                        }
-                        extracted_images.append(image_data)
-                        print(f"✅ Found image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
-
-            # Continue with fallback strategies for text extraction
-            if not extracted_text:
-                # Strategy 2: Direct text attribute (fallback)
-                if hasattr(ocr_response, 'text') and ocr_response.text:
-                    extracted_text = str(ocr_response.text)
-                    extraction_method = "direct_text_attribute"
-
-                # Strategy 3: Content attribute (fallback)
-                elif hasattr(ocr_response, 'content') and ocr_response.content:
-                    content = ocr_response.content
-                    if isinstance(content, str):
-                        extracted_text = content
-                        extraction_method = "content_attribute_string"
-                    elif hasattr(content, 'text'):
-                        extracted_text = str(content.text)
-                        extraction_method = "content_text_attribute"
-                    else:
-                        extracted_text = str(content)
-                        extraction_method = "content_attribute_converted"
-
-                # Strategy 4: Result attribute (fallback)
-                elif hasattr(ocr_response, 'result'):
-                    result = ocr_response.result
-                    if isinstance(result, str):
-                        extracted_text = result
-                        extraction_method = "result_string"
-                    elif hasattr(result, 'text'):
-                        extracted_text = str(result.text)
-                        extraction_method = "result_text_attribute"
-                    elif isinstance(result, dict) and 'text' in result:
-                        extracted_text = str(result['text'])
-                        extraction_method = "result_dict_text"
-                    else:
-                        extracted_text = str(result)
-                        extraction_method = "result_converted"
-
-                # Strategy 5: Choices attribute (ChatGPT-style response - fallback)
-                elif hasattr(ocr_response, 'choices') and ocr_response.choices:
-                    choices = ocr_response.choices
-                    if isinstance(choices, list) and len(choices) > 0:
-                        choice = choices[0]
-                        if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
-                            extracted_text = str(choice.message.content)
-                            extraction_method = "choices_message_content"
-                        elif hasattr(choice, 'text'):
-                            extracted_text = str(choice.text)
-                            extraction_method = "choices_text"
-                        else:
-                            extracted_text = str(choice)
-                            extraction_method = "choices_converted"
-
-                # Strategy 6: Dict-like access (fallback)
-                elif hasattr(ocr_response, 'get') or isinstance(ocr_response, dict):
-                    for key in ['text', 'content', 'result', 'extracted_text', 'ocr_text', 'output']:
-                        if hasattr(ocr_response, 'get'):
-                            value = ocr_response.get(key)
-                        else:
-                            value = ocr_response.get(key) if isinstance(ocr_response, dict) else None
-
-                        if value:
-                            extracted_text = str(value)
-                            extraction_method = f"dict_key_{key}"
-                            break
-
-                # Strategy 7: Inspect all attributes for string-like content (fallback)
-                elif hasattr(ocr_response, '__dict__'):
-                    for key, value in ocr_response.__dict__.items():
-                        if isinstance(value, str) and len(value) > 20:  # Likely text content
-                            extracted_text = value
-                            extraction_method = f"attribute_{key}"
-                            break
-                        elif hasattr(value, 'text') and isinstance(value.text, str):
-                            extracted_text = str(value.text)
-                            extraction_method = f"nested_text_in_{key}"
-                            break
-
-            # Strategy 8: Convert entire response to string if it seems to contain text (fallback)
-            if not extracted_text:
-                response_str = str(ocr_response)
-                if len(response_str) > 50 and not response_str.startswith('<'):  # Not an object reference
-                    extracted_text = response_str
-                    extraction_method = "full_response_string"
-
-            print(f"🎯 Extraction method used: {extraction_method}")
-            print(f"📏 Extracted text length: {len(extracted_text)} characters")
-            print(f"🖼️ Extracted images: {len(extracted_images)}")
-
-            if extracted_text:
-                status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
-                if extracted_images:
-                    status += f" and {len(extracted_images)} image(s)"
-            else:
-                extracted_text = "No text could be extracted from this PDF."
-                status = "⚠️ OCR completed but no text was found in response."
-                if extracted_images:
-                    status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
-                print(f"❌ No extractable text found in OCR response")
-
-            return extracted_text, status, extracted_images
-
-        except Exception as e:
-            error_msg = f"Error processing PDF: {str(e)}"
-            print(error_msg)
-            return "", f"❌ {error_msg}", []
-
-    def generate_explanations(self, extracted_text: str) -> str:
-        """
-        Generate explanations for the extracted text sections.
-
-        Args:
-            extracted_text: The extracted text from PDF
-
-        Returns:
-            Formatted explanations for all sections
-        """
-        try:
-            if not extracted_text or extracted_text.strip() == "":
-                return "No text available to explain."
-
-            if extracted_text.startswith("No text could be extracted"):
-                return "Cannot generate explanations - no text was extracted from the PDF."
-
-            print("🤖 Generating explanations for extracted text...")
-            explained_sections = self.text_explainer.explain_all_sections(extracted_text)
-
-            if not explained_sections:
-                return "No sections found to explain in the extracted text."
-
-            formatted_explanations = self.text_explainer.format_explanations_for_display(explained_sections)
-            return formatted_explanations
-
-        except Exception as e:
-            error_msg = f"Error generating explanations: {str(e)}"
-            print(error_msg)
-            return f"❌ {error_msg}"
-
sample_text.md
DELETED
@@ -1,15 +0,0 @@
-# Sporting Exchanges between China and the United States, 1980-1984: Inevitable Politics and Excessive Political Strings
-
-Y. Andrew Hao and Thomas M. Hunt<br>Department of Kinesiology and Health Education, The University of Texas at Austin, Austin, TX, USA
-
-#### Abstract
-
-Sino-US sporting exchanges between 1980 and 1984 largely paralleled the patterns of the larger bilateral relations between the two nations. The over-politicization of sports by the two governments - and especially by the PRC - created the parallelism. Curiously, scholars of sport and international relations have paid little attention to Sino-US athletic interactions in this period, an oversight that needs to be remedied in light of the reciprocal correlations between international sport and international politics. Indeed, Sino-US athletic exchanges in the context of their bilateral relations underscores the mutual connections between sport and diplomacy.
-
-On January 1, 1979 - eight years after the initiation of the 'Ping-pong diplomacy' and seven after then-US President Richard Nixon's visit to China - the United States diplomatically recognized the People's Republic of China (PRC) with its capital in Beijing and rescinded its recognition of the Republic of China (ROC). At the end of the same year, the International Olympic Committee (IOC) welcomed the Chinese Olympic Committee back to the Olympic Movement. ${ }^{1}$ The Republic of China Olympic Committee, which previously monopolized the seat of China but only governed the sporting affairs of Taiwan and surrounding breakaway islands, was forced to change its name, flag and anthem.
-
-The two incidents' proximity in time was more a coincidence than not - the US recognition did not directly cause China's reinstatement into the Olympics. Rather, both were trophies that Beijing garnered thanks to its rising power and strategic advantage in world politics. The Sino-US rapprochement resulted from changing power dynamics within the Sino-USSR-US strategic triangle in the 1970s: having parted way with the Soviet Union, China befriended the United States, which, under the 'Nixon doctrine', had offered an olive branch; the ensuing deterioration of SovietUS relations drove Beijing and Washington to enter into a closer relationship after 1978. ${ }^{2}$ The Olympic reinstatement, however, took place directly as the result of
-
-[^0]
-[^0]: CONTACT Thomas M. Hunt (1) [email protected] (2) Department of Kinesiology and Health Education, The University of Texas at Austin, 2109 San Jacinto Blvd, Stop D3700, Austin, TX 78712-1415, USA
-(C) 2019 Informa UK Limited, trading as Taylor \& Francis Group
src/app.py
DELETED
@@ -1,17 +0,0 @@
-"""Main entry point for the PDF Explainer app."""
-
-import gradio as gr
-from processors.pdf_processor import PDFProcessor
-from ui_components.interface import build_interface
-from dotenv import load_dotenv
-
-load_dotenv()
-
-def main():
-    pdf_processor = PDFProcessor()
-    demo = build_interface(pdf_processor.process_pdf)
-    return demo
-
-if __name__ == "__main__":
-    demo = main()
-    demo.launch()
src/ui_components/interface.py
CHANGED
@@ -46,7 +46,7 @@ def build_interface(process_pdf_fn):
     audio_output = gr.Audio(
         label="Generated Explanation Audio",
         interactive=False,
-        visible=False
+        visible=False,
     )
 
     pdf_input.upload(
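The only change in this hunk is the trailing comma added after `visible=False`. A minimal sketch of the resulting block, with the surrounding indentation assumed since the diff view drops it:

```python
audio_output = gr.Audio(
    label="Generated Explanation Audio",
    interactive=False,
    visible=False,  # trailing comma added; the audio player still starts hidden
)
```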
tests/test_ocr_direct.py
DELETED
@@ -1,234 +0,0 @@
-"""
-Quick OCR Test Script
-Tests the Mistral AI OCR functionality directly without the Gradio interface.
-"""
-
-import base64
-import os
-import tempfile
-from mistralai import Mistral
-from dotenv import load_dotenv
-
-# Load environment variables
-load_dotenv()
-
-def create_simple_pdf_content():
-    """Create a minimal PDF in memory for testing."""
-    # Simple PDF content (this is a basic PDF structure)
-    pdf_content = """%PDF-1.4
-1 0 obj
-<<
-/Type /Catalog
-/Pages 2 0 R
->>
-endobj
-
-2 0 obj
-<<
-/Type /Pages
-/Kids [3 0 R]
-/Count 1
->>
-endobj
-
-3 0 obj
-<<
-/Type /Page
-/Parent 2 0 R
-/MediaBox [0 0 612 792]
-/Contents 4 0 R
-/Resources <<
-/Font <<
-/F1 5 0 R
->>
->>
->>
-endobj
-
-4 0 obj
-<<
-/Length 44
->>
-stream
-BT
-/F1 12 Tf
-72 720 Td
-(Hello World! Test OCR) Tj
-ET
-endstream
-endobj
-
-5 0 obj
-<<
-/Type /Font
-/Subtype /Type1
-/BaseFont /Helvetica
->>
-endobj
-
-xref
-0 6
-0000000000 65535 f
-0000000010 00000 n
-0000000079 00000 n
-0000000173 00000 n
-0000000301 00000 n
-0000000380 00000 n
-trailer
-<<
-/Size 6
-/Root 1 0 R
->>
-startxref
-456
-%%EOF"""
-
-    return pdf_content.encode('utf-8')
-
-def test_mistral_ocr():
-    """Test the Mistral OCR functionality directly."""
-
-    print("🧪 Starting Mistral OCR Test...")
-
-    # Check API key
-    api_key = os.environ.get("MISTRAL_API_KEY")
-    if not api_key:
-        print("❌ MISTRAL_API_KEY environment variable not found")
-        print(" Please set it in your .env file or environment")
-        return False
-
-    print(f"✅ API key found: {api_key[:8]}...")
-
-    try:
-        # Initialize Mistral client
-        client = Mistral(api_key=api_key)
-        print("✅ Mistral client initialized")
-
-        # Create a simple test PDF
-        pdf_content = create_simple_pdf_content()
-        base64_pdf = base64.b64encode(pdf_content).decode('utf-8')
-        print(f"✅ Test PDF created ({len(pdf_content)} bytes)")
-
-        # Test the OCR endpoint
-        print("🔄 Sending OCR request to Mistral...")
-
-        response = client.ocr.process(
-            model="mistral-ocr-latest",
-            document={
-                "type": "document_url",
-                "document_url": f"data:application/pdf;base64,{base64_pdf}"
-            },
-            include_image_base64=True
-        )
-
-        print("✅ OCR request completed")
-
-        # Analyze the response
-        print("\n🔍 RESPONSE ANALYSIS:")
-        print(f"Response type: {type(response)}")
-        print(f"Response string: {str(response)[:200]}...")
-
-        if hasattr(response, '__dict__'):
-            print(f"Response attributes: {list(response.__dict__.keys())}")
-            for key, value in response.__dict__.items():
-                print(f" {key}: {type(value)} = {str(value)[:100]}...")
-        # Test all possible text extraction methods
-        print("\n🎯 TESTING TEXT EXTRACTION METHODS:")
-
-        methods = [
-            ("response.pages[].markdown", lambda r: "\n".join([page.markdown for page in r.pages]) if hasattr(r, 'pages') and r.pages and all(hasattr(p, 'markdown') for p in r.pages) else None),
-            ("response.text", lambda r: getattr(r, 'text', None)),
-            ("response.content", lambda r: getattr(r, 'content', None)),
-            ("response.result", lambda r: getattr(r, 'result', None)),
-            ("response.data", lambda r: getattr(r, 'data', None)),
-            ("response['text']", lambda r: r.get('text') if hasattr(r, 'get') else None),
-            ("response['content']", lambda r: r.get('content') if hasattr(r, 'get') else None),
-        ]
-
-        extracted_text = None
-        successful_method = None
-
-        for method_name, method_func in methods:
-            try:
-                result = method_func(response)
-                if result:
-                    print(f"✅ {method_name}: Found content ({len(str(result))} chars)")
-                    print(f" Content: {str(result)[:100]}...")
-                    if not extracted_text:  # Use the first successful method
-                        extracted_text = str(result)
-                        successful_method = method_name
-                else:
-                    print(f"❌ {method_name}: No content found")
-            except Exception as e:
-                print(f"❌ {method_name}: Error - {e}")
-
-        if extracted_text:
-            print(f"\n🎉 SUCCESSFULLY EXTRACTED TEXT using {successful_method}:")
-            print(f"📝 Full extracted text: '{extracted_text}'")
-        else:
-            print(f"\n❌ NO TEXT EXTRACTED from any method")
-
-        return True
-
-    except Exception as e:
-        print(f"❌ OCR test failed: {e}")
-        print(f" Error type: {type(e)}")
-
-        # If it's a 401 error, the API key might be invalid
-        if "401" in str(e) or "unauthorized" in str(e).lower():
-            print(" This might be an API key issue. Please check your MISTRAL_API_KEY")
-
-        return False
-
-def test_api_connectivity():
-    """Test basic connectivity to Mistral API."""
-
-    print("🌐 Testing API connectivity...")
-
-    api_key = os.environ.get("MISTRAL_API_KEY")
-    if not api_key:
-        print("❌ No API key found")
-        return False
-
-    try:
-        client = Mistral(api_key=api_key)
-
-        # Try a simple API call (if available)
-        # Note: This might fail if the endpoint doesn't exist, but it tests connectivity
-        print("🔄 Testing API connection...")
-
-        # The exact method to test connectivity may vary based on Mistral's API
-        # For now, we'll just try to initialize and catch any immediate errors
-        print("✅ Mistral client appears to be working")
-        return True
-
-    except Exception as e:
-        print(f"❌ API connectivity test failed: {e}")
-        return False
-
-def main():
-    """Main test function."""
-
-    print("🚀 Mistral OCR Quick Test")
-    print("=" * 40)
-
-    # Test API connectivity first
-    if not test_api_connectivity():
-        print("\n❌ Basic connectivity test failed")
-        return
-
-    print("\n" + "="*40)
-
-    # Test OCR functionality
-    if test_mistral_ocr():
-        print("\n✅ OCR test completed - check the response analysis above")
-    else:
-        print("\n❌ OCR test failed")
-
-    print("\n💡 Next steps:")
-    print(" 1. If the test worked, run: python main.py")
-    print(" 2. If there were errors, check the API key and try again")
-    print(" 3. Use the response analysis to improve text extraction")
-
-if __name__ == "__main__":
-    main()
tests/test_setup.py
DELETED
@@ -1,62 +0,0 @@
-"""
-Test script for PDF Extractor setup validation
-"""
-
-import sys
-import os
-from dotenv import load_dotenv
-
-def test_imports():
-    """Test if all required packages are importable."""
-    try:
-        import gradio as gr
-        print("✅ Gradio imported successfully")
-
-        import mistralai
-        print("✅ Mistral AI imported successfully")
-
-        from dotenv import load_dotenv
-        print("✅ python-dotenv imported successfully")
-
-        return True
-    except ImportError as e:
-        print(f"❌ Import error: {e}")
-        return False
-
-def test_environment():
-    """Test environment variable setup."""
-    load_dotenv()
-
-    api_key = os.environ.get("MISTRAL_API_KEY")
-    if api_key:
-        # Don't print the actual key, just confirm it exists
-        print("✅ MISTRAL_API_KEY environment variable is set")
-        return True
-    else:
-        print("⚠️ MISTRAL_API_KEY not found in environment")
-        print(" Please copy .env.example to .env and add your API key")
-        return False
-
-def main():
-    """Run all tests."""
-    print("🔍 PDF Extractor Setup Validation")
-    print("=" * 40)
-
-    import_success = test_imports()
-    env_success = test_environment()
-
-    print("\n" + "=" * 40)
-    if import_success:
-        print("✅ All packages are properly installed")
-        if env_success:
-            print("✅ Environment is configured correctly")
-            print("🚀 Ready to run: python main.py")
-        else:
-            print("⚠️ Environment needs configuration")
-            print("📝 Next step: Set up your .env file")
-    else:
-        print("❌ Package installation incomplete")
-        print("📝 Next step: pip install -r requirements.txt")
-
-if __name__ == "__main__":
-    main()
ui/__init__.py
DELETED
@@ -1,15 +0,0 @@
-"""UI components for PDF Text Extractor."""
-from ui.interface import create_interface
-from ui.handlers import copy_text, download_text, process_images_for_display
-from ui.components import (
-    create_header, create_upload_section, create_action_button,
-    create_text_display, create_action_buttons, create_image_gallery,
-    apply_custom_css
-)
-
-__all__ = [
-    "create_interface", "copy_text", "download_text", "process_images_for_display",
-    "create_header", "create_upload_section", "create_action_button",
-    "create_text_display", "create_action_buttons", "create_image_gallery",
-    "apply_custom_css"
-]
ui/chatterbox/check_api_health.py
DELETED
@@ -1,13 +0,0 @@
-def check_api_health():
-    import requests
-    import os
-    HEALTH_ENDPOINT = os.getenv("HEALTH_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/health")
-    try:
-        response = requests.get(HEALTH_ENDPOINT, timeout=10)
-        if response.status_code == 200:
-            data = response.json()
-            return f"✅ API Status: {data.get('status', 'Unknown')} | Model Loaded: {data.get('model_loaded', False)}"
-        else:
-            return f"⚠️ API returned status code: {response.status_code}"
-    except Exception as e:
-        return f"❌ API Health Check Failed: {str(e)}"
ui/chatterbox/custom_css.py
DELETED
@@ -1,9 +0,0 @@
-custom_css = """
-.gradio-container {
-max-width: 1200px !important;
-}
-.status-box {
-padding: 10px;
-border-radius: 5px;
-}
-"""
ui/chatterbox/generate_sample_text.py
DELETED
@@ -1,10 +0,0 @@
-def generate_sample_text():
-    import random
-    samples = [
-        "Hello! This is a test of the Chatterbox TTS system running on Modal.",
-        "The quick brown fox jumps over the lazy dog.",
-        "Welcome to the future of text-to-speech technology.",
-        "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse.",
-        "This is an example of voice cloning using artificial intelligence.",
-    ]
-    return random.choice(samples)
ui/chatterbox/generate_tts_audio.py
DELETED
@@ -1,113 +0,0 @@
-def generate_tts_audio(text_input: str, audio_prompt_input, progress=None):
-    import os
-    import requests
-    import tempfile
-    import soundfile as sf
-    import numpy as np
-    import gradio as gr
-
-    GENERATE_AUDIO_ENDPOINT = os.getenv("GENERATE_AUDIO_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/generate_audio")
-    GENERATE_WITH_FILE_ENDPOINT = os.getenv("GENERATE_WITH_FILE_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/generate_with_file")
-
-    if not text_input or len(text_input.strip()) == 0:
-        raise gr.Error("Please enter some text to synthesize.")
-    if len(text_input) > 1000:
-        raise gr.Error("Text is too long. Maximum 1000 characters allowed.")
-
-    if progress: progress(0.1, desc="Preparing request...")
-
-    try:
-        if audio_prompt_input is None:
-            if progress: progress(0.3, desc="Sending request to API...")
-            payload = {"text": text_input}
-            response = requests.post(
-                GENERATE_AUDIO_ENDPOINT,
-                json=payload,
-                headers={"Content-Type": "application/json"},
-                timeout=120,
-                stream=True
-            )
-            if response.status_code != 200:
-                raise gr.Error(f"API Error: {response.status_code} - {response.text}")
-
-            if progress: progress(0.6, desc="Streaming audio response...")
-
-            # Get content length if available for progress tracking
-            content_length = response.headers.get('content-length')
-            if content_length:
-                content_length = int(content_length)
-
-            bytes_downloaded = 0
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-                for chunk in response.iter_content(chunk_size=8192):
-                    if chunk:
-                        temp_file.write(chunk)
-                        bytes_downloaded += len(chunk)
-
-                        # Update progress based on bytes downloaded
-                        if content_length and progress:
-                            download_progress = min(0.3, (bytes_downloaded / content_length) * 0.3)
-                            progress(0.6 + download_progress, desc=f"Downloading audio... ({bytes_downloaded // 1024}KB)")
-                        elif progress:
-                            # If no content length, just show bytes downloaded
-                            progress(0.6, desc=f"Downloading audio... ({bytes_downloaded // 1024}KB)")
-
-                temp_path = temp_file.name
-
-            if progress: progress(0.9, desc="Processing audio...")
-            audio_data, sample_rate = sf.read(temp_path)
-            os.unlink(temp_path)
-            if progress: progress(1.0, desc="Complete!")
-            return (sample_rate, audio_data)
-
-        else:
-            if progress: progress(0.3, desc="Preparing voice prompt...")
-            files = {'text': (None, text_input)}
-            with open(audio_prompt_input, 'rb') as f:
-                audio_content = f.read()
-            files['voice_prompt'] = ('voice_prompt.wav', audio_content, 'audio/wav')
-
-            if progress: progress(0.5, desc="Sending request with voice cloning...")
-            response = requests.post(
-                GENERATE_WITH_FILE_ENDPOINT,
-                files=files,
-                timeout=180,
-                stream=True
-            )
-            if response.status_code != 200:
-                raise gr.Error(f"API Error: {response.status_code} - {response.text}")
-
-            if progress: progress(0.8, desc="Streaming cloned voice response...")
-
-            # Get content length if available for progress tracking
-            content_length = response.headers.get('content-length')
-            if content_length:
-                content_length = int(content_length)
-
-            bytes_downloaded = 0
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-                for chunk in response.iter_content(chunk_size=8192):
-                    if chunk:
-                        temp_file.write(chunk)
-                        bytes_downloaded += len(chunk)
-
-                        # Update progress based on bytes downloaded for voice cloning
-                        if content_length and progress:
-                            download_progress = min(0.15, (bytes_downloaded / content_length) * 0.15)
-                            progress(0.8 + download_progress, desc=f"Downloading cloned audio... ({bytes_downloaded // 1024}KB)")
-                        elif progress:
-                            progress(0.8, desc=f"Downloading cloned audio... ({bytes_downloaded // 1024}KB)")
-
-                temp_path = temp_file.name
-
-            audio_data, sample_rate = sf.read(temp_path)
-            os.unlink(temp_path)
-            if progress: progress(1.0, desc="Voice cloning complete!")
-            return (sample_rate, audio_data)
-
-    except requests.exceptions.Timeout:
-        raise gr.Error("Request timed out. The API might be under heavy load. Please try again.")
-    except requests.exceptions.ConnectionError:
-        raise gr.Error("Unable to connect to the API. Please check if the endpoint URL is correct.")
-    except Exception as e:
-        raise gr.Error(f"Error generating audio: {str(e)}")
ui/chatterbox/update_char_count.py
DELETED
@@ -1,3 +0,0 @@
-def update_char_count(text):
-    count = len(text) if text else 0
-    return f"{count}/1000"
ui/components/apply_custom_css.py
DELETED
@@ -1,23 +0,0 @@
-import gradio as gr
-
-def apply_custom_css() -> gr.HTML:
-    """
-    Apply custom CSS styling.
-
-    Returns:
-        gr.HTML: HTML component with CSS styles
-    """
-    return gr.HTML("""
-<style>
-.gradio-container {
-max-width: 900px !important;
-}
-.output-markdown {
-font-family: 'Courier New', monospace;
-}
-.image-gallery-caption {
-text-align: center;
-font-size: 0.9em;
-}
-</style>
-""")
ui/components/create_action_button.py
DELETED
@@ -1,10 +0,0 @@
-import gradio as gr
-
-def create_action_button() -> gr.Button:
-    """
-    Create the extract text action button.
-
-    Returns:
-        gr.Button: Action button component
-    """
-    return gr.Button("Extract Text & Images", variant="primary")
ui/components/create_action_buttons.py
DELETED
@@ -1,14 +0,0 @@
-import gradio as gr
-from typing import Tuple
-
-def create_action_buttons() -> Tuple[gr.Button, gr.Button]:
-    """
-    Create copy and download action buttons.
-
-    Returns:
-        Tuple[gr.Button, gr.Button]: Copy and download button components
-    """
-    copy_btn = gr.Button("📋 Copy to Clipboard")
-    download_btn = gr.Button("📥 Download as Text File")
-
-    return copy_btn, download_btn
ui/components/create_header.py
DELETED
@@ -1,25 +0,0 @@
-
-import gradio as gr
-
-def create_header() -> gr.Markdown:
-    """
-    Create the application header.
-
-    Returns:
-        gr.Markdown: Header component
-    """
-    return gr.Markdown("""
-# 🔍 PDF Text Extractor with AI Explanations
-
-Extract text and images from PDF files using Mistral AI's OCR technology, then get simple explanations for each section.
-
-**Instructions:**
-1. Upload a PDF file using the file selector below
-2. Wait for processing to complete
-3. View the extracted text and images
-4. Click "Generate Explanations" to get AI-powered explanations of each section
-5. Use the Copy or Download buttons to save the extracted text or explanations
-
-**Supported:** PDF files up to 10MB
-""")
-
ui/components/create_image_gallery.py
DELETED
@@ -1,19 +0,0 @@
-import gradio as gr
-
-def create_image_gallery() -> gr.Gallery:
-    """
-    Create the image gallery component.
-
-    Returns:
-        gr.Gallery: Image gallery component
-    """
-    return gr.Gallery(
-        label="Extracted Images",
-        columns=3,
-        rows=2,
-        object_fit="contain",
-        height="auto",
-        visible=True,
-        show_label=True,
-        elem_id="image_gallery"
-    )
ui/components/create_text_display.py
DELETED
@@ -1,25 +0,0 @@
-import gradio as gr
-from typing import Tuple
-
-def create_text_display() -> Tuple[gr.Textbox, gr.Textbox]:
-    """
-    Create the text output and status display components.
-
-    Returns:
-        Tuple[gr.Textbox, gr.Textbox]: Text output and status components
-    """
-    text_output = gr.Textbox(
-        label="Extracted Text",
-        lines=10,
-        max_lines=20,
-        placeholder="Extracted text will appear here...",
-        show_copy_button=True
-    )
-
-    status_output = gr.Textbox(
-        label="Status",
-        lines=2,
-        placeholder="Upload a PDF to see status..."
-    )
-
-    return text_output, status_output
ui/components/create_upload_section.py
DELETED
@@ -1,14 +0,0 @@
-import gradio as gr
-
-def create_upload_section() -> gr.File:
-    """
-    Create the file upload component.
-
-    Returns:
-        gr.File: File upload component
-    """
-    return gr.File(
-        label="Upload PDF File",
-        file_types=[".pdf"],
-        file_count="single"
-    )
ui/handlers.py
DELETED
@@ -1,104 +0,0 @@
"""
Event handlers for UI components.
Contains functions that handle user interactions with the interface.
"""

import os
import tempfile
from typing import Optional, List, Dict, Any
from utils.pdf_image_extractor import PDFImageExtractor

def copy_text(text: str) -> str:
    """
    Handle Copy button click.

    Args:
        text: Text to copy to clipboard

    Returns:
        str: The input text (unchanged)
    """
    return text

def download_text(text: str) -> Optional[str]:
    """
    Handle Download button click.

    Args:
        text: Text to download

    Returns:
        Optional[str]: Path to the created text file or None if text is empty
    """
    import tempfile
    import os

    if not text:
        return None

    # Create a temporary file to hold the text
    temp_dir = tempfile.gettempdir()
    filename = "extracted_text.txt"
    file_path = os.path.join(temp_dir, filename)

    # Write the text to the file
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    return file_path

def process_images_for_display(images_data: List[Dict[str, Any]], pdf_path: str = None) -> List:
    """
    Process images for display in the Gradio gallery.

    Args:
        images_data: List of image data dictionaries from OCR response
        pdf_path: Path to the original PDF file for image extraction

    Returns:
        List: List of image paths for gallery display
    """
    if not images_data:
        return []

    # If we have PDF path and bounding box data, extract images from PDF
    if pdf_path and os.path.exists(pdf_path):
        print("🖼️ Extracting images from PDF using bounding box coordinates...")
        extracted_paths = PDFImageExtractor.extract_images_from_pdf(pdf_path, images_data)
        if extracted_paths:
            return extracted_paths

        # Fallback: extract all images from PDF if bounding box extraction failed
        print("🔄 Fallback: Extracting all images from PDF...")
        extracted_paths = PDFImageExtractor.extract_all_images_from_pdf(pdf_path)
        if extracted_paths:
            return extracted_paths[:len(images_data)]  # Limit to expected number of images

    # Fallback: use base64 data from OCR response
    print("🔄 Using base64 image data from OCR response...")
    gallery_images = []
    temp_dir = tempfile.gettempdir()

    for index, img_data in enumerate(images_data):
        try:
            # Get image base64 data
            base64_data = img_data.get('base64', '')
            if not base64_data:
                continue

            # Create a temporary file to save the image
            img_filename = f"extracted_image_fallback_{index}.jpg"
            img_path = os.path.join(temp_dir, img_filename)

            # Convert base64 to image file
            import base64
            with open(img_path, "wb") as img_file:
                img_file.write(base64.b64decode(base64_data))

            # Add path to gallery list (Gradio Gallery expects a list of paths)
            gallery_images.append(img_path)

        except Exception as e:
            print(f"Error processing image {index}: {str(e)}")

    return gallery_images
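
The removed interface.py below only imports process_images_for_display; judging by their docstrings, copy_text and download_text were handlers for the copy and download action buttons. A minimal sketch of wiring download_text to a button, with component names that are illustrative rather than taken from the removed code:

import gradio as gr
from ui.handlers import download_text

with gr.Blocks() as demo:
    text_box = gr.Textbox(label="Extracted Text", lines=10)
    download_btn = gr.Button("Download as .txt")
    download_file = gr.File(label="Download")

    # download_text writes the textbox contents to a temporary extracted_text.txt
    # and returns its path, which gr.File can then offer for download.
    download_btn.click(fn=download_text, inputs=text_box, outputs=download_file)

demo.launch()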
ui/interface.py
DELETED
@@ -1,223 +0,0 @@
"""
Interface creation module for PDF Text Extractor.
Defines the Gradio interface components and layout.
"""

import gradio as gr
from gradio_pdf import PDF
from pdf_text_extractor import PDFTextExtractor
from ui.handlers import process_images_for_display
from .components.create_header import create_header
from .components.create_upload_section import create_upload_section
from .components.create_action_button import create_action_button
from .components.create_image_gallery import create_image_gallery
from .components.apply_custom_css import apply_custom_css
from .chatterbox.generate_tts_audio import generate_tts_audio

def create_dummy_interface() -> gr.Blocks:
    """
    Create a simple interface for when the API key is not configured.

    Returns:
        gr.Blocks: Gradio interface with disabled functionality
    """
    with gr.Blocks(title="PDF Text Extractor") as interface:
        gr.Markdown("""
        # 🔍 PDF Text Extractor

        ⚠️ **API key not configured.** Please set MISTRAL_API_KEY environment variable and restart the application.
        """)

        # Create layout similar to main interface but disabled
        with gr.Row(equal_height=True):
            # Left column - PDF Display
            with gr.Column(scale=1):
                gr.Markdown("### 📄 PDF Document")
                PDF(
                    label="Upload and View PDF (Disabled)",
                    height=700,
                    interactive=False
                )

                gr.Textbox(
                    label="Status",
                    lines=2,
                    value="❌ MISTRAL_API_KEY environment variable is not set. Please set it and restart the application.",
                    interactive=False
                )

            # Right column - Extracted Content
            with gr.Column(scale=1):
                gr.Markdown("### 📝 Extracted Content")

                gr.Textbox(
                    label="Extracted Text",
                    lines=25,
                    value="API key not configured. Text extraction is unavailable.",
                    interactive=False
                )

    return interface

def create_main_interface(extractor: PDFTextExtractor) -> gr.Blocks:
    """
    Create the main application interface.

    Args:
        extractor: PDFTextExtractor instance

    Returns:
        gr.Blocks: Gradio interface with full functionality
    """

    def process_pdf_wrapper(pdf_file):
        """Process PDF with the extractor from closure"""
        extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)
        # Get PDF file path for image extraction
        pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file if pdf_file else None
        gallery_images = process_images_for_display(images_data, pdf_path)
        return extracted_text, status, gallery_images

    def generate_explanations_wrapper(extracted_text):
        """Generate explanations for extracted text"""
        if not extracted_text or extracted_text.strip() == "":
            return "No text available to explain. Please extract text from a PDF first."

        explanations = extractor.generate_explanations(extracted_text)
        # The explanation_status is now implicitly handled by the content of 'explanations'
        return explanations

    def generate_explanation_audio_wrapper(explanations_text):
        """Generate TTS audio for explanations using Chatterbox API"""
        if not explanations_text or explanations_text.strip() == "":
            raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")

        # Clean up the text for better TTS
        clean_text = explanations_text.strip()

        # Limit text length for TTS (Chatterbox has a 1000 character limit)
        if len(clean_text) > 1000:
            # Truncate at sentence boundary if possible
            sentences = clean_text[:950].split('.')
            if len(sentences) > 1:
                clean_text = '.'.join(sentences[:-1]) + '.'
            else:
                clean_text = clean_text[:950]
            clean_text += " [Text has been truncated for audio generation]"

        # Call the TTS function directly - it already handles gr.Error exceptions properly
        return generate_tts_audio(clean_text, None)

    def tts_click_handler(explanations_text):
        """Handle TTS button click with proper output handling"""
        try:
            audio_result = generate_explanation_audio_wrapper(explanations_text)
            return audio_result, gr.update(visible=True)
        except gr.Error:
            # Re-raise Gradio errors as-is (they're already properly formatted)
            raise
        except Exception as e:
            # Only wrap non-Gradio exceptions
            raise gr.Error(f"Unexpected error generating audio: {str(e)}")

    with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as interface:
        # Add the header
        create_header()

        # Create main layout with PDF on left and content on right
        with gr.Row(equal_height=True):
            # Left column - PDF Display
            with gr.Column(scale=1):
                gr.Markdown("### 📄 PDF Document")
                pdf_input = PDF(
                    label="Upload and View PDF",
                    height=700,
                    interactive=True
                )

                # Status display below PDF
                status_output = gr.Textbox(
                    label="Status",
                    lines=2,
                    placeholder="Upload a PDF to see status...",
                    interactive=False
                )

            # Right column - Extracted Content
            with gr.Column(scale=1):
                gr.Markdown("### 📝 Extracted Content")

                # Create tabs for text, explanations, and images
                with gr.Tabs():
                    with gr.TabItem("Extracted Text"):
                        text_output = gr.Textbox(
                            label="Extracted Text",
                            lines=25,
                            max_lines=30,
                            placeholder="Upload a PDF to automatically extract text...",
                            show_copy_button=True
                        )

                    with gr.TabItem("📚 Explanations"):
                        with gr.Row():
                            explain_btn = gr.Button("🤖 Generate Explanations", variant="secondary", size="lg")
                            tts_btn = gr.Button("🔊 Generate Audio", variant="secondary", size="lg")

                        explanations_output = gr.Textbox(
                            label="Text Explanations",
                            lines=20,
                            max_lines=25,
                            placeholder="Click 'Generate Explanations' after extracting text to get simple explanations of each section...",
                            show_copy_button=True
                        )

                        # Add audio output for explanations
                        explanation_audio_output = gr.Audio(
                            label="Explanation Audio",
                            interactive=False,
                            visible=False
                        )

                    with gr.TabItem("Extracted Images"):
                        image_gallery = create_image_gallery()
                        image_info = gr.Markdown("Images extracted from the PDF will appear here.")

        # Set up automatic PDF processing on upload
        pdf_input.upload(
            fn=process_pdf_wrapper,
            inputs=[pdf_input],
            outputs=[text_output, status_output, image_gallery]
        )

        # Handle explanation generation
        explain_btn.click(
            fn=generate_explanations_wrapper,
            inputs=[text_output],
            outputs=[explanations_output],  # Removed explanation_status from outputs
            show_progress=True
        )

        # Handle TTS generation for explanations
        tts_btn.click(
            fn=tts_click_handler,
            inputs=[explanations_output],
            outputs=[explanation_audio_output, explanation_audio_output],
            show_progress=True
        )

        # Apply custom CSS styling
        apply_custom_css()

    return interface

def create_interface() -> gr.Blocks:
    """
    Create and configure the Gradio interface.

    Returns:
        gr.Blocks: Configured Gradio interface
    """
    # Initialize the PDF extractor
    try:
        extractor = PDFTextExtractor()
        return create_main_interface(extractor)
    except ValueError as e:
        # Create a dummy interface if API key is missing
        return create_dummy_interface()
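
One detail in the removed tts_btn.click wiring: explanation_audio_output is listed twice in outputs, so tts_click_handler returns both the audio value and a gr.update that reveals the component. A stripped-down sketch of that pattern with generic names; the sine tone merely stands in for real TTS output.

import numpy as np
import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Generate")
    audio = gr.Audio(visible=False)   # hidden until there is something to play

    def make_audio():
        # A one-second 440 Hz tone stands in for the real TTS result.
        sr = 16000
        tone = 0.2 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
        # First return value fills the component, second one un-hides it.
        return (sr, tone), gr.update(visible=True)

    # Listing the same component twice lets one return value set its data
    # and the other toggle its visibility.
    btn.click(fn=make_audio, outputs=[audio, audio])

demo.launch()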
utils/__init__.py
DELETED
@@ -1,5 +0,0 @@
"""Utility functions for PDF Text Extractor."""
from utils.config import check_api_key, get_app_config
from utils.text_explainer import TextExplainer

__all__ = ["check_api_key", "get_app_config", "TextExplainer"]
utils/config.py
DELETED
@@ -1,40 +0,0 @@
"""
Configuration utilities for PDF Text Extractor.
Contains functions for handling environment variables and app configuration.
"""

import os
from typing import Dict, Any

def check_api_key() -> bool:
    """
    Check if the Mistral API key is set in environment variables.

    Returns:
        bool: True if API key is set, False otherwise
    """
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("⚠️ Warning: MISTRAL_API_KEY environment variable is not set.")
        print("   Please set it before using the PDF extraction functionality.")
        print("   Example: export MISTRAL_API_KEY='your-api-key-here'")
        print()
        return False
    return True

def get_app_config() -> Dict[str, Any]:
    """
    Get application configuration settings.

    Returns:
        Dict[str, Any]: Application configuration settings
    """
    return {
        "server_port": 7862,     # Use a different port to avoid conflicts
        "debug": True,           # Enable debug mode for development
        "quiet": False,          # Show startup messages
        "max_file_size": "10mb"  # Limit PDF file size
        # Uncomment the following to enable external access and public link sharing:
        # "server_name": "0.0.0.0",  # Allow external access
        # "share": True,             # Create public link
    }
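
For reference, the dict returned by get_app_config is shaped like keyword arguments for Blocks.launch(), so one plausible entry point for the removed modules looked roughly like the sketch below. The exact wiring is an assumption, not shown in this diff.

from dotenv import load_dotenv

from ui.interface import create_interface
from utils.config import check_api_key, get_app_config

load_dotenv()      # pick up MISTRAL_API_KEY from a local .env, if present
check_api_key()    # prints a warning and returns False when the key is missing

# Assumes the config keys match launch() parameters in the installed Gradio version.
create_interface().launch(**get_app_config())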
utils/pdf_image_extractor.py
DELETED
@@ -1,155 +0,0 @@
"""
PDF Image Extraction utilities.
Extracts images from PDF using bounding box coordinates.
"""

import os
import tempfile
from typing import List, Dict, Any, Optional
import fitz  # PyMuPDF
from PIL import Image
import base64
import io


class PDFImageExtractor:
    """Extract images from PDF using bounding box coordinates."""

    @staticmethod
    def extract_images_from_pdf(pdf_path: str, images_data: List[Dict[str, Any]]) -> List[str]:
        """
        Extract images from PDF using bounding box coordinates.

        Args:
            pdf_path: Path to the PDF file
            images_data: List of image data with bounding box coordinates

        Returns:
            List[str]: List of paths to extracted image files
        """
        if not images_data:
            return []

        try:
            # Open the PDF document
            pdf_doc = fitz.open(pdf_path)
            extracted_image_paths = []
            temp_dir = tempfile.gettempdir()

            for index, img_data in enumerate(images_data):
                try:
                    page_num = img_data.get('page', 0)

                    # Ensure page number is valid
                    if page_num >= len(pdf_doc):
                        print(f"Warning: Page {page_num} not found in PDF (max: {len(pdf_doc)-1})")
                        continue

                    # Get the page
                    page = pdf_doc[page_num]

                    # Get bounding box coordinates
                    top_left_x = img_data.get('top_left_x', 0)
                    top_left_y = img_data.get('top_left_y', 0)
                    bottom_right_x = img_data.get('bottom_right_x', 0)
                    bottom_right_y = img_data.get('bottom_right_y', 0)

                    # Create a rectangle for the bounding box
                    # PyMuPDF uses (x0, y0, x1, y1) format
                    bbox = fitz.Rect(top_left_x, top_left_y, bottom_right_x, bottom_right_y)

                    # Render the page as a pixmap with high resolution
                    mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
                    pix = page.get_pixmap(matrix=mat, clip=bbox)

                    # Convert pixmap to PIL Image
                    img_data_bytes = pix.tobytes("png")
                    img = Image.open(io.BytesIO(img_data_bytes))

                    # Save the image to a temporary file
                    img_filename = f"extracted_image_page{page_num}_{index}.png"
                    img_path = os.path.join(temp_dir, img_filename)
                    img.save(img_path, "PNG")

                    extracted_image_paths.append(img_path)
                    print(f"✅ Extracted image {index} from page {page_num}: {img_path}")

                except Exception as e:
                    print(f"Error extracting image {index}: {str(e)}")

                    # Fallback: try to use base64 data if available
                    base64_data = img_data.get('base64', '')
                    if base64_data:
                        try:
                            img_filename = f"extracted_image_base64_{index}.jpg"
                            img_path = os.path.join(temp_dir, img_filename)

                            with open(img_path, "wb") as img_file:
                                img_file.write(base64.b64decode(base64_data))

                            extracted_image_paths.append(img_path)
                            print(f"✅ Used base64 data for image {index}: {img_path}")
                        except Exception as e2:
                            print(f"Error using base64 data for image {index}: {str(e2)}")

            pdf_doc.close()
            return extracted_image_paths

        except Exception as e:
            print(f"Error opening PDF file: {str(e)}")
            return []

    @staticmethod
    def extract_all_images_from_pdf(pdf_path: str) -> List[str]:
        """
        Extract all images from PDF without using bounding boxes.
        This is a fallback method when no bounding box data is available.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List[str]: List of paths to extracted image files
        """
        try:
            pdf_doc = fitz.open(pdf_path)
            extracted_image_paths = []
            temp_dir = tempfile.gettempdir()

            for page_num in range(len(pdf_doc)):
                page = pdf_doc[page_num]
                image_list = page.get_images()

                for img_index, img in enumerate(image_list):
                    try:
                        # Get image data
                        xref = img[0]
                        pix = fitz.Pixmap(pdf_doc, xref)

                        # Convert to PNG if CMYK
                        if pix.n - pix.alpha < 4:  # GRAY or RGB
                            img_data = pix.tobytes("png")
                        else:  # CMYK: convert to RGB first
                            pix1 = fitz.Pixmap(fitz.csRGB, pix)
                            img_data = pix1.tobytes("png")
                            pix1 = None

                        # Save image
                        img_filename = f"all_images_page{page_num}_img{img_index}.png"
                        img_path = os.path.join(temp_dir, img_filename)

                        with open(img_path, "wb") as f:
                            f.write(img_data)

                        extracted_image_paths.append(img_path)
                        pix = None

                    except Exception as e:
                        print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")

            pdf_doc.close()
            return extracted_image_paths

        except Exception as e:
            print(f"Error extracting all images from PDF: {str(e)}")
            return []
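
A minimal sketch of calling the extractor directly. The dictionary keys are the ones the class reads ('page', 'top_left_x', and so on); the file name and coordinates below are placeholders, not values from the removed code.

from utils.pdf_image_extractor import PDFImageExtractor

# Placeholder path and bounding box, purely for illustration.
images_data = [
    {
        "page": 0,
        "top_left_x": 72, "top_left_y": 100,
        "bottom_right_x": 300, "bottom_right_y": 280,
    }
]

paths = PDFImageExtractor.extract_images_from_pdf("sample.pdf", images_data)
if not paths:
    # Fall back to pulling every embedded image when the crop-based pass fails.
    paths = PDFImageExtractor.extract_all_images_from_pdf("sample.pdf")
print(paths)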
utils/text_explainer.py
DELETED
@@ -1,341 +0,0 @@
"""
Text Explanation utilities using Mistral AI.
Splits text by markdown headings and generates contextual explanations for each section.
Maintains chat history to provide coherent explanations that build upon previous sections.
"""

import os
import re
from typing import List, Dict, Tuple, Optional
from mistralai import Mistral


class TextExplainer:
    """Generate explanations for text sections using Mistral AI."""

    def __init__(self):
        """Initialize the text explainer with Mistral AI client."""
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is required")
        self.client = Mistral(api_key=self.api_key)
        self.chat_history = []

    def get_topic(self, text: str) -> Optional[str]:
        """
        Extract the main topic from the text using Mistral AI with structured output.

        Args:
            text: Input text to analyze

        Returns:
            Main topic as a string or None if not found
        """
        try:
            # Define the JSON schema for structured output
            topic_schema = {
                "type": "json_schema",
                "json_schema": {
                    "schema": {
                        "type": "object",
                        "properties": {
                            "main_topic": {
                                "type": "string",
                                "title": "Main Topic",
                                "description": "The primary / general topic or subject of the text"
                            },
                        },
                        "required": ["main_topic"],
                        "additionalProperties": False
                    },
                    "name": "topic_extraction",
                    "strict": True
                }
            }

            response = self.client.chat.complete(
                model="ministral-8b-2410",  # Using a more recent model that supports structured output
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert in summarizing texts. Extract the main topic from the provided text."
                    },
                    {
                        "role": "user",
                        "content": f"Analyze this text and extract the main topic:\n\n{text[:2000]}..."  # Limit to first 2000 characters for performance
                    }
                ],
                temperature=0.3,  # Lower temperature for more consistent structured output
                max_tokens=200,
                response_format=topic_schema
            )

            if hasattr(response, 'choices') and response.choices:
                # Parse the structured JSON response
                import json
                try:
                    topic_data = json.loads(response.choices[0].message.content)
                    main_topic = topic_data.get("main_topic", "").strip()
                    confidence = topic_data.get("confidence", 0.0)
                    secondary_topics = topic_data.get("secondary_topics", [])

                    # Log the structured output for debugging
                    print(f"📊 Topic extraction - Main: '{main_topic}', Confidence: {confidence:.2f}")
                    if secondary_topics:
                        print(f"🔍 Secondary topics: {', '.join(secondary_topics)}")

                    return main_topic if main_topic else None
                except json.JSONDecodeError as json_err:
                    print(f"Error parsing JSON response: {json_err}")
                    # Fallback to raw content if JSON parsing fails
                    return response.choices[0].message.content.strip()
            return None
        except Exception as e:
            print(f"Error extracting topic: {str(e)}")
            return None

    def split_text_by_headings(self, text: str) -> List[Dict[str, str]]:
        """
        Split text into sections based on markdown headings.

        Args:
            text: Input text with markdown headings

        Returns:
            List of dictionaries with 'heading' and 'content' keys
        """
        if not text:
            return []

        # Split by markdown headings (# ## ### etc.)
        sections = []

        # Regex to find headings and their content
        # Matches: # Heading, ## Heading, ### Heading, etc.
        heading_pattern = r'^(#{1,6})\s+(.+?)$'

        lines = text.split('\n')
        current_heading = None
        current_content = []
        current_level = 0

        for line in lines:
            heading_match = re.match(heading_pattern, line.strip())

            if heading_match:
                # Save previous section if it exists
                if current_heading and current_content:
                    content_text = '\n'.join(current_content).strip()
                    if content_text:  # Only add if there's actual content
                        sections.append({
                            'heading': current_heading,
                            'content': content_text,
                            'level': current_level
                        })

                # Start new section
                level = len(heading_match.group(1))  # Count the # characters
                current_heading = heading_match.group(2).strip()
                current_level = level
                current_content = []
            else:
                # Add line to current content if we have a heading
                if current_heading is not None:
                    current_content.append(line)

        # Don't forget the last section
        if current_heading and current_content:
            content_text = '\n'.join(current_content).strip()
            if content_text:
                sections.append({
                    'heading': current_heading,
                    'content': content_text,
                    'level': current_level
                })

        # If no headings found, treat entire text as one section
        if not sections and text.strip():
            sections.append({
                'heading': 'Document Content',
                'content': text.strip(),
                'level': 1
            })
        return sections

    def generate_explanation(self, topic: str, heading: str, content: str, section_number: int = 1, total_sections: int = 1) -> str:
        """
        Generate an explanation for a text section using Mistral AI with chat history context.

        Args:
            topic: General topic of the document
            heading: Section heading
            content: Section content
            section_number: Current section number (for context)
            total_sections: Total number of sections (for context)

        Returns:
            Generated explanation in simple terms
        """
        try:
            # Build the current user message
            prompt = f"""
**Section {section_number} of {total_sections}**
**Section Heading:** {heading}

**Section Content:**
{content}

**Your Explanation:**"""

            # If this is the first section, initialize with system prompt
            if section_number == 1:
                system_prompt = f"""You are an expert teacher who explains complex topics in simple, easy-to-understand terms.

I will give you sections of text with their headings on the topic of "{topic}", and I want you to explain what each section is about in simple language, by breaking down any complex concepts or terminology. You should also explain why this information might be important or useful, use examples or analogies when helpful, and keep the explanation engaging and educational.

Make your explanation clear enough for someone without prior knowledge of the topic to understand. As you explain each section, consider how it relates to the previous sections you've already explained to provide coherent, contextual explanations throughout the document.

Do not mention anything far irrelevant from the topic of "{topic}". Do not repeat information unnecessarily, but build on previous explanations to create a comprehensive understanding of the topic. Avoid using the term 'section' and use the actual section heading instead. No need to mention the section number in your explanation.
"""

                # Initialize chat history with system message
                self.chat_history = [
                    {
                        "role": "system",
                        "content": system_prompt
                    }
                ]

            # Check if content is too small (less than 200 characters)
            if len(content) < 200:
                print(f"📋 Skipping API call for short content in '{heading}' ({len(content)} chars < 200)")
                # Add the user prompt to chat history for context in subsequent queries
                self.chat_history.append({
                    "role": "user",
                    "content": prompt
                })
                # Return a simple message indicating the content was too short
                return f"This section contains minimal content ({len(content)} characters). The information has been noted for context in subsequent explanations."

            # Add the current user message to chat history
            self.chat_history.append({
                "role": "user",
                "content": prompt
            })

            # Call Mistral AI for explanation with full chat history
            response = self.client.chat.complete(
                model="mistral-small-2503",
                messages=self.chat_history,
                temperature=0.7,  # Some creativity but still focused
                # max_tokens=1000  # Reasonable explanation length
            )

            # Extract the explanation from response
            if hasattr(response, 'choices') and response.choices:
                explanation = response.choices[0].message.content

                # Add the assistant's response to chat history
                self.chat_history.append({
                    "role": "assistant",
                    "content": explanation
                })

                return explanation.strip()
            else:
                return f"Could not generate explanation for section: {heading}"

        except Exception as e:
            print(f"Error generating explanation for '{heading}': {str(e)}")
            return f"Error generating explanation for this section: {str(e)}"

    def explain_all_sections(self, text: str) -> List[Dict[str, str]]:
        """
        Split text by headings and generate explanations for all sections with chat history context.

        Args:
            text: Input text with markdown headings

        Returns:
            List of dictionaries with 'heading', 'content', 'explanation', and 'level' keys
        """
        sections = self.split_text_by_headings(text)

        if not sections:
            return []

        print(f"🔍 Found {len(sections)} sections to explain...")

        # Extract the main topic from the text
        print("🎯 Extracting main topic...")
        topic = self.get_topic(text)
        if topic:
            print(f"📋 Main topic identified: {topic}")
        else:
            topic = "General Content"  # Fallback topic
            print("⚠️ Could not identify main topic, using fallback")

        # Reset chat history for new document
        self.chat_history = []

        explained_sections = []

        for i, section in enumerate(sections, 1):
            print(f"📝 Generating explanation for section {i}/{len(sections)}: {section['heading'][:50]}...")

            # Pass topic, section content, and context information
            explanation = self.generate_explanation(
                topic,
                section['heading'],
                section['content'],
                section_number=i,
                total_sections=len(sections)
            )

            explained_sections.append({
                'heading': section['heading'],
                'content': section['content'],
                'explanation': explanation,
                'level': section['level']
            })

        print(f"✅ Generated explanations for all {len(explained_sections)} sections")
        return explained_sections

    def reset_chat_history(self):
        """Reset the chat history for a new document or conversation."""
        self.chat_history = []

    def get_chat_history(self) -> List[Dict[str, str]]:
        """Get the current chat history for debugging purposes."""
        return self.chat_history.copy()

    def get_chat_history_summary(self) -> str:
        """Get a summary of the current chat history."""
        if not self.chat_history:
            return "No chat history available."

        summary = f"Chat history contains {len(self.chat_history)} messages:\n"
        for i, message in enumerate(self.chat_history, 1):
            role = message['role']
            content_preview = message['content'][:100] + "..." if len(message['content']) > 100 else message['content']
            summary += f"{i}. {role.upper()}: {content_preview}\n"

        return summary

    def format_explanations_for_display(self, explained_sections: List[Dict[str, str]]) -> str:
        """
        Concatenate only the explanations from all sections for display, filtering out placeholder explanations for minimal content.

        Args:
            explained_sections: List of sections with explanations

        Returns:
            Concatenated explanations as a single string
        """
        if not explained_sections:
            return "No sections found to explain."
        skip_phrase = "This section contains minimal content"
        return "\n\n".join(
            section['explanation']
            for section in explained_sections
            if section.get('explanation') and not section['explanation'].strip().startswith(skip_phrase)
        )
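
A minimal end-to-end sketch of the class above. It requires MISTRAL_API_KEY to be set, and the markdown string is an arbitrary example, kept long enough that neither section is skipped by the 200-character minimum.

from utils.text_explainer import TextExplainer

markdown_text = """# Photosynthesis
Plants convert light energy into chemical energy and store it as glucose. The process
takes place in chloroplasts, depends on chlorophyll absorbing visible light, and
ultimately supplies the oxygen that most other organisms rely on for respiration.

## Light-dependent reactions
These reactions occur in the thylakoid membranes, where absorbed light is used to split
water molecules. The electrons released travel down an electron transport chain,
producing the ATP and NADPH that power the sugar-building reactions of the next stage.
"""

explainer = TextExplainer()    # raises ValueError when MISTRAL_API_KEY is missing
sections = explainer.explain_all_sections(markdown_text)
print(explainer.format_explanations_for_display(sections))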