import logging
import tempfile
import markdown
import os
import shutil
import re
import urllib.parse
import base64
import asyncio
import pathlib
from components.state import SessionState, get_unit_status_emoji
# pyppeteer is an optional dependency: PDF export is switched off when it
# cannot be imported, while Markdown/HTML export keeps working.
PYPPETEER_AVAILABLE = False
try:
    import pyppeteer
    from pyppeteer.launcher import DEFAULT_ARGS
except ImportError:
    # The package (or its launcher module) is simply not installed.
    logging.warning("pyppeteer not installed. PDF export will be disabled. "
                    "Please run 'pip install pyppeteer'.")
except Exception as e:
    # Anything else raised while importing; keep the traceback for diagnosis.
    logging.error(f"Error importing pyppeteer: {e}. PDF export will be disabled.", exc_info=True)
else:
    # Both imports succeeded, so the PDF code path may be used.
    PYPPETEER_AVAILABLE = True
async def _delete_file_after_delay(file_path: str, delay: int = 60):
    """Wait ``delay`` seconds, then remove ``file_path`` if it exists.

    Never raises for filesystem problems: a missing file is logged as a
    warning and any other failure is logged as an error.

    Args:
        file_path: Path of the file to remove.
        delay: Number of seconds to sleep before attempting the removal.
    """
    await asyncio.sleep(delay)
    try:
        if not os.path.exists(file_path):
            logging.warning(f"File not found for deletion: {file_path}")
            return
        os.remove(file_path)
        logging.info(f"Deleted temporary export file: {file_path}")
    except Exception as e:
        # Best-effort cleanup: report the problem but do not propagate it.
        logging.error(f"Error deleting file {file_path}: {e}", exc_info=True)
def _convert_markdown_to_html(md_content: str) -> str:
    """Render a Markdown string as an HTML fragment.

    The conversion enables the ``fenced_code``, ``tables`` and ``sane_lists``
    extensions. No math-specific extension is passed, so LaTeX in the input
    is handled like any other Markdown text and left for MathJax to typeset
    in the browser.

    Args:
        md_content: Markdown source text.

    Returns:
        The HTML produced by ``markdown.markdown``.
    """
    enabled_extensions = ['fenced_code', 'tables', 'sane_lists']
    return markdown.markdown(md_content, extensions=enabled_extensions)
def _image_to_base64_uri(image_path: str) -> str:
    """Return the image at ``image_path`` as a Base64 ``data:`` URI.

    Args:
        image_path: Filesystem path of the image to embed.

    Returns:
        * the ``data:<mime>;base64,...`` URI for jpg/jpeg, png, gif and svg files;
        * ``image_path`` unchanged when the extension is not one of those;
        * an empty string when the file does not exist or cannot be read.
    """
    if not os.path.exists(image_path):
        logging.warning(f"Image not found at path: {image_path}. Skipping embedding.")
        return ""

    # Extension (lower-case, without the dot) -> MIME type of the data URI.
    mime_by_extension = {
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'png': 'image/png',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
    }

    try:
        _, dotted_ext = os.path.splitext(image_path)
        ext = dotted_ext[1:].lower()
        mime_type = mime_by_extension.get(ext)
        if mime_type is None:
            logging.warning(f"Unsupported image type '{ext}' for base64 embedding.")
            return image_path

        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        encoded_string = base64.b64encode(raw_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{encoded_string}"
    except Exception as e:
        logging.error(f"Could not convert image {image_path} to base64: {e}")
        return ""
def export_session_to_markdown(session: SessionState) -> str:
    """Export the entire session content to a single Markdown string.

    The document contains a header with the LLM provider, a progress summary
    taken from ``session.get_progress_summary()``, and one section per unit
    (status, summary, optional explanation with visual aids and code
    examples, optional quiz), each terminated by a horizontal rule.

    Args:
        session: The session to export.

    Returns:
        The Markdown document as one string.
    """
    # Fragments are collected in a list and joined once at the end instead of
    # growing a string with ``+=`` inside nested loops (same approach as the
    # HTML exporter in this module).
    parts = [
        "# LearnFlow AI Session Export\n\n",
        f"**LLM Provider:** {session.provider}\n\n",
    ]

    summary = session.get_progress_summary()
    parts += [
        "## Progress Summary\n",
        f"- Total Units: {summary.get('total_units', 0)}\n",
        f"- Completed: {summary.get('completed_units', 0)} ✅\n",
        f"- In Progress: {summary.get('in_progress_units', 0)} 🕑\n",
        f"- Not Started: {summary.get('not_started_units', 0)} 📘\n",
        f"- Completion Rate: {summary.get('completion_rate', 0):.1f}%\n\n",
        "## Learning Units\n\n",
    ]

    for i, unit in enumerate(session.units, 1):
        emoji = get_unit_status_emoji(unit)
        parts.append(f"### {emoji} Unit {i}: {unit.title}\n\n")
        parts.append(f"**Status:** {unit.status.replace('_', ' ').title()}\n\n")
        parts.append(f"**Summary:** {unit.summary}\n\n")

        explanation = unit.explanation_data
        if explanation:
            parts.append("#### Explanation\n")
            parts.append(explanation.markdown + "\n\n")
            for visual_aid in explanation.visual_aids:
                parts.append(f"![{visual_aid.caption}]"
                             f"({visual_aid.path})\n\n")
            for code_example in explanation.code_examples:
                parts.append(f"##### 💻 {code_example.description}\n")
                parts.append(f"```{code_example.language}\n"
                             f"{code_example.code}\n```\n\n")

        quiz = unit.quiz_data
        if quiz:
            parts.append("#### Quiz\n")
            if quiz.mcqs:
                parts.append("##### Multiple Choice Questions\n")
                for q_idx, mcq in enumerate(quiz.mcqs, 1):
                    parts.append(f"**Q{q_idx}:** {mcq.question}\n")
                    for key, value in mcq.options.items():
                        parts.append(f"- {key}. {value}\n")
                    # Falls back to an empty answer text when the key of the
                    # correct answer is not among the options.
                    parts.append(f"**Correct Answer:** {mcq.correct_answer}. "
                                 f"{mcq.options.get(mcq.correct_answer, '')}\n")
                    parts.append(f"**Explanation:** {mcq.explanation}\n\n")
            if quiz.open_ended:
                parts.append("##### Open-Ended Questions\n")
                for q_idx, open_q in enumerate(quiz.open_ended, 1):
                    parts.append(f"**Q{q_idx}:** {open_q.question}\n")
                    parts.append(f"**Model Answer:** {open_q.model_answer}\n\n")

        parts.append("---\n\n")

    return "".join(parts)
def export_session_to_html(session: SessionState, embed_images_for_pdf: bool = False) -> str:
    """
    Export the entire session content to a single HTML document string.

    The body mirrors the Markdown export: header, progress summary, then one
    section per unit with its explanation, visual aids, code examples and
    quiz. Markdown-formatted fields (explanation text, quiz questions,
    options, explanations, model answers) are rendered through
    ``_convert_markdown_to_html``; every other field is inserted as
    HTML-escaped text.

    Args:
        session: The SessionState object.
        embed_images_for_pdf: If True, embeds images as Base64 data URIs, which is
            necessary for self-contained PDF generation. Otherwise the image
            path is used as the ``src``.

    Returns:
        A complete HTML document, including the MathJax loader.
    """
    import html  # local import: only this function needs HTML escaping

    # NOTE(review): the element names, CSS and MathJax setup below were
    # reconstructed from the visible text content; confirm them against the
    # intended design.

    def esc(value) -> str:
        """Escape a value for use as HTML text or inside a quoted attribute."""
        return html.escape(str(value))

    html_parts = []
    html_parts.append("<h1>LearnFlow AI Session Export</h1>\n\n")
    html_parts.append(f"<p><strong>LLM Provider:</strong> {esc(session.provider)}</p>\n\n")

    summary = session.get_progress_summary()
    html_parts.append("<h2>Progress Summary</h2>\n")
    html_parts.append("<ul>\n")
    html_parts.append(f"<li>Total Units: {summary.get('total_units', 0)}</li>\n")
    html_parts.append(f"<li>Completed: {summary.get('completed_units', 0)} ✅</li>\n")
    html_parts.append(f"<li>In Progress: {summary.get('in_progress_units', 0)} 🕑</li>\n")
    html_parts.append(f"<li>Not Started: {summary.get('not_started_units', 0)} 📘</li>\n")
    html_parts.append(f"<li>Completion Rate: {summary.get('completion_rate', 0):.1f}%</li>\n")
    html_parts.append("</ul>\n\n")

    html_parts.append("<h2>Learning Units</h2>\n\n")
    for i, unit in enumerate(session.units, 1):
        emoji = get_unit_status_emoji(unit)
        status_label = unit.status.replace('_', ' ').title()
        html_parts.append(f"<h3>{emoji} Unit {i}: {esc(unit.title)}</h3>\n\n")
        html_parts.append(f"<p><strong>Status:</strong> {esc(status_label)}</p>\n\n")
        html_parts.append(f"<p><strong>Summary:</strong> {esc(unit.summary)}</p>\n\n")

        if unit.explanation_data:
            html_parts.append("<h4>Explanation</h4>\n")
            html_parts.append(_convert_markdown_to_html(unit.explanation_data.markdown) + "\n\n")

            for visual_aid in unit.explanation_data.visual_aids:
                # If generating for PDF, embed the image. Otherwise, use the path.
                img_src = _image_to_base64_uri(visual_aid.path) if embed_images_for_pdf else visual_aid.path
                if img_src:
                    html_parts.append(
                        f'<p><img src="{esc(img_src)}" alt="{esc(visual_aid.caption)}" '
                        f'style="max-width: 100%; height: auto;"></p>\n\n'
                    )

            for code_example in unit.explanation_data.code_examples:
                html_parts.append(f"<h5>💻 {esc(code_example.description)}</h5>\n")
                # Source code must be escaped, otherwise "<", ">" and "&" in
                # the sample would be parsed as markup.
                html_parts.append(
                    f'<pre><code class="language-{esc(code_example.language)}">'
                    f"{esc(code_example.code)}</code></pre>\n\n"
                )

        if unit.quiz_data:
            html_parts.append("<h4>Quiz</h4>\n")

            if unit.quiz_data.mcqs:
                html_parts.append("<h5>Multiple Choice Questions</h5>\n")
                for q_idx, mcq in enumerate(unit.quiz_data.mcqs, 1):
                    # <div> wrappers are used because the Markdown converter
                    # emits block-level HTML, which may not be nested in <p>.
                    html_parts.append('<div class="quiz-question">\n')
                    html_parts.append(
                        f"<div><strong>Q{q_idx}:</strong> {_convert_markdown_to_html(mcq.question)}</div>\n"
                    )
                    html_parts.append("<ul>\n")
                    for key, value in mcq.options.items():
                        html_parts.append(f"<li>{esc(key)}. {_convert_markdown_to_html(value)}</li>\n")
                    html_parts.append("</ul>\n")
                    correct_text = _convert_markdown_to_html(mcq.options.get(mcq.correct_answer, ''))
                    html_parts.append(
                        f"<div><strong>Correct Answer:</strong> {esc(mcq.correct_answer)}. {correct_text}</div>\n"
                    )
                    html_parts.append(
                        f"<div><strong>Explanation:</strong> {_convert_markdown_to_html(mcq.explanation)}</div>\n"
                    )
                    html_parts.append("</div>\n\n")

            if unit.quiz_data.open_ended:
                html_parts.append("<h5>Open-Ended Questions</h5>\n")
                for q_idx, open_q in enumerate(unit.quiz_data.open_ended, 1):
                    html_parts.append('<div class="quiz-question">\n')
                    html_parts.append(
                        f"<div><strong>Q{q_idx}:</strong> {_convert_markdown_to_html(open_q.question)}</div>\n"
                    )
                    html_parts.append(
                        f"<div><strong>Model Answer:</strong> {_convert_markdown_to_html(open_q.model_answer)}</div>\n"
                    )
                    html_parts.append("</div>\n\n")

        html_parts.append("<hr>\n\n")

    html_body = "".join(html_parts)

    # The page shell is concatenated rather than passed through str.format so
    # the braces in the CSS and JavaScript need no escaping. Once MathJax has
    # finished its initial typesetting, the script adds the class
    # "MathJax_Processed" to <body>; _export_session_to_pdf_async waits for
    # the selector "body.MathJax_Processed" before printing.
    document_head = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>LearnFlow AI Session Export</title>
<style>
body { font-family: sans-serif; line-height: 1.6; margin: 2em; }
pre { background: #f5f5f5; padding: 1em; overflow-x: auto; }
table { border-collapse: collapse; }
th, td { border: 1px solid #cccccc; padding: 0.4em 0.8em; }
img { max-width: 100%; height: auto; }
.quiz-question { margin-bottom: 1.5em; }
</style>
<script>
window.MathJax = {
  tex: {
    inlineMath: [['$', '$'], ['\\(', '\\)']],
    displayMath: [['$$', '$$'], ['\\[', '\\]']],
    processEscapes: true
  },
  startup: {
    pageReady: function () {
      return MathJax.startup.defaultPageReady().then(function () {
        document.body.classList.add('MathJax_Processed');
      });
    }
  }
};
</script>
<script async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
</head>
<body>
"""
    document_tail = "</body>\n</html>\n"
    return document_head + html_body + document_tail
# --- PDF ---
async def find_browser_executable_path() -> str | None:
"""
Finds a usable Chrome or Chromium executable path on the system.
This is more robust than pyppeteer's default download.
"""
# 1. For Hugging Face Spaces & Debian/Ubuntu systems
for path in ["/usr/bin/chromium", "/usr/bin/chromium-browser"]:
if os.path.exists(path):
logging.info(f"Found system-installed Chromium at: {path}")
return path
# 2. For Windows systems
if os.name == 'nt':
for path in [
os.path.join(os.environ["ProgramFiles"], "Google", "Chrome", "Application", "chrome.exe"),
os.path.join(os.environ["ProgramFiles(x86)"], "Google", "Chrome", "Application", "chrome.exe"),
os.path.join(os.environ["LOCALAPPDATA"], "Google", "Chrome", "Application", "chrome.exe"),
]:
if os.path.exists(path):
logging.info(f"Found system-installed Chrome at: {path}")
return path
# 3. For macOS systems
mac_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
if os.path.exists(mac_path):
logging.info(f"Found system-installed Chrome at: {mac_path}")
return mac_path
# 4. Fallback to pyppeteer's own downloaded version if it exists
try:
from pyppeteer import launcher
pyppeteer_path = launcher.executablePath()
if os.path.exists(pyppeteer_path):
logging.info(f"Found pyppeteer-managed Chromium at: {pyppeteer_path}")
return pyppeteer_path
except Exception:
pass
logging.warning("Could not find a pre-installed Chrome/Chromium browser.")
return None
async def _export_session_to_pdf_async(session: SessionState, filename: str) -> str:
    """
    Export the session to a PDF file using a headless browser (pyppeteer).

    The session is rendered to HTML with images embedded, written to a
    temporary file, opened in the browser via a ``file://`` URL, and printed
    to ``filename`` once the page signals that MathJax has finished.

    Args:
        session: The session to export.
        filename: Destination path of the PDF file.

    Returns:
        ``filename`` on success, otherwise a human-readable error message.
        Failures inside the browser workflow are reported through the return
        value rather than raised.
    """
    if not PYPPETEER_AVAILABLE:
        return "Error: PDF export is disabled because pyppeteer is not installed."

    logging.info("Starting PDF export process...")
    html_content = export_session_to_html(session, embed_images_for_pdf=True)

    browser = None
    temp_html_path = None
    try:
        # 1. Write the self-contained HTML to a temporary file.
        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.html', encoding='utf-8') as f:
            # Remember the path before writing so the file is still removed
            # in ``finally`` if the write itself fails.
            temp_html_path = f.name
            f.write(html_content)
        file_url = pathlib.Path(temp_html_path).as_uri()
        logging.info(f"Generated temporary HTML for rendering: {file_url}")

        # 2. Assemble the launch options for the headless browser.
        executable_path = await find_browser_executable_path()
        args = DEFAULT_ARGS.copy()
        if '--enable-automation' in args:
            args.remove('--enable-automation')
        required_args = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars']
        for arg in required_args:
            if arg not in args:
                args.append(arg)
        launch_options = {
            'args': args,
            'handleSIGINT': False,
            'handleSIGTERM': False,
            'handleSIGHUP': False
        }
        if executable_path:
            launch_options['executablePath'] = executable_path

        # 3. Render the page and print it to PDF.
        logging.info("Launching headless browser...")
        browser = await pyppeteer.launch(launch_options)
        page = await browser.newPage()
        await page.setViewport({'width': 1200, 'height': 800})
        logging.info("Navigating to temporary HTML file...")
        await page.goto(file_url, waitUntil='networkidle0')
        logging.info("Waiting for MathJax to complete rendering...")
        # The exported page is expected to flag <body> with this class once
        # typesetting is done; give up after 60 seconds.
        await page.waitForSelector('body.MathJax_Processed', timeout=60000)
        logging.info("Generating PDF file...")
        await page.pdf({
            'path': filename,
            'format': 'A4',
            'printBackground': True,
            'margin': {'top': '20mm', 'bottom': '20mm', 'left': '20mm', 'right': '20mm'}
        })
        logging.info(f"Session successfully exported to PDF: {filename}")
        return filename
    except Exception as e:
        logging.error(f"An error occurred during PDF export with Pyppeteer: {e}", exc_info=True)
        error_message = (
            f"Error exporting to PDF: {e}. If on a platform like Hugging Face, ensure "
            "you have 'chromium' in your packages.txt file. On your local machine, ensure "
            "Google Chrome is installed."
        )
        return error_message
    finally:
        # 4. Clean up everything. Each step is guarded separately so a failure
        # while closing the browser neither skips the temp-file removal nor
        # replaces the value being returned with an exception.
        if browser:
            logging.info("Closing headless browser.")
            try:
                await browser.close()
            except Exception as close_error:
                logging.warning(f"Failed to close headless browser cleanly: {close_error}")
        if temp_html_path and os.path.exists(temp_html_path):
            try:
                os.unlink(temp_html_path)
                logging.info("Cleaned up temporary HTML file.")
            except OSError as cleanup_error:
                logging.warning(f"Failed to remove temporary HTML file {temp_html_path}: {cleanup_error}")
def export_session_to_pdf(session: SessionState, filename: str = "LearnFlow_Session.pdf") -> str:
    """
    Export the session to a PDF file from synchronous code.

    This is a blocking wrapper that runs ``_export_session_to_pdf_async`` to
    completion with ``asyncio.run``.

    Args:
        session: The session to export.
        filename: Destination path of the PDF file.

    Returns:
        The value returned by the async exporter (the output path on success
        or its error message), or an error message produced here. This
        function does not raise for export failures.
    """
    # asyncio.run() cannot be used in a thread that already runs an event
    # loop (e.g. Jupyter). Check for that explicitly instead of matching on
    # the exception text, and before the coroutine object is created so it is
    # not left un-awaited.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False
    else:
        loop_is_running = True

    if loop_is_running:
        logging.error("Asyncio loop conflict. This can happen in environments like Jupyter. "
                      "Try running 'await _export_session_to_pdf_async(...)' directly.")
        return "Error: Asyncio loop conflict. Cannot generate PDF in this environment."

    try:
        # This runs the async function and waits for it to complete.
        return asyncio.run(_export_session_to_pdf_async(session, filename))
    except RuntimeError as e:
        logging.error(f"A runtime error occurred: {e}", exc_info=True)
        return f"Error: A runtime error occurred during PDF export: {e}"
    except Exception as e:
        logging.error(f"An unexpected error occurred in the sync wrapper for PDF export: {e}", exc_info=True)
        return f"An unexpected error occurred: {e}"