Spaces:

Agents-MCP-Hackathon
/

magical-tales

Running

App Files Files Community

agharsallah committed on 10 days ago

Commit

ea5af73

1 Parent(s): b492512

Adding LlamaParse for PDF extraction

Browse files

Files changed (4) hide show

controllers/app_controller.py +9 -2
requirements.txt +2 -1
services/pdf_text_extractor.py +69 -6
util/mistral_api_client.py +1 -1

controllers/app_controller.py CHANGED Viewed

@@ -7,6 +7,7 @@ from services.streaming_chapter_processor import process_story_into_chapters_str
 from services.audio_generator import generate_audio, generate_melody_from_story
 import gradio as gr
 from config import constants
 logger = logging.getLogger(__name__)
@@ -46,8 +47,14 @@ def process_story_generation(
         # Process PDF if provided
         pdf_content = ""
         if pdf_file:
-            logger.info(f"Extracting text from PDF: {pdf_file.name}")
             pdf_content = extract_text_from_pdf(pdf_file)
             if pdf_content.startswith("Error:"):
                 logger.error(f"PDF extraction error: {pdf_content}")
@@ -60,7 +67,7 @@ def process_story_generation(
             kid_interests=kid_interests,
             subject=subject,
             reading_time=reading_time,
-            pdf_content=pdf_content,
             model_name=model_selector,
         )

 from services.audio_generator import generate_audio, generate_melody_from_story
 import gradio as gr
 from config import constants
+from util.mistral_api_client import MistralAPI
 logger = logging.getLogger(__name__)
         # Process PDF if provided
         pdf_content = ""
         if pdf_file:
+            logger.info("Extracting text from PDF")
             pdf_content = extract_text_from_pdf(pdf_file)
+            # summarize the PDF content for better prompting using mistral
+            mistral_api = MistralAPI()
+            summarized_pdf = mistral_api.send_request(
+                f"Summarize the following Text content into a single-sentence children's story without any explanations, tags, or formatting—just plain text in one line.: {pdf_content}"
+            )["choices"][0]["message"]["content"]
+            logger.info(f"summarized_pdf: {summarized_pdf}")
             if pdf_content.startswith("Error:"):
                 logger.error(f"PDF extraction error: {pdf_content}")
             kid_interests=kid_interests,
             subject=subject,
             reading_time=reading_time,
+            pdf_content=summarized_pdf,
             model_name=model_selector,
         )

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ types-requests
 markdown
 mcp
 asyncio
-openai

 markdown
 mcp
 asyncio
+openai
+llama_cloud_services

services/pdf_text_extractor.py CHANGED Viewed

@@ -1,30 +1,67 @@
 import PyPDF2
 from typing import Optional, Any
 import logging
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-def extract_text_from_pdf(pdf_file: Optional[Any]) -> str:
     """
     Extract text content from a PDF file.
     Args:
         pdf_file: A file-like object containing the PDF data.
     Returns:
         str: Extracted text content or an error message.
     """
     if pdf_file is None:
         logger.warning("No PDF file provided")
         return ""
     try:
         # Create a PDF reader object
         pdf_reader = PyPDF2.PdfReader(pdf_file.name)
         if len(pdf_reader.pages) == 0:
             logger.warning("PDF has no pages")
             return "The PDF file appears to be empty."
@@ -47,3 +84,29 @@ def extract_text_from_pdf(pdf_file: Optional[Any]) -> str:
         error_msg = f"Error extracting text from PDF: {str(e)}"
         logger.error(error_msg, exc_info=True)
         return error_msg

 import PyPDF2
 from typing import Optional, Any
 import logging
+from llama_cloud_services import LlamaParse
+import os
 # Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
 logger = logging.getLogger(__name__)
+def extract_text_from_pdf(pdf_file: str | None) -> str:
+    """
+    uses llama_parse to extract text content from a PDF file.
+    if that fails, it falls back to PyPDF2.
+    Args:
+        pdf_file: Path to the PDF file.
+    Returns:
+        str: Extracted text content or an error message.
+    """
+    if not pdf_file:
+        logger.warning("No PDF file provided")
+        return ""
+    try:
+        # Attempt to extract text using LlamaParse
+        text = extract_text_from_pdf_llama(pdf_file)
+        if text:
+            return text
+        else:
+            logger.info("LlamaParse did not return any text, falling back to PyPDF2")
+    except Exception as e:
+        logger.error(f"Error using LlamaParse: {str(e)}")
+    try:
+        # Fallback to PyPDF2 for text extraction
+        with open(pdf_file, "rb") as pdf_file_obj:
+            return extract_text_from_pdf_pypdf(pdf_file_obj)
+    except Exception as e:
+        error_msg = f"Error extracting text from PDF using PyPDF2: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+        return ""
+def extract_text_from_pdf_pypdf(pdf_file: Optional[Any]) -> str:
     """
     Extract text content from a PDF file.
     Args:
         pdf_file: A file-like object containing the PDF data.
     Returns:
         str: Extracted text content or an error message.
     """
     if pdf_file is None:
         logger.warning("No PDF file provided")
         return ""
     try:
         # Create a PDF reader object
         pdf_reader = PyPDF2.PdfReader(pdf_file.name)
         if len(pdf_reader.pages) == 0:
             logger.warning("PDF has no pages")
             return "The PDF file appears to be empty."
         error_msg = f"Error extracting text from PDF: {str(e)}"
         logger.error(error_msg, exc_info=True)
         return error_msg
+# --- Tool: PDF to Text ---
+def extract_text_from_pdf_llama(pdf_path: str) -> str | None:
+    """
+    Extracts text from a PDF file using LlamaParse.
+    Args:
+        pdf_path (str): Path to the PDF file.
+    Returns:
+        str: The extracted text from the PDF.
+        None: If no text could be extracted.
+    """
+    parser = LlamaParse(
+        api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
+        num_workers=1,
+        verbose=True,
+        language="en",
+    )
+    result = parser.parse(pdf_path)
+    # Get all text as a single string
+    text_documents = result.get_text_documents(split_by_page=False)
+    if text_documents:
+        return "\n".join([doc.text for doc in text_documents])
+    return None

util/mistral_api_client.py CHANGED Viewed

@@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
 # Constants
 MISTRAL_API_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
-MISTRAL_MODEL = "ministral-8b-latest"
 API_KEY_ENV_VAR = "MISTRAL_API_KEY"

 # Constants
 MISTRAL_API_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
+MISTRAL_MODEL = "mistral-medium-latest"
 API_KEY_ENV_VAR = "MISTRAL_API_KEY"