agharsallah committed on
Commit
ea5af73
·
1 Parent(s): b492512

Adding LlamaParse for PDF extraction

Browse files
controllers/app_controller.py CHANGED
@@ -7,6 +7,7 @@ from services.streaming_chapter_processor import process_story_into_chapters_str
7
  from services.audio_generator import generate_audio, generate_melody_from_story
8
  import gradio as gr
9
  from config import constants
 
10
 
11
  logger = logging.getLogger(__name__)
12
 
@@ -46,8 +47,14 @@ def process_story_generation(
46
  # Process PDF if provided
47
  pdf_content = ""
48
  if pdf_file:
49
- logger.info(f"Extracting text from PDF: {pdf_file.name}")
50
  pdf_content = extract_text_from_pdf(pdf_file)
 
 
 
 
 
 
51
  if pdf_content.startswith("Error:"):
52
  logger.error(f"PDF extraction error: {pdf_content}")
53
 
@@ -60,7 +67,7 @@ def process_story_generation(
60
  kid_interests=kid_interests,
61
  subject=subject,
62
  reading_time=reading_time,
63
- pdf_content=pdf_content,
64
  model_name=model_selector,
65
  )
66
 
 
7
  from services.audio_generator import generate_audio, generate_melody_from_story
8
  import gradio as gr
9
  from config import constants
10
+ from util.mistral_api_client import MistralAPI
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
47
  # Process PDF if provided
48
  pdf_content = ""
49
  if pdf_file:
50
+ logger.info("Extracting text from PDF")
51
  pdf_content = extract_text_from_pdf(pdf_file)
52
+ # summarize the PDF content for better prompting using mistral
53
+ mistral_api = MistralAPI()
54
+ summarized_pdf = mistral_api.send_request(
55
+ f"Summarize the following Text content into a single-sentence children's story without any explanations, tags, or formatting—just plain text in one line.: {pdf_content}"
56
+ )["choices"][0]["message"]["content"]
57
+ logger.info(f"summarized_pdf: {summarized_pdf}")
58
  if pdf_content.startswith("Error:"):
59
  logger.error(f"PDF extraction error: {pdf_content}")
60
 
 
67
  kid_interests=kid_interests,
68
  subject=subject,
69
  reading_time=reading_time,
70
+ pdf_content=summarized_pdf,
71
  model_name=model_selector,
72
  )
73
 
requirements.txt CHANGED
@@ -7,4 +7,5 @@ types-requests
7
  markdown
8
  mcp
9
  asyncio
10
- openai
 
 
7
  markdown
8
  mcp
9
  asyncio
10
+ openai
11
+ llama_cloud_services
services/pdf_text_extractor.py CHANGED
@@ -1,30 +1,67 @@
1
  import PyPDF2
2
  from typing import Optional, Any
3
  import logging
 
 
4
 
5
  # Configure logging
6
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
7
  logger = logging.getLogger(__name__)
8
 
9
 
10
- def extract_text_from_pdf(pdf_file: Optional[Any]) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
  Extract text content from a PDF file.
13
-
14
  Args:
15
  pdf_file: A file-like object containing the PDF data.
16
-
17
  Returns:
18
  str: Extracted text content or an error message.
19
  """
20
  if pdf_file is None:
21
  logger.warning("No PDF file provided")
22
  return ""
23
-
24
  try:
25
  # Create a PDF reader object
26
  pdf_reader = PyPDF2.PdfReader(pdf_file.name)
27
-
28
  if len(pdf_reader.pages) == 0:
29
  logger.warning("PDF has no pages")
30
  return "The PDF file appears to be empty."
@@ -47,3 +84,29 @@ def extract_text_from_pdf(pdf_file: Optional[Any]) -> str:
47
  error_msg = f"Error extracting text from PDF: {str(e)}"
48
  logger.error(error_msg, exc_info=True)
49
  return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import PyPDF2
2
  from typing import Optional, Any
3
  import logging
4
+ from llama_cloud_services import LlamaParse
5
+ import os
6
 
7
  # Configure logging
8
+ logging.basicConfig(
9
+ level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
10
+ )
11
  logger = logging.getLogger(__name__)
12
 
13
 
14
def extract_text_from_pdf(pdf_file: str | None) -> str:
    """
    Extract text from a PDF, preferring LlamaParse and falling back to PyPDF2.

    Args:
        pdf_file: Path to the PDF file.

    Returns:
        str: The extracted text. An empty string is returned when no path is
        given or when the PyPDF2 fallback itself raises. The PyPDF2 helper may
        also hand back its own status/error message as the text.
    """
    if not pdf_file:
        logger.warning("No PDF file provided")
        return ""

    # First choice: LlamaParse. Any failure here is deliberately non-fatal
    # (broad catch) because the PyPDF2 fallback below still gets a chance.
    try:
        text = extract_text_from_pdf_llama(pdf_file)
    except Exception as e:
        # Keep the traceback so remote-parser failures can be diagnosed.
        logger.error("Error using LlamaParse: %s", e, exc_info=True)
    else:
        if text:
            return text
        logger.info("LlamaParse did not return any text, falling back to PyPDF2")

    # Fallback: local extraction with PyPDF2. The helper expects a file-like
    # object, so open the path here and let the context manager close it.
    try:
        with open(pdf_file, "rb") as pdf_file_obj:
            return extract_text_from_pdf_pypdf(pdf_file_obj)
    except Exception as e:
        logger.error(
            "Error extracting text from PDF using PyPDF2: %s", e, exc_info=True
        )
        return ""
45
+
46
+
47
+ def extract_text_from_pdf_pypdf(pdf_file: Optional[Any]) -> str:
48
  """
49
  Extract text content from a PDF file.
50
+
51
  Args:
52
  pdf_file: A file-like object containing the PDF data.
53
+
54
  Returns:
55
  str: Extracted text content or an error message.
56
  """
57
  if pdf_file is None:
58
  logger.warning("No PDF file provided")
59
  return ""
60
+
61
  try:
62
  # Create a PDF reader object
63
  pdf_reader = PyPDF2.PdfReader(pdf_file.name)
64
+
65
  if len(pdf_reader.pages) == 0:
66
  logger.warning("PDF has no pages")
67
  return "The PDF file appears to be empty."
 
84
  error_msg = f"Error extracting text from PDF: {str(e)}"
85
  logger.error(error_msg, exc_info=True)
86
  return error_msg
87
+
88
+
89
+ # --- Tool: PDF to Text ---
90
def extract_text_from_pdf_llama(pdf_path: str) -> str | None:
    """
    Extract text from a PDF file using LlamaParse.

    The API key is read from the ``LLAMA_CLOUD_API_KEY`` environment variable.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str | None: The text of all returned documents joined with newlines,
        or None when the parser yields no documents or only whitespace.
    """
    parser = LlamaParse(
        api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
        num_workers=1,
        verbose=True,
        language="en",
    )
    result = parser.parse(pdf_path)

    # Ask for the whole document rather than one entry per page.
    text_documents = result.get_text_documents(split_by_page=False)
    if not text_documents:
        return None

    combined = "\n".join(doc.text for doc in text_documents)
    # A whitespace-only result is treated as "nothing extracted" so that a
    # caller checking truthiness can fall back to another extractor.
    return combined if combined.strip() else None
util/mistral_api_client.py CHANGED
@@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
8
 
9
  # Constants
10
  MISTRAL_API_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
11
- MISTRAL_MODEL = "ministral-8b-latest"
12
  API_KEY_ENV_VAR = "MISTRAL_API_KEY"
13
 
14
 
 
8
 
9
  # Constants
10
  MISTRAL_API_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
11
+ MISTRAL_MODEL = "mistral-medium-latest"
12
  API_KEY_ENV_VAR = "MISTRAL_API_KEY"
13
 
14