agharsallah
committed on
Commit
·
ea5af73
1
Parent(s):
b492512
Adding LlamaParse for PDF extraction
Browse files
- controllers/app_controller.py +9 -2
- requirements.txt +2 -1
- services/pdf_text_extractor.py +69 -6
- util/mistral_api_client.py +1 -1
controllers/app_controller.py
CHANGED
@@ -7,6 +7,7 @@ from services.streaming_chapter_processor import process_story_into_chapters_str
|
|
7 |
from services.audio_generator import generate_audio, generate_melody_from_story
|
8 |
import gradio as gr
|
9 |
from config import constants
|
|
|
10 |
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
@@ -46,8 +47,14 @@ def process_story_generation(
|
|
46 |
# Process PDF if provided
|
47 |
pdf_content = ""
|
48 |
if pdf_file:
|
49 |
-
logger.info(
|
50 |
pdf_content = extract_text_from_pdf(pdf_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
if pdf_content.startswith("Error:"):
|
52 |
logger.error(f"PDF extraction error: {pdf_content}")
|
53 |
|
@@ -60,7 +67,7 @@ def process_story_generation(
|
|
60 |
kid_interests=kid_interests,
|
61 |
subject=subject,
|
62 |
reading_time=reading_time,
|
63 |
-
pdf_content=pdf_content,
|
64 |
model_name=model_selector,
|
65 |
)
|
66 |
|
|
|
7 |
from services.audio_generator import generate_audio, generate_melody_from_story
|
8 |
import gradio as gr
|
9 |
from config import constants
|
10 |
+
from util.mistral_api_client import MistralAPI
|
11 |
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
|
|
47 |
# Process PDF if provided
|
48 |
pdf_content = ""
|
49 |
if pdf_file:
|
50 |
+
logger.info("Extracting text from PDF")
|
51 |
pdf_content = extract_text_from_pdf(pdf_file)
|
52 |
+
# summarize the PDF content for better prompting using mistral
|
53 |
+
mistral_api = MistralAPI()
|
54 |
+
summarized_pdf = mistral_api.send_request(
|
55 |
+
f"Summarize the following Text content into a single-sentence children's story without any explanations, tags, or formatting—just plain text in one line.: {pdf_content}"
|
56 |
+
)["choices"][0]["message"]["content"]
|
57 |
+
logger.info(f"summarized_pdf: {summarized_pdf}")
|
58 |
if pdf_content.startswith("Error:"):
|
59 |
logger.error(f"PDF extraction error: {pdf_content}")
|
60 |
|
|
|
67 |
kid_interests=kid_interests,
|
68 |
subject=subject,
|
69 |
reading_time=reading_time,
|
70 |
+
pdf_content=summarized_pdf,
|
71 |
model_name=model_selector,
|
72 |
)
|
73 |
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ types-requests
|
|
7 |
markdown
|
8 |
mcp
|
9 |
asyncio
|
10 |
-
openai
|
|
|
|
7 |
markdown
|
8 |
mcp
|
9 |
asyncio
|
10 |
+
openai
|
11 |
+
llama_cloud_services
|
services/pdf_text_extractor.py
CHANGED
@@ -1,30 +1,67 @@
|
|
1 |
import PyPDF2
|
2 |
from typing import Optional, Any
|
3 |
import logging
|
|
|
|
|
4 |
|
5 |
# Configure logging
|
6 |
-
logging.basicConfig(
|
|
|
|
|
7 |
logger = logging.getLogger(__name__)
|
8 |
|
9 |
|
10 |
-
def extract_text_from_pdf(pdf_file: Optional[Any]) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"""
|
12 |
Extract text content from a PDF file.
|
13 |
-
|
14 |
Args:
|
15 |
pdf_file: A file-like object containing the PDF data.
|
16 |
-
|
17 |
Returns:
|
18 |
str: Extracted text content or an error message.
|
19 |
"""
|
20 |
if pdf_file is None:
|
21 |
logger.warning("No PDF file provided")
|
22 |
return ""
|
23 |
-
|
24 |
try:
|
25 |
# Create a PDF reader object
|
26 |
pdf_reader = PyPDF2.PdfReader(pdf_file.name)
|
27 |
-
|
28 |
if len(pdf_reader.pages) == 0:
|
29 |
logger.warning("PDF has no pages")
|
30 |
return "The PDF file appears to be empty."
|
@@ -47,3 +84,29 @@ def extract_text_from_pdf(pdf_file: Optional[Any]) -> str:
|
|
47 |
error_msg = f"Error extracting text from PDF: {str(e)}"
|
48 |
logger.error(error_msg, exc_info=True)
|
49 |
return error_msg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import PyPDF2
|
2 |
from typing import Optional, Any
|
3 |
import logging
|
4 |
+
from llama_cloud_services import LlamaParse
|
5 |
+
import os
|
6 |
|
7 |
# Configure logging
|
8 |
+
logging.basicConfig(
|
9 |
+
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
10 |
+
)
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
13 |
|
14 |
+
def extract_text_from_pdf(pdf_file: str | None) -> str:
|
15 |
+
"""
|
16 |
+
uses llama_parse to extract text content from a PDF file.
|
17 |
+
if that fails, it falls back to PyPDF2.
|
18 |
+
|
19 |
+
Args:
|
20 |
+
pdf_file: Path to the PDF file.
|
21 |
+
|
22 |
+
Returns:
|
23 |
+
str: Extracted text content or an error message.
|
24 |
+
"""
|
25 |
+
if not pdf_file:
|
26 |
+
logger.warning("No PDF file provided")
|
27 |
+
return ""
|
28 |
+
try:
|
29 |
+
# Attempt to extract text using LlamaParse
|
30 |
+
text = extract_text_from_pdf_llama(pdf_file)
|
31 |
+
if text:
|
32 |
+
return text
|
33 |
+
else:
|
34 |
+
logger.info("LlamaParse did not return any text, falling back to PyPDF2")
|
35 |
+
except Exception as e:
|
36 |
+
logger.error(f"Error using LlamaParse: {str(e)}")
|
37 |
+
try:
|
38 |
+
# Fallback to PyPDF2 for text extraction
|
39 |
+
with open(pdf_file, "rb") as pdf_file_obj:
|
40 |
+
return extract_text_from_pdf_pypdf(pdf_file_obj)
|
41 |
+
except Exception as e:
|
42 |
+
error_msg = f"Error extracting text from PDF using PyPDF2: {str(e)}"
|
43 |
+
logger.error(error_msg, exc_info=True)
|
44 |
+
return ""
|
45 |
+
|
46 |
+
|
47 |
+
def extract_text_from_pdf_pypdf(pdf_file: Optional[Any]) -> str:
|
48 |
"""
|
49 |
Extract text content from a PDF file.
|
50 |
+
|
51 |
Args:
|
52 |
pdf_file: A file-like object containing the PDF data.
|
53 |
+
|
54 |
Returns:
|
55 |
str: Extracted text content or an error message.
|
56 |
"""
|
57 |
if pdf_file is None:
|
58 |
logger.warning("No PDF file provided")
|
59 |
return ""
|
60 |
+
|
61 |
try:
|
62 |
# Create a PDF reader object
|
63 |
pdf_reader = PyPDF2.PdfReader(pdf_file.name)
|
64 |
+
|
65 |
if len(pdf_reader.pages) == 0:
|
66 |
logger.warning("PDF has no pages")
|
67 |
return "The PDF file appears to be empty."
|
|
|
84 |
error_msg = f"Error extracting text from PDF: {str(e)}"
|
85 |
logger.error(error_msg, exc_info=True)
|
86 |
return error_msg
|
87 |
+
|
88 |
+
|
89 |
+
# --- Tool: PDF to Text ---
|
90 |
+
def extract_text_from_pdf_llama(pdf_path: str) -> str | None:
|
91 |
+
"""
|
92 |
+
Extracts text from a PDF file using LlamaParse.
|
93 |
+
|
94 |
+
Args:
|
95 |
+
pdf_path (str): Path to the PDF file.
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
str: The extracted text from the PDF.
|
99 |
+
None: If no text could be extracted.
|
100 |
+
"""
|
101 |
+
parser = LlamaParse(
|
102 |
+
api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
|
103 |
+
num_workers=1,
|
104 |
+
verbose=True,
|
105 |
+
language="en",
|
106 |
+
)
|
107 |
+
result = parser.parse(pdf_path)
|
108 |
+
# Get all text as a single string
|
109 |
+
text_documents = result.get_text_documents(split_by_page=False)
|
110 |
+
if text_documents:
|
111 |
+
return "\n".join([doc.text for doc in text_documents])
|
112 |
+
return None
|
util/mistral_api_client.py
CHANGED
@@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
|
|
8 |
|
9 |
# Constants
|
10 |
MISTRAL_API_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
|
11 |
-
MISTRAL_MODEL = "
|
12 |
API_KEY_ENV_VAR = "MISTRAL_API_KEY"
|
13 |
|
14 |
|
|
|
8 |
|
9 |
# Constants
|
10 |
MISTRAL_API_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
|
11 |
+
MISTRAL_MODEL = "mistral-medium-latest"
|
12 |
API_KEY_ENV_VAR = "MISTRAL_API_KEY"
|
13 |
|
14 |
|