Spaces:
Running
Running
Move PDF content extraction to a separate module
Browse files- app.py +4 -26
- helpers/file_manager.py +40 -0
app.py
CHANGED
@@ -19,9 +19,9 @@ from dotenv import load_dotenv
|
|
19 |
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
|
20 |
from langchain_core.messages import HumanMessage
|
21 |
from langchain_core.prompts import ChatPromptTemplate
|
22 |
-
from pypdf import PdfReader
|
23 |
|
24 |
import global_config as gcfg
|
|
|
25 |
from global_config import GlobalConfig
|
26 |
from helpers import llm_helper, pptx_helper, text_helper
|
27 |
|
@@ -274,7 +274,9 @@ def set_up_chat_ui():
|
|
274 |
):
|
275 |
prompt_text = prompt.text or ''
|
276 |
if prompt['files']:
|
277 |
-
|
|
|
|
|
278 |
print(f'{prompt["files"]=}')
|
279 |
|
280 |
provider, llm_name = llm_helper.get_provider_model(
|
@@ -502,30 +504,6 @@ def generate_slide_deck(json_str: str) -> Union[pathlib.Path, None]:
|
|
502 |
return path
|
503 |
|
504 |
|
505 |
-
def get_pdf_contents(
|
506 |
-
pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
507 |
-
max_pages: int = GlobalConfig.MAX_PAGE_COUNT
|
508 |
-
) -> str:
|
509 |
-
"""
|
510 |
-
Extract the text contents from a PDF file.
|
511 |
-
|
512 |
-
:param pdf_file: The uploaded PDF file.
|
513 |
-
:param max_pages: The max no. of pages to extract contents from.
|
514 |
-
:return: The contents.
|
515 |
-
"""
|
516 |
-
|
517 |
-
print(f'{type(pdf_file)=}')
|
518 |
-
reader = PdfReader(pdf_file)
|
519 |
-
n_pages = min(max_pages, len(reader.pages))
|
520 |
-
text = ''
|
521 |
-
|
522 |
-
for page in range(n_pages):
|
523 |
-
page = reader.pages[page]
|
524 |
-
text += page.extract_text()
|
525 |
-
|
526 |
-
return text
|
527 |
-
|
528 |
-
|
529 |
def _is_it_refinement() -> bool:
|
530 |
"""
|
531 |
Whether it is the initial prompt or a refinement.
|
|
|
19 |
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
|
20 |
from langchain_core.messages import HumanMessage
|
21 |
from langchain_core.prompts import ChatPromptTemplate
|
|
|
22 |
|
23 |
import global_config as gcfg
|
24 |
+
import helpers.file_manager as filem
|
25 |
from global_config import GlobalConfig
|
26 |
from helpers import llm_helper, pptx_helper, text_helper
|
27 |
|
|
|
274 |
):
|
275 |
prompt_text = prompt.text or ''
|
276 |
if prompt['files']:
|
277 |
+
# Streamlit keeps uploaded files in memory and clears them when the browser tab is closed
|
278 |
+
# https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
|
279 |
+
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(prompt['files'][0])
|
280 |
print(f'{prompt["files"]=}')
|
281 |
|
282 |
provider, llm_name = llm_helper.get_provider_model(
|
|
|
504 |
return path
|
505 |
|
506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
def _is_it_refinement() -> bool:
|
508 |
"""
|
509 |
Whether it is the initial prompt or a refinement.
|
helpers/file_manager.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
File manager helper to work with uploaded files.
|
3 |
+
"""
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
|
8 |
+
import streamlit as st
|
9 |
+
from pypdf import PdfReader
|
10 |
+
|
11 |
+
sys.path.append('..')
|
12 |
+
sys.path.append('../..')
|
13 |
+
|
14 |
+
from global_config import GlobalConfig
|
15 |
+
|
16 |
+
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
|
20 |
+
def get_pdf_contents(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        max_pages: int = GlobalConfig.MAX_PAGE_COUNT
) -> str:
    """
    Extract the text contents from a PDF file.

    Only the first `max_pages` pages are read; if the document has fewer pages,
    all of them are read.

    :param pdf_file: The uploaded PDF file.
    :param max_pages: The max no. of pages to extract contents from.
    :return: The text of the pages read, concatenated in page order.
    """

    reader = PdfReader(pdf_file)
    pages = reader.pages
    # Never read past the end of the document, nor beyond the configured limit.
    n_pages = min(max_pages, len(pages))

    # Join once instead of growing a string with `+=` inside the loop.
    # `or ''` is defensive: a page yielding no text (e.g., image-only) must not
    # break the concatenation.
    return ''.join(
        pages[idx].extract_text() or ''
        for idx in range(n_pages)
    )
|