barunsaha committed on
Commit
1540d77
·
1 Parent(s): 04508ac

Move PDF content extraction to a separate module

Browse files
Files changed (2) hide show
  1. app.py +4 -26
  2. helpers/file_manager.py +40 -0
app.py CHANGED
@@ -19,9 +19,9 @@ from dotenv import load_dotenv
19
  from langchain_community.chat_message_histories import StreamlitChatMessageHistory
20
  from langchain_core.messages import HumanMessage
21
  from langchain_core.prompts import ChatPromptTemplate
22
- from pypdf import PdfReader
23
 
24
  import global_config as gcfg
 
25
  from global_config import GlobalConfig
26
  from helpers import llm_helper, pptx_helper, text_helper
27
 
@@ -274,7 +274,9 @@ def set_up_chat_ui():
274
  ):
275
  prompt_text = prompt.text or ''
276
  if prompt['files']:
277
- st.session_state[ADDITIONAL_INFO] = get_pdf_contents(prompt['files'][0])
 
 
278
  print(f'{prompt["files"]=}')
279
 
280
  provider, llm_name = llm_helper.get_provider_model(
@@ -502,30 +504,6 @@ def generate_slide_deck(json_str: str) -> Union[pathlib.Path, None]:
502
  return path
503
 
504
 
505
- def get_pdf_contents(
506
- pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
507
- max_pages: int = GlobalConfig.MAX_PAGE_COUNT
508
- ) -> str:
509
- """
510
- Extract the text contents from a PDF file.
511
-
512
- :param pdf_file: The uploaded PDF file.
513
- :param max_pages: The max no. of pages to extract contents from.
514
- :return: The contents.
515
- """
516
-
517
- print(f'{type(pdf_file)=}')
518
- reader = PdfReader(pdf_file)
519
- n_pages = min(max_pages, len(reader.pages))
520
- text = ''
521
-
522
- for page in range(n_pages):
523
- page = reader.pages[page]
524
- text += page.extract_text()
525
-
526
- return text
527
-
528
-
529
  def _is_it_refinement() -> bool:
530
  """
531
  Whether it is the initial prompt or a refinement.
 
19
  from langchain_community.chat_message_histories import StreamlitChatMessageHistory
20
  from langchain_core.messages import HumanMessage
21
  from langchain_core.prompts import ChatPromptTemplate
 
22
 
23
  import global_config as gcfg
24
+ import helpers.file_manager as filem
25
  from global_config import GlobalConfig
26
  from helpers import llm_helper, pptx_helper, text_helper
27
 
 
274
  ):
275
  prompt_text = prompt.text or ''
276
  if prompt['files']:
277
+ # Apparently, Streamlit stores uploaded files in memory and clears on browser close
278
+ # https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
279
+ st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(prompt['files'][0])
280
  print(f'{prompt["files"]=}')
281
 
282
  provider, llm_name = llm_helper.get_provider_model(
 
504
  return path
505
 
506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  def _is_it_refinement() -> bool:
508
  """
509
  Whether it is the initial prompt or a refinement.
helpers/file_manager.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File manager helper to work with uploaded files.
3
+ """
4
+ import logging
5
+ import os
6
+ import sys
7
+
8
+ import streamlit as st
9
+ from pypdf import PdfReader
10
+
11
+ sys.path.append('..')
12
+ sys.path.append('../..')
13
+
14
+ from global_config import GlobalConfig
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def get_pdf_contents(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        max_pages: int = GlobalConfig.MAX_PAGE_COUNT
) -> str:
    """
    Extract the text contents from a PDF file.

    Only the first `max_pages` pages are read (fewer if the document is shorter). The text of
    the pages is concatenated in page order, without any separator between pages.

    :param pdf_file: The uploaded PDF file.
    :param max_pages: The max no. of pages to extract contents from.
    :return: The contents.
    """

    reader = PdfReader(pdf_file)
    pages = reader.pages
    # Never read past the end of the document; a non-positive limit yields no pages.
    n_pages = min(max_pages, len(pages))

    # Collect the per-page text and join once instead of growing a string with `+=`.
    # `or ''` keeps the join safe if a page produces no text.
    return ''.join(pages[idx].extract_text() or '' for idx in range(n_pages))