import os
import tempfile

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image


def _read_embedded_text(file_path):
    """Return the concatenated text layer of every page of the PDF at *file_path*."""
    with open(file_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Guard with `or ""` so a page that yields no text (None or empty)
        # cannot break the join.
        return "".join(page.extract_text() or "" for page in reader.pages)


def _read_ocr_text(file_path):
    """Return the OCR output of every page of the PDF at *file_path*, concatenated."""
    # convert_from_path renders the PDF pages to images; each image is then
    # passed through pytesseract.
    page_images = convert_from_path(file_path)
    return "".join(pytesseract.image_to_string(image) for image in page_images)


def extract_text(file_path):
    """Extract text from a PDF and display it in the Streamlit app.

    The result is the PDF's embedded text (via PyPDF2) followed by the OCR
    text of the rendered pages (via pdf2image + pytesseract).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The combined text that was written to the app.
    """
    text = _read_embedded_text(file_path) + _read_ocr_text(file_path)
    # The "plus" label precedes the combined (embedded + OCR) output.
    st.write("plus")
    st.write(text)
    return text


def _extract_from_upload(pdf_bytes):
    """Button callback: run extract_text on *pdf_bytes* via a short-lived temp file.

    extract_text needs a filesystem path, so the bytes are spilled to a
    temporary file that is always removed afterwards, even if extraction fails.
    """
    # delete=False so the file survives being closed and can be reopened by
    # path; removal is handled explicitly in the `finally` below.
    temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with temp_file:
            temp_file.write(pdf_bytes)
        extract_text(temp_file.name)
    finally:
        os.remove(temp_file.name)


def main():
    """Render the uploader UI and wire the "Extract Text" button to extraction."""
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    st.success("File successfully uploaded. Click below to extract text.")
    # Hand the raw bytes to the callback instead of a temp-file path: the temp
    # file is then created only when the button is clicked, and cleaned up
    # right after. getvalue() does not depend on the current stream position.
    st.button(
        "Extract Text",
        on_click=_extract_from_upload,
        args=(uploaded_file.getvalue(),),
    )


if __name__ == "__main__":
    main()