import os
import tempfile

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image


def extract_text(file_path):
    """Extract text from a PDF via its text layer and via OCR, then display it.

    The embedded text of every page is read with PyPDF2. Every page is then
    rendered to an image and passed through Tesseract OCR. The two results
    are concatenated (embedded text first, OCR text second) and written to
    the Streamlit page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The combined text that was written to the page.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # The reader pulls page data from the open file, so extraction must
        # finish inside this block. `or ""` guards against a page that
        # yields no text object at all, which would break the join.
        embedded_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

    # Render each page to an image so text that exists only as pixels
    # (e.g. scanned pages) is still picked up by OCR.
    page_images = convert_from_path(file_path)
    ocr_text = "".join(
        pytesseract.image_to_string(image) for image in page_images
    )

    text = embedded_text + ocr_text
    st.write("plus")
    st.write(text)  # Display the combined embedded + OCR text
    return text

def main():
    """Render the upload UI and run text extraction when requested.

    Shows a PDF uploader; once a file is present, shows an "Extract Text"
    button. When the button is clicked, the upload is copied to a temporary
    ``.pdf`` file, passed to ``extract_text``, and the temporary file is
    removed afterwards.
    """
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    st.success("File successfully uploaded. Click below to extract text.")
    if not st.button("Extract Text"):
        return

    # Streamlit reruns this script on every interaction, so the temporary
    # copy is created only once extraction is actually requested; creating
    # it unconditionally would leave a new file behind on each rerun.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        # getvalue() returns the whole upload regardless of the current
        # read position, unlike read().
        temp_file.write(uploaded_file.getvalue())
        temp_path = temp_file.name

    # The file is closed (and therefore flushed) before it is reopened by
    # path; it is always removed, even if extraction fails.
    try:
        extract_text(temp_path)
    finally:
        os.remove(temp_path)


# Build the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()