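"""PDF Text Processing App.

A small Gradio app that converts an uploaded PDF to plain text page by
page, cleans the text, splits it into sentence chunks with spaCy, and
lemmatizes each chunk. The cleaned text file is returned for download;
the lemmatized chunks are saved alongside it.
"""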
import os
import re

import gradio as gr
import spacy
from PyPDF2 import PdfReader

# Load the spaCy model, downloading it first only if it is not already
# installed (avoids re-downloading on every launch).
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def pdf_to_txt(pdf_file_path, txt_file_path):
    """Extract text from a PDF page by page and write it to a text file."""
    with open(pdf_file_path, "rb") as filehandle, open(txt_file_path, mode='w', encoding='UTF-8') as output:
        pdf = PdfReader(filehandle)
        for page_number, page in enumerate(pdf.pages):
            print(f"Page: {page_number+1}", file=output)
            print('', file=output)
            print(page.extract_text(), file=output)
            print('', file=output)
def clean_text(text):
    """Collapse whitespace and strip special characters from text."""
    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters, but keep basic sentence punctuation
    # so that spaCy can still split the text into sentences later.
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    return text
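# Example (with the punctuation-preserving regex above, an adjustment made
# in this sketch): clean_text("Hello,\n   world! ***") -> "Hello, world! "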
def chunk_text(text):
    """Split text into sentence-level chunks using spaCy."""
    # Note: spaCy raises an error for texts longer than nlp.max_length
    # (1,000,000 characters by default); very large PDFs may need to be
    # processed in smaller pieces or with an increased limit.
    doc = nlp(text)
    chunks = [sent.text for sent in doc.sents]
    return chunks
def lemmatize_chunk(chunk):
    """Lemmatize every token in a chunk and rejoin them into a string."""
    doc = nlp(chunk)
    lemmatized_chunk = ' '.join(token.lemma_ for token in doc if token.lemma_ != '')
    return lemmatized_chunk
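# Example: lemmatize_chunk("The cats were running quickly") typically
# yields "the cat be run quickly" with the en_core_web_sm model.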
def process_large_pdf(file):
    """Full pipeline: PDF -> raw text -> cleaned text -> lemmatized chunks."""
    # Depending on the Gradio version, the file input may arrive as a path
    # string or as a tempfile wrapper with a .name attribute; handle both.
    pdf_path = file if isinstance(file, str) else file.name
    # Extract the file path without its extension
    file_name = os.path.splitext(pdf_path)[0]
    # Convert PDF to text page by page
    temp_txt_path = f"{file_name}_temp.txt"
    pdf_to_txt(pdf_path, temp_txt_path)
    # Load the text file
    with open(temp_txt_path, 'r', encoding='UTF-8') as file_txt:
        text = file_txt.read()
    # Clean the text
    cleaned_text = clean_text(text)
    # Save the cleaned text
    cleaned_txt_path = f"{file_name}_cleaned.txt"
    with open(cleaned_txt_path, 'w', encoding='UTF-8') as file_cleaned:
        file_cleaned.write(cleaned_text)
    # Chunk the text into sentences
    chunks = chunk_text(cleaned_text)
    # Lemmatize each chunk
    lemmatized_chunks = [lemmatize_chunk(chunk) for chunk in chunks]
    # Save the lemmatized chunks, one per line
    lemmatized_chunks_path = f"{file_name}_lemmatized.txt"
    with open(lemmatized_chunks_path, 'w', encoding='UTF-8') as file_lemmatized:
        for chunk in lemmatized_chunks:
            file_lemmatized.write(chunk + '\n')
    # Remove the temporary text file
    os.remove(temp_txt_path)
    # Return the cleaned text file for download
    return cleaned_txt_path
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Processing App")
    with gr.Column():
        file_obj = gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"])
        submit_button = gr.Button("Process PDF")
        output_file = gr.File(label="Download Cleaned Text File")
    submit_button.click(
        process_large_pdf,
        inputs=file_obj,
        outputs=output_file,
    )

if __name__ == "__main__":
    demo.launch()
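# To run locally (assuming gradio, PyPDF2, and spacy are installed):
#   pip install gradio PyPDF2 spacy
#   python app.py
# Upload a PDF in the browser UI; the cleaned text file is offered for
# download, and a *_lemmatized.txt file is written next to it.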