import os
import re

import gradio as gr
import spacy
from PyPDF2 import PdfReader


def _load_spacy_model(name: str = "en_core_web_sm"):
    """Load a spaCy model, downloading it only if it is not installed.

    The original code ran ``spacy.cli.download`` unconditionally on every
    import, which is slow and needs network access each start.
    """
    try:
        return spacy.load(name)
    except OSError:
        # Model missing: fetch it once, then load.
        from spacy.cli import download
        download(name)
        return spacy.load(name)


# Shared pipeline used by chunking and lemmatization below.
nlp = _load_spacy_model()


def pdf_to_txt(pdf_file_path, txt_file_path):
    """Extract text from a PDF into a UTF-8 text file.

    Each page is preceded by a ``Page: N`` header and followed by a blank
    line, matching the original output format.
    """
    with open(pdf_file_path, "rb") as filehandle, \
            open(txt_file_path, mode='w', encoding='UTF-8') as output:
        pdf = PdfReader(filehandle)
        for page_number, page in enumerate(pdf.pages):
            print(f"Page: {page_number+1}", file=output)
            print('', file=output)
            # extract_text() may return None for image-only pages; avoid
            # writing the literal string "None" into the output.
            print(page.extract_text() or '', file=output)
            print('', file=output)


def clean_text(text):
    """Collapse all whitespace runs and drop non-alphanumeric characters."""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text


def chunk_text(text):
    """Split *text* into sentence strings using the spaCy pipeline."""
    doc = nlp(text)
    return [str(sent) for sent in doc.sents]


def lemmatize_chunk(chunk):
    """Return *chunk* with every token replaced by its lemma."""
    doc = nlp(chunk)
    return ' '.join(token.lemma_ for token in doc if token.lemma_ != '')


def process_large_pdf(file):
    """Convert an uploaded PDF to cleaned, chunked, lemmatized text.

    Writes ``<name>_cleaned.txt`` (returned for download) and
    ``<name>_lemmatized.txt`` as a side artifact; a temporary
    ``<name>_temp.txt`` is created and always removed.

    :param file: Gradio file object exposing the upload path via ``.name``.
    :return: path to the cleaned text file.
    """
    # splitext only strips the final extension; the original
    # split('.')[0] truncated at the FIRST dot in the whole path.
    file_name = os.path.splitext(file.name)[0]

    # Convert PDF to text page by page — reuse the module-level helper
    # instead of duplicating its body inline.
    temp_txt_path = f"{file_name}_temp.txt"
    pdf_to_txt(file.name, temp_txt_path)

    try:
        with open(temp_txt_path, 'r', encoding='UTF-8') as file_txt:
            text = file_txt.read()
    finally:
        # Remove the temporary file even if reading fails.
        os.remove(temp_txt_path)

    # Clean the text and persist the downloadable artifact.
    cleaned_text = clean_text(text)
    cleaned_txt_path = f"{file_name}_cleaned.txt"
    with open(cleaned_txt_path, 'w', encoding='UTF-8') as file_cleaned:
        file_cleaned.write(cleaned_text)

    # NOTE(review): clean_text strips sentence punctuation, so the
    # sentence segmentation below runs on unpunctuated text and chunk
    # quality suffers; consider chunking before cleaning — confirm intent.
    chunks = chunk_text(cleaned_text)
    lemmatized_chunks = [lemmatize_chunk(chunk) for chunk in chunks]

    # Save the lemmatized chunks, one per line.
    lemmatized_chunks_path = f"{file_name}_lemmatized.txt"
    with open(lemmatized_chunks_path, 'w', encoding='UTF-8') as file_lemmatized:
        file_lemmatized.writelines(chunk + '\n' for chunk in lemmatized_chunks)

    # Return the cleaned text file for download
    return cleaned_txt_path


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Processing App")
    with gr.Column():
        file_obj = gr.File(label="Upload PDF File", file_count="single",
                           file_types=[".pdf"])
        submit_button = gr.Button("Process PDF")
        output_file = gr.File(label="Download Cleaned Text File")
        submit_button.click(
            process_large_pdf,
            inputs=file_obj,
            outputs=output_file
        )

if __name__ == "__main__":
    demo.launch()