import os
import re

import gradio as gr
import spacy
from PyPDF2 import PdfReader
from spacy.cli import download

# Load the spaCy English model, downloading it first if it is not installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def pdf_to_txt(pdf_file_path, txt_file_path):
    """Extract text from a PDF page by page and write it to a text file."""
    with open(pdf_file_path, "rb") as filehandle, \
            open(txt_file_path, mode='w', encoding='utf-8') as output:
        pdf = PdfReader(filehandle)
        for page_number, page in enumerate(pdf.pages):
            print(f"Page: {page_number + 1}", file=output)
            print('', file=output)
            print(page.extract_text(), file=output)
            print('', file=output)
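# Note: extract_text() relies on the PDF having a text layer; for scanned or
# image-only pages it typically returns an empty string, in which case an OCR
# step (e.g. pytesseract over rendered page images) would be needed instead.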
def clean_text(text):
    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters, but keep sentence-ending punctuation so
    # spaCy can still segment the text into sentences downstream
    text = re.sub(r'[^a-zA-Z0-9\s.!?]', '', text)
    return text
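# Illustrative example of what clean_text does (given the regexes above):
#   clean_text("Hello,\n\n  world!  ")  ->  "Hello world! "
# Runs of whitespace collapse to single spaces and the comma is stripped,
# while the sentence-ending "!" survives for sentence splitting.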
def chunk_text(text):
    # Use spaCy's sentence segmentation to split the text into chunks
    doc = nlp(text)
    chunks = [sent.text for sent in doc.sents]
    return chunks
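# Note: spaCy raises an error for texts longer than nlp.max_length characters
# (1,000,000 by default). For very large PDFs you may need to raise the limit,
# e.g. nlp.max_length = 2_000_000, or feed the text to chunk_text in pieces.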
def lemmatize_chunk(chunk):
    # Replace each token with its lemma (dictionary form)
    doc = nlp(chunk)
    lemmatized_chunk = ' '.join(token.lemma_ for token in doc if token.lemma_)
    return lemmatized_chunk
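# Illustrative example: with en_core_web_sm loaded, something like
#   lemmatize_chunk("The cats were running.")
# should return roughly "the cat be run ." (exact output depends on the
# model version).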
def process_large_pdf(file):
    # Derive output paths from the uploaded file's path (minus extension)
    file_name = os.path.splitext(file.name)[0]

    # Convert the PDF to text page by page
    temp_txt_path = f"{file_name}_temp.txt"
    pdf_to_txt(file.name, temp_txt_path)

    # Load the extracted text
    with open(temp_txt_path, 'r', encoding='utf-8') as file_txt:
        text = file_txt.read()

    # Clean the text
    cleaned_text = clean_text(text)

    # Save the cleaned text
    cleaned_txt_path = f"{file_name}_cleaned.txt"
    with open(cleaned_txt_path, 'w', encoding='utf-8') as file_cleaned:
        file_cleaned.write(cleaned_text)

    # Split the cleaned text into sentence chunks and lemmatize each one
    chunks = chunk_text(cleaned_text)
    lemmatized_chunks = [lemmatize_chunk(chunk) for chunk in chunks]

    # Save the lemmatized chunks, one per line
    lemmatized_chunks_path = f"{file_name}_lemmatized.txt"
    with open(lemmatized_chunks_path, 'w', encoding='utf-8') as file_lemmatized:
        for chunk in lemmatized_chunks:
            file_lemmatized.write(chunk + '\n')

    # Remove the temporary text file
    os.remove(temp_txt_path)

    # Return the cleaned text file for download
    return cleaned_txt_path
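# Note: the lemmatized file is written to disk but only the cleaned file is
# returned for download. To offer both, one could return
#   [cleaned_txt_path, lemmatized_chunks_path]
# and wire the click handler to two gr.File outputs (a sketch, not the app's
# current behavior).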
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Processing App")
    with gr.Column():
        file_obj = gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"])
        submit_button = gr.Button("Process PDF")
        output_file = gr.File(label="Download Cleaned Text File")
    submit_button.click(
        process_large_pdf,
        inputs=file_obj,
        outputs=output_file,
    )

if __name__ == "__main__":
    demo.launch()
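# Optional: demo.launch(share=True) would also create a temporary public
# link, which can be handy when testing outside of Hugging Face Spaces.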