import os
import re

import gradio as gr
import spacy
from PyPDF2 import PdfReader
from spacy.cli import download

# Load the spaCy English model, downloading it first if it is not installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def pdf_to_txt(pdf_file_path, txt_file_path):
    """Extract text from a PDF page by page and write it to a text file."""
    with open(pdf_file_path, "rb") as filehandle, \
            open(txt_file_path, mode='w', encoding='utf-8') as output:
        pdf = PdfReader(filehandle)
        for page_number, page in enumerate(pdf.pages):
            print(f"Page: {page_number + 1}", file=output)
            print('', file=output)
            print(page.extract_text(), file=output)
            print('', file=output)
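# Note: extract_text() relies on the PDF having a text layer; for scanned or
# image-only pages it typically returns an empty string, in which case an OCR
# step (e.g. pytesseract over rendered page images) would be needed instead.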
def clean_text(text):
    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters, but keep sentence-ending punctuation so
    # spaCy can still segment the text into sentences downstream
    text = re.sub(r'[^a-zA-Z0-9\s.!?]', '', text)
    return text
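# Illustrative example of what clean_text does (given the regexes above):
#   clean_text("Hello,\n\n  world!  ")  ->  "Hello world! "
# Runs of whitespace collapse to single spaces and the comma is stripped,
# while the sentence-ending "!" survives for sentence splitting.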
def chunk_text(text):
    # Use spaCy's sentence segmentation to split the text into chunks
    doc = nlp(text)
    chunks = [sent.text for sent in doc.sents]
    return chunks
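# Note: spaCy raises an error for texts longer than nlp.max_length characters
# (1,000,000 by default). For very large PDFs you may need to raise the limit,
# e.g. nlp.max_length = 2_000_000, or feed the text to chunk_text in pieces.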
def lemmatize_chunk(chunk):
    # Replace each token with its lemma (dictionary form)
    doc = nlp(chunk)
    lemmatized_chunk = ' '.join(token.lemma_ for token in doc if token.lemma_)
    return lemmatized_chunk
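# Illustrative example: with en_core_web_sm loaded, something like
#   lemmatize_chunk("The cats were running.")
# should return roughly "the cat be run ." (exact output depends on the
# model version).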
def process_large_pdf(file):
    # Derive output paths from the uploaded file's path (minus extension)
    file_name = os.path.splitext(file.name)[0]

    # Convert the PDF to text page by page
    temp_txt_path = f"{file_name}_temp.txt"
    pdf_to_txt(file.name, temp_txt_path)

    # Load the extracted text
    with open(temp_txt_path, 'r', encoding='utf-8') as file_txt:
        text = file_txt.read()

    # Clean the text
    cleaned_text = clean_text(text)

    # Save the cleaned text
    cleaned_txt_path = f"{file_name}_cleaned.txt"
    with open(cleaned_txt_path, 'w', encoding='utf-8') as file_cleaned:
        file_cleaned.write(cleaned_text)

    # Split the cleaned text into sentence chunks and lemmatize each one
    chunks = chunk_text(cleaned_text)
    lemmatized_chunks = [lemmatize_chunk(chunk) for chunk in chunks]

    # Save the lemmatized chunks, one per line
    lemmatized_chunks_path = f"{file_name}_lemmatized.txt"
    with open(lemmatized_chunks_path, 'w', encoding='utf-8') as file_lemmatized:
        for chunk in lemmatized_chunks:
            file_lemmatized.write(chunk + '\n')

    # Remove the temporary text file
    os.remove(temp_txt_path)

    # Return the cleaned text file for download
    return cleaned_txt_path
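# Note: the lemmatized file is written to disk but only the cleaned file is
# returned for download. To offer both, one could return
#   [cleaned_txt_path, lemmatized_chunks_path]
# and wire the click handler to two gr.File outputs (a sketch, not the app's
# current behavior).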
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Processing App")
    with gr.Column():
        file_obj = gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"])
        submit_button = gr.Button("Process PDF")
        output_file = gr.File(label="Download Cleaned Text File")
    submit_button.click(
        process_large_pdf,
        inputs=file_obj,
        outputs=output_file,
    )

if __name__ == "__main__":
    demo.launch()
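# Optional: demo.launch(share=True) would also create a temporary public
# link, which can be handy when testing outside of Hugging Face Spaces.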