Spaces:

TDN-M
/

read

Runtime error

App Files Files Community

read / app.py

TDN-M

Update app.py

9f66a8c verified 3 months ago

raw

history blame

3.5 kB

	import gradio as gr
	import PyPDF2
	import docx
	import os
	from pathlib import Path
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import tempfile

	def _ocr_pdf(pdf_path):
	    """Convert the PDF at *pdf_path* to page images and OCR each one.

	    Returns the recognized text of all pages, each followed by a newline.
	    """
	    # lang='eng+vie' requests English and Vietnamese recognition together.
	    return "".join(
	        pytesseract.image_to_string(page_image, lang='eng+vie') + "\n"
	        for page_image in convert_from_path(pdf_path)
	    )


	def _extract_pdf_text(pdf_path):
	    """Return the text PyPDF2 extracts from the PDF, one newline per page.

	    Pages whose extracted text is empty or whitespace-only are skipped.
	    """
	    chunks = []
	    # Keep the file open while iterating: the reader pulls pages from it.
	    with open(pdf_path, "rb") as f:
	        for page in PyPDF2.PdfReader(f).pages:
	            text = page.extract_text()
	            if text and text.strip():
	                chunks.append(text + "\n")
	    return "".join(chunks)


	def convert_to_txt(file):
	    """Convert an uploaded PDF, DOC or DOCX file into a UTF-8 text file.

	    PDFs are first read through PyPDF2; when that yields no text (or
	    fails), the pages are OCR'd instead. DOC/DOCX files are read paragraph
	    by paragraph through ``docx.Document``.

	    Args:
	        file: Upload object exposing the uploaded file's path as ``.name``.

	    Returns:
	        str: On success, the path of the generated ``<stem>_converted.txt``
	        file. Otherwise a user-facing message describing the problem
	        (missing upload, unsupported extension, or processing error).
	    """
	    # Guard: nothing uploaded, or the object carries no file path.
	    if file is None or not hasattr(file, 'name'):
	        return "Vui lòng tải lên một file PDF, DOC hoặc DOCX hợp lệ."

	    source_path = Path(file.name)
	    file_ext = source_path.suffix.lower()

	    try:
	        if file_ext == ".pdf":
	            try:
	                output_txt = _extract_pdf_text(file.name)
	            except Exception:
	                # Direct extraction failed part-way: drop any partial text
	                # so the OCR result below is not appended to a fragment.
	                output_txt = ""
	            # No usable text layer (e.g. a scanned PDF): fall back to OCR.
	            if not output_txt.strip():
	                output_txt = _ocr_pdf(file.name)

	        elif file_ext in (".doc", ".docx"):
	            # NOTE(review): python-docx presumably only parses .docx; a
	            # legacy .doc would then surface through the handler below.
	            doc = docx.Document(file.name)
	            output_txt = "".join(para.text + "\n" for para in doc.paragraphs)

	        else:
	            return "Định dạng file không được hỗ trợ. Vui lòng tải lên file PDF, DOC hoặc DOCX."

	        # Write into a fresh temporary directory instead of the current
	        # working directory: the CWD may not be writable, and two uploads
	        # sharing a file name would otherwise overwrite each other.
	        output_filename = os.path.join(
	            tempfile.mkdtemp(), source_path.stem + "_converted.txt"
	        )
	        with open(output_filename, "w", encoding="utf-8") as f:
	            f.write(output_txt)

	        return output_filename

	    except Exception as e:
	        # Top-level boundary: report the failure as a message rather than
	        # letting the exception escape to the caller.
	        return f"Đã xảy ra lỗi khi xử lý file: {str(e)}"

	# Path of a "flagged" folder under the system temporary directory.
	# Only the path is computed here; nothing in this block creates the
	# directory or hands it to Gradio.
	temp_dir = tempfile.gettempdir()
	flagging_dir = os.path.join(temp_dir, "flagged")

	# Build the Gradio interface: two Markdown headers, an upload widget, a
	# download widget and a button that runs the conversion.
	with gr.Blocks() as iface:
	    gr.Markdown("# Chuyển đổi PDF/DOC/DOCX sang TXT (Hỗ trợ OCR)")
	    gr.Markdown("Tải lên file PDF, DOC hoặc DOCX để chuyển đổi nội dung thành file TXT. Hỗ trợ OCR cho PDF dạng ảnh.")
	    file_input = gr.File(
	        label="Tải lên file PDF, DOC hoặc DOCX",
	        file_types=[".pdf", ".doc", ".docx"],
	    )
	    file_output = gr.File(label="Tải xuống file TXT")
	    submit_button = gr.Button("Chuyển đổi")
	    # Clicking the button feeds the upload to convert_to_txt and shows the
	    # returned value in the download widget; the same handler is exposed
	    # under the API name "convert_to_txt".
	    submit_button.click(
	        convert_to_txt,
	        file_input,
	        file_output,
	        api_name="convert_to_txt",
	    )

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()