Spaces:

AlirezaF138
/

Persian-OCR

Running

App Files Files Community

Persian-OCR / app.py

AlirezaF138

Update app.py

75183d4 verified 11 months ago

raw

history blame

2.36 kB

	import gradio as gr
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import os

	# Function to perform OCR
	def ocr(input_file, lang='fas'):  # 'fas': Persian language (Farsi)
	    """Run Tesseract OCR on a PDF file or an image and return the text.

	    Args:
	        input_file: One of
	            * a filesystem path (``str`` or ``os.PathLike``). A path whose
	              name ends in ``.pdf`` (any letter case) is converted to page
	              images with ``convert_from_path``; any other path is opened
	              as an image with ``Image.open``;
	            * an already loaded ``PIL.Image.Image``;
	            * ``None`` (nothing supplied).
	        lang: Language code forwarded to ``pytesseract.image_to_string``.

	    Returns:
	        The recognised text. For a PDF the text of all pages is
	        concatenated in page order. An empty string is returned for
	        ``None`` and for any unsupported input type.
	    """
	    # Nothing to recognise; keep the "empty result" contract.
	    if input_file is None:
	        return ""

	    if isinstance(input_file, (str, os.PathLike)):
	        path = os.fsdecode(input_file)

	        # Case-insensitive so that "SCAN.PDF" is treated as a PDF too.
	        if path.lower().endswith('.pdf'):
	            pages = convert_from_path(path)
	            return "".join(
	                pytesseract.image_to_string(page, lang=lang) for page in pages
	            )

	        # Any other path is treated as an image file; the context manager
	        # closes the underlying file once OCR is done.
	        with Image.open(path) as image:
	            return pytesseract.image_to_string(image, lang=lang)

	    if isinstance(input_file, Image.Image):
	        return pytesseract.image_to_string(input_file, lang=lang)

	    # Unsupported input type.
	    return ""

	def gradio_interface():
	    """Build the OCR form and launch it as a Gradio interface.

	    The form consists of an input-type selector ("PDF" or "Image"), a file
	    upload and an OCR language dropdown (default: Persian, ``"fas"``). The
	    recognised text is shown in a read-only textbox. ``launch()`` is called
	    on the interface before this function returns.
	    """
	    # Define Gradio inputs and outputs
	    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
	    file_input = gr.File(label="Upload PDF/Image")
	    language_input = gr.Dropdown(
	        label="Select OCR Language",
	        choices=[
	            ("English", "eng"),
	            ("Mandarin Chinese", "chi_sim"),
	            ("Hindi", "hin"),
	            ("Spanish", "spa"),
	            ("French", "fra"),
	            ("Standard Arabic", "ara"),
	            ("Bengali", "ben"),
	            ("Portuguese", "por"),
	            ("Russian", "rus"),
	            ("Urdu", "urd"),
	            ("Persian (Farsi)", "fas"),
	        ],
	        value="fas",  # Default to Persian
	    )
	    output_text = gr.Textbox(label="Extracted Text", interactive=False)

	    def process(input_type, file, lang):
	        """Run OCR on the uploaded file and return the extracted text.

	        Args:
	            input_type: ``"PDF"`` to treat the upload as a PDF; any other
	                value treats it as an image.
	            file: Value delivered by the file-upload component, or ``None``
	                when nothing was uploaded.
	            lang: OCR language code chosen in the dropdown.

	        Raises:
	            gr.Error: If no file was uploaded.
	        """
	        if file is None:
	            # Show a readable message instead of failing on ``None.name``.
	            raise gr.Error("Please upload a PDF or image file first.")

	        # The upload may arrive as an object exposing ``.name`` or as a
	        # plain path string; accept both.
	        path = getattr(file, "name", file)

	        if input_type == "PDF":
	            return ocr(path, lang)

	        # Close the image file as soon as OCR has finished.
	        with Image.open(path) as image:
	            return ocr(image, lang)

	    # Create and launch Gradio interface
	    gr.Interface(
	        fn=process,
	        inputs=[input_type, file_input, language_input],
	        outputs=[output_text],
	        title="OCR (PDF/Image)",
	        description="Upload a PDF or Image, select the OCR language, and extract the text.",
	    ).launch()


	# Build and launch the interface only when this file is executed as a
	# script, so that importing the module does not start a web server.
	if __name__ == "__main__":
	    gradio_interface()