Spaces:

AlirezaF138
/

Hazm

Running

App Files Files Community

Hazm / app.py

AlirezaF138

Update app.py

d8cb719 verified 6 months ago

raw

history blame contribute delete

4.65 kB

	import gradio as gr
	from hazm import Normalizer, word_tokenize, Lemmatizer, POSTagger, Chunker

	# Shared Hazm components, created once at import time and reused by process_text.
	# The tagger and chunker are loaded from model files given as relative paths.
	POS_TAGGER_MODEL_PATH = 'resources/pos_tagger.model'
	CHUNKER_MODEL_PATH = 'resources/chunker.model'

	lemmatizer = Lemmatizer()
	pos_tagger = POSTagger(model=POS_TAGGER_MODEL_PATH)
	chunker = Chunker(model=CHUNKER_MODEL_PATH)

	def process_text(text, operation, correct_spacing, remove_diacritics, remove_specials_chars, decrease_repeated_chars, persian_style, persian_numbers, unicodes_replacement, seperate_mi):
	    """Run the selected Hazm operation on ``text`` and return the result as a string.

	    Args:
	        text: The input text to process.
	        operation: One of ``"normalize"``, ``"tokenize"``, ``"lemmatize"``,
	            ``"chunk"`` or ``"pos_tag"``.
	        correct_spacing, remove_diacritics, remove_specials_chars,
	        decrease_repeated_chars, persian_style, persian_numbers,
	        unicodes_replacement, seperate_mi: Flags forwarded unchanged to the
	            ``Normalizer`` constructor. They are only consulted when
	            ``operation`` is ``"normalize"``.

	    Returns:
	        For ``"normalize"``, the output of ``Normalizer.normalize``.
	        For ``"tokenize"`` and ``"lemmatize"``, the tokens/lemmas joined by
	        single spaces. For ``"pos_tag"``, space-separated ``token/TAG`` pairs.
	        For ``"chunk"``, ``str()`` of the chunker's parse result.
	        For any other ``operation``, an empty string.
	    """
	    if operation == "normalize":
	        # The Normalizer is configured per request from the checkboxes, but no
	        # other operation uses it, so it is only constructed on this path.
	        normalizer = Normalizer(
	            correct_spacing=correct_spacing,
	            remove_diacritics=remove_diacritics,
	            remove_specials_chars=remove_specials_chars,
	            decrease_repeated_chars=decrease_repeated_chars,
	            persian_style=persian_style,
	            persian_numbers=persian_numbers,
	            unicodes_replacement=unicodes_replacement,
	            seperate_mi=seperate_mi,
	        )
	        return normalizer.normalize(text)

	    if operation == "tokenize":
	        # Show tokens as a space-separated string.
	        return " ".join(word_tokenize(text))

	    if operation == "lemmatize":
	        # Lemmatize token by token and show the lemmas space-separated.
	        return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

	    if operation in ("chunk", "pos_tag"):
	        # Both operations start from the POS-tagged token sequence.
	        pos_tags = pos_tagger.tag(word_tokenize(text))
	        if operation == "pos_tag":
	            # Format: token/POS
	            return " ".join(f"{token}/{tag}" for token, tag in pos_tags)
	        # The chunker consumes the tagged tokens; its result is shown as text.
	        return str(chunker.parse(pos_tags))

	    # Unrecognized operation: nothing to show.
	    return ""

	def toggle_normalization_options(operation):
	    """Return the visibility update for the normalization options panel.

	    This callback is registered on ``operation.change`` with a single output
	    component (the ``normalization_options`` column), so it must return exactly
	    one update rather than one per checkbox. Hiding the column hides every
	    checkbox it contains.

	    Args:
	        operation: The currently selected operation name.

	    Returns:
	        A ``gr.update`` that makes the panel visible only when ``operation``
	        is ``"normalize"``.
	    """
	    is_normalize = (operation == "normalize")
	    return gr.update(visible=is_normalize)

	# Build the Gradio UI: input box, operation selector, normalization options,
	# output box and the button that runs process_text.
	with gr.Blocks() as demo:
	    gr.Markdown("# Persian Text Processor with Hazm")
	    gr.Markdown("Select an operation and, if applicable, adjust normalization parameters to process the input text using Hazm.")

	    with gr.Row():
	        input_text = gr.Textbox(
	            label="Input Text",
	            placeholder="Enter Persian text here...",
	            lines=10,
	        )

	    with gr.Row():
	        operation = gr.Radio(
	            label="Select Operation",
	            info="Choose the type of text processing operation to perform.",
	            choices=['normalize', 'tokenize', 'lemmatize', 'chunk', 'pos_tag'],
	            value='normalize',
	        )

	    # One checkbox per Normalizer flag, all enabled by default. They are created
	    # in the same order as the corresponding process_text parameters.
	    with gr.Column(visible=True) as normalization_options:
	        (
	            correct_spacing,
	            remove_diacritics,
	            remove_specials_chars,
	            decrease_repeated_chars,
	            persian_style,
	            persian_numbers,
	            unicodes_replacement,
	            seperate_mi,
	        ) = [
	            gr.Checkbox(value=True, label=label, info=info)
	            for label, info in (
	                ("Correct Spacing", "Adjusts spaces between words for proper formatting."),
	                ("Remove Diacritics", "Eliminates diacritical marks from the text."),
	                ("Remove Special Characters", "Removes non-alphanumeric characters."),
	                ("Decrease Repeated Characters", "Reduces sequences of repeated characters to a single character."),
	                ("Persian Style", "Applies standard Persian typography rules."),
	                ("Persian Numbers", "Converts Arabic numbers to Persian numbers."),
	                ("Unicodes Replacement", "Replaces characters with their standard Unicode equivalents."),
	                ("Separate 'می'", "Separates the Persian prefix 'می' from verbs."),
	            )
	        ]

	    # Re-evaluate the options panel whenever a different operation is picked.
	    operation.change(
	        fn=toggle_normalization_options,
	        inputs=operation,
	        outputs=normalization_options,
	    )

	    output_text = gr.Textbox(
	        label="Processed Text",
	        lines=10,
	        interactive=False,
	        show_copy_button=True,
	    )

	    # The input order below must match the parameter order of process_text.
	    process_inputs = [
	        input_text,
	        operation,
	        correct_spacing,
	        remove_diacritics,
	        remove_specials_chars,
	        decrease_repeated_chars,
	        persian_style,
	        persian_numbers,
	        unicodes_replacement,
	        seperate_mi,
	    ]
	    submit_button = gr.Button("Process Text")
	    submit_button.click(fn=process_text, inputs=process_inputs, outputs=output_text)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()