Spaces:

Diezu
/

Correct_spelling_mistakes_app

Sleeping

App Files Files Community

Correct_spelling_mistakes_app / app.py

Diezu

Update app.py

2fc88a6 verified 8 months ago

raw

history blame

3.93 kB

	import difflib
	import html

	import streamlit as st
	from transformers import pipeline
	from underthesea import word_tokenize  # underthesea provides Vietnamese word tokenization

	# Application configuration
	MAX_LENGTH = 512  # forwarded as max_length when the correction pipeline is called

	# Page-level settings, collected in one mapping and handed to Streamlit in a single call.
	_PAGE_SETTINGS = {
	    "page_title": "Demo Correct Spelling Mistakes",
	    "page_icon": "🤖",
	    "layout": "centered",
	    "initial_sidebar_state": "auto",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Custom CSS
	# A single <style> block covering the page body, the .main container, the h1
	# title, textareas and .stButton buttons. The .highlight class (red, bold) is
	# the one attached to the <span> elements built for corrected words.
	custom_css = """
	<style>
	body {
	background-color: #f4f4f4;
	font-family: 'Arial', sans-serif;
	}
	.main {
	background-color: #ffffff;
	padding: 20px;
	border-radius: 10px;
	box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	max-width: 800px;
	margin: 0 auto;
	}
	h1 {
	text-align: center;
	color: #4a90e2;
	}
	textarea {
	font-family: 'Courier New', Courier, monospace;
	font-size: 14px;
	color: #333;
	}
	.stButton button {
	background-color: #4a90e2;
	color: white;
	border: none;
	border-radius: 5px;
	padding: 10px 20px;
	font-size: 16px;
	cursor: pointer;
	}
	.stButton button:hover {
	background-color: #357ABD;
	}
	.markdown-text-container {
	margin-top: 20px;
	}
	.highlight {
	color: #d9534f;
	font-weight: bold;
	}
	</style>
	"""
	# The stylesheet is emitted through st.markdown with unsafe_allow_html=True
	# because it is raw HTML rather than Markdown text.
	st.markdown(custom_css, unsafe_allow_html=True)

	st.title("Correct Spelling Mistakes App")

	# Load the model
	model_checkpoint = "Diezu/bat_pho_bo"  # change to the checkpoint you want to serve


	@st.cache_resource
	def _load_corrector(checkpoint):
	    """Build the text2text-generation pipeline for *checkpoint*.

	    Streamlit re-executes this script on every widget interaction; caching
	    the pipeline as a resource means the model is constructed once per
	    checkpoint instead of on every rerun.

	    NOTE(review): a cached resource is shared between sessions, so this
	    assumes concurrent calls into the pipeline are acceptable -- confirm
	    for the deployment.
	    """
	    return pipeline("text2text-generation", model=checkpoint)


	correct_spelling = _load_corrector(model_checkpoint)

	def highlight_differences(original, corrected):
	    """Render *corrected* as HTML with the changed tokens highlighted.

	    The two token sequences are aligned with difflib. Tokens of *corrected*
	    that replace something in *original*, or that were inserted, are wrapped
	    in ``<span class='highlight'>``; all other tokens are emitted as-is.
	    Every token is HTML-escaped first, because the result is rendered with
	    ``unsafe_allow_html=True`` and the tokens come from user input and
	    model output.

	    Args:
	        original: sequence of tokens from the text the user typed.
	        corrected: sequence of tokens from the corrected text.

	    Returns:
	        A ``(html, indices)`` tuple: the space-joined HTML string and the
	        list of positions in *corrected* that were replaced or inserted.
	    """
	    highlighted_text = []
	    modified_indices = []
	    # autojunk=False: the default heuristic treats frequent tokens as junk
	    # once the second sequence reaches 200 items, which would flag
	    # unchanged common words in long inputs.
	    matcher = difflib.SequenceMatcher(None, original, corrected, autojunk=False)
	    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
	        words = [html.escape(word) for word in corrected[j1:j2]]
	        if tag in ("replace", "insert"):
	            # Corrected or newly added words are shown highlighted.
	            highlighted_text.extend(
	                f"<span class='highlight'>{word}</span>" for word in words
	            )
	            modified_indices.extend(range(j1, j2))
	        else:
	            # 'equal' keeps the words untouched; 'delete' contributes
	            # nothing because its corrected slice is empty.
	            highlighted_text.extend(words)

	    return " ".join(highlighted_text), modified_indices


	# User input
	context = st.text_area("Input text", placeholder="Nhập văn bản có lỗi chính tả...")

	# Handle the button click
	if st.button("Get Result"):
	    if context.strip():
	        try:
	            # Run the spelling-correction pipeline on the raw input
	            result = correct_spelling(context, max_length=MAX_LENGTH)
	            corrected_text = result[0]['generated_text'] if result else "No output generated."

	            # Tokenize both texts with underthesea so they are compared word by word
	            original_tokens = word_tokenize(context)
	            corrected_tokens = word_tokenize(corrected_text)

	            # Corrected text as HTML plus the positions of the changed words
	            highlighted_text, modified_indices = highlight_differences(original_tokens, corrected_tokens)

	            # Show the result
	            st.markdown(f"### Corrected Text (with highlighted words):\n\n{highlighted_text}", unsafe_allow_html=True)
	            st.markdown(f"### Modified Word Indices:\n\n{modified_indices}")
	        except Exception as e:
	            # Top-level UI boundary: report the failure on the page instead of crashing the app
	            st.error(f"An error occurred: {e}")
	    else:
	        st.warning("Please input some text to process!")