import re
from typing import List, Set

try:
    from underthesea import word_tokenize, pos_tag

    UNDERTHESEA_AVAILABLE = True
except ImportError:
    UNDERTHESEA_AVAILABLE = False
    print("[WARNING] underthesea not available, falling back to basic tokenization")

class VietnameseTextProcessor:
    """Vietnamese text processing utilities for the ViettelPay knowledge base"""

    def __init__(self):
        # Keywords by document type (Vietnamese terms with English equivalents)
        self.keyword_mappings = {
            "error": "lỗi, error code, mã lỗi, sự cố, problem, thất bại, failed, hệ thống, system, maintenance, bảo trì, nâng cấp, upgrade",
            "procedure": "hướng dẫn, guide, instruction, bước, step, quy trình, process, nạp cước, topup, recharge, mua, buy, purchase, chọn, select, bấm, click",
            "definition": "định nghĩa, definition, nghĩa là, meaning, khái niệm, concept, giải thích, explain",
            "policy": "quy định, policy, rule, chính sách, regulation, hủy, cancel, phí, fee, chiết khấu, discount",
            "reference": "bảng, table, danh sách, list, thông tin, information, chi tiết, detail",
        }

        # Vietnamese stop words
        self.vietnamese_stop_words = self._load_vietnamese_stop_words()

        # Keep important domain terms even if they appear in stop words
        self.domain_important_terms = {
            "lỗi", "error", "mã", "code", "bước", "step", "hướng", "dẫn",
            "guide", "thanh", "toán", "payment", "nạp", "cước", "topup",
            "mua", "buy", "viettel", "viettelpay", "app", "ứng", "dụng",
            "mobile", "thẻ", "card", "tiền", "money", "rút", "withdraw",
            "chuyển", "transfer",
        }

    def _load_vietnamese_stop_words(self) -> Set[str]:
        """Load Vietnamese stop words"""
        # Common Vietnamese stop words
        stop_words = {
            "và", "của", "có", "là", "được", "các", "một", "này", "cho",
            "với", "trong", "từ", "tại", "về", "như", "sau", "trước", "khi",
            "nếu", "để", "đã", "sẽ", "đang", "bị", "bởi", "theo", "những",
            "nhưng", "mà", "thì", "cũng", "hay", "hoặc", "nên", "phải",
            "rất", "lại", "chỉ", "đó", "đây", "kia", "nào", "ai", "gì",
            "sao", "đâu", "bao", "nhiều", "lắm", "hơn", "nhất", "cả", "tất",
            "mọi", "toàn", "chưa", "không", "chẳng", "vẫn", "còn", "đều",
            "cùng", "nhau", "riêng", "luôn", "ngay", "liền", "thêm", "nữa",
            "lần", "cuối", "đầu", "giữa", "ngoài", "trên", "dưới", "bên",
            "cạnh", "gần", "xa", "cao", "thấp",
        }

        # Add English stop words that might appear in mixed-language content
        english_stops = {
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
            "for", "of", "with", "by", "is", "are", "was", "were", "be",
            "been", "have", "has", "had", "do", "does", "did", "will",
            "would", "could", "should", "may", "might", "can", "this",
            "that", "these", "those",
        }

        return stop_words.union(english_stops)

    def vietnamese_tokenize(self, text: str) -> List[str]:
        """Vietnamese word tokenization using underthesea, with a basic fallback"""
        if not text:
            return []

        if UNDERTHESEA_AVAILABLE:
            try:
                # Use underthesea for proper Vietnamese word segmentation;
                # format="text" returns one string with compounds joined
                tokenized_text = word_tokenize(text, format="text")
                return tokenized_text.split()
            except Exception as e:
                print(
                    f"[WARNING] underthesea tokenization failed: {e}, falling back to basic"
                )

        # Fallback: plain whitespace splitting (no compound-word segmentation)
        tokens = text.split()
        return [token.strip() for token in tokens if token.strip()]
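
    # Usage sketch (hedged): with the whitespace fallback this is deterministic;
    # with underthesea installed, multi-syllable compounds are typically joined
    # with underscores, so the exact segmentation depends on its model.
    #   >>> proc = VietnameseTextProcessor()
    #   >>> proc.vietnamese_tokenize("nạp tiền điện thoại")   # fallback path
    #   ['nạp', 'tiền', 'điện', 'thoại']
    #   # with underthesea, likely: ['nạp', 'tiền', 'điện_thoại']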

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Remove Vietnamese stop words while preserving domain terms"""
        filtered_tokens = []
        for token in tokens:
            # Always keep domain-important terms
            if token.lower() in self.domain_important_terms:
                filtered_tokens.append(token)
            # Keep numbers and error codes such as "606", "W02", or "BL2";
            # matching on token.upper() also keeps codes lowercased upstream
            elif token.isdigit() or re.match(r"^[A-Z]+\d+$", token.upper()):
                filtered_tokens.append(token)
            # Drop stop words, keep everything else
            elif token.lower() not in self.vietnamese_stop_words:
                filtered_tokens.append(token)
        return filtered_tokens
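
    # Usage sketch: "mã" and "lỗi" survive via domain_important_terms, "của"
    # is dropped as a stop word, and "w02" is kept as an error code.
    #   >>> proc.remove_stop_words(["mã", "lỗi", "của", "w02"])
    #   ['mã', 'lỗi', 'w02']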

    def normalize_text_for_bm25(self, text: str) -> str:
        """Enhanced Vietnamese normalization for BM25: lowercase, tokenize,
        drop stop words and very short tokens, then re-join into one string"""
        if not text:
            return ""

        # Basic normalization
        normalized = text.lower().strip()

        # Vietnamese tokenization
        tokens = self.vietnamese_tokenize(normalized)

        # Remove stop words but keep domain terms
        tokens = self.remove_stop_words(tokens)

        # Filter out very short tokens (but keep numbers and error codes)
        tokens = [
            token
            for token in tokens
            if len(token) >= 2
            or token.isdigit()
            or re.match(r"^[A-Z]+\d+$", token.upper())
        ]

        return " ".join(tokens)
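
    # Usage sketch (hedged; shown for the whitespace fallback tokenizer —
    # underthesea may additionally join compounds with underscores):
    #   >>> proc.normalize_text_for_bm25("Hướng dẫn nạp cước của Viettel")
    #   'hướng dẫn nạp cước viettel'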

    def bm25_tokenizer(self, text: str) -> List[str]:
        """Same pipeline as normalize_text_for_bm25, but returns the token
        list directly (e.g. for feeding a BM25 index)"""
        if not text:
            return []

        # Basic normalization
        normalized = text.lower().strip()

        # Vietnamese tokenization
        tokens = self.vietnamese_tokenize(normalized)

        # Remove stop words but keep domain terms
        tokens = self.remove_stop_words(tokens)

        # Filter out very short tokens (but keep numbers and error codes)
        tokens = [
            token
            for token in tokens
            if len(token) >= 2
            or token.isdigit()
            or re.match(r"^[A-Z]+\d+$", token.upper())
        ]

        return tokens
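
    # Usage sketch (hedged; fallback tokenizer — "khi" is dropped as a stop
    # word, "606" is kept as a numeric error code):
    #   >>> proc.bm25_tokenizer("Lỗi 606 khi nạp cước")
    #   ['lỗi', '606', 'nạp', 'cước']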

    def enhance_for_bm25(
        self, content: str, doc_type: str, additional_keywords: str = ""
    ) -> str:
        """Enhanced content processing for BM25 with Vietnamese preprocessing"""
        # Only use document-type specific keywords (no generic base keywords)
        type_specific_keywords = self.keyword_mappings.get(doc_type, "")

        # Prepend the keywords so they contribute to BM25 term frequencies
        enhanced_content = f"{type_specific_keywords} {additional_keywords}\n{content}"
        return self.normalize_text_for_bm25(enhanced_content)
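
    # Usage sketch: the "error"-type keyword string is prepended before
    # normalization, so terms like "lỗi" and "mã lỗi" appear in the output
    # even when the content itself does not mention them.
    #   >>> proc.enhance_for_bm25("Giao dịch thất bại", doc_type="error")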

    def extract_error_code_variations(self, error_code: str) -> str:
        """Generate variations of error codes for better BM25 matching"""
        if not error_code:
            return ""

        variations = [error_code]

        # Add common Vietnamese/English phrasings around the code
        if error_code.isdigit():
            # Numeric codes like "606"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã {error_code}",
                    f"code {error_code}",
                    f"mã lỗi {error_code}",
                ]
            )
        else:
            # Alphanumeric codes like "W02", "BL2"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã lỗi {error_code}",
                    f"code {error_code}",
                ]
            )

        return " ".join(variations)

    def extract_steps_keywords(self, guide_text: str) -> str:
        """Extract step-related keywords from procedure text"""
        if not guide_text:
            return ""

        # Find step markers such as "bước 1", "Bước 2", or the shorthand "b1"
        steps = re.findall(r"(?:bước|b)\s*\d+", guide_text, re.IGNORECASE)
        step_keywords = " ".join(steps)

        # Add common procedure keywords
        procedure_keywords = (
            "step bước instruction hướng dẫn guide quy trình process thao tác action"
        )
        return f"{step_keywords} {procedure_keywords}"
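
    # Usage sketch (deterministic given the regex above):
    #   >>> proc.extract_steps_keywords("Bước 1: mở app. Bước 2: chọn nạp cước.")
    #   'Bước 1 Bước 2 step bước instruction hướng dẫn guide quy trình process thao tác action'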

    def clean_column_name(self, column_name: str) -> str:
        """Clean column names by removing extra whitespace and newlines"""
        if not column_name:
            return ""

        # Collapse runs of whitespace (including newlines) into single spaces
        return re.sub(r"\s+", " ", column_name.strip())
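

if __name__ == "__main__":
    # Minimal smoke test (a sketch; the tokenized output will differ slightly
    # depending on whether underthesea is installed).
    proc = VietnameseTextProcessor()
    print(proc.bm25_tokenizer("Lỗi 606 khi nạp cước trên app ViettelPay"))
    print(proc.extract_error_code_variations("W02"))
    print(proc.clean_column_name("Tên\n  giao dịch"))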