import re
from typing import List, Set

try:
    from underthesea import word_tokenize, pos_tag

    UNDERTHESEA_AVAILABLE = True
except ImportError:
    UNDERTHESEA_AVAILABLE = False
    print("[WARNING] underthesea not available, falling back to basic tokenization")


class VietnameseTextProcessor:
    """Vietnamese text processing utilities for ViettelPay knowledge base"""

    def __init__(self):
        # Keywords by document type
        self.keyword_mappings = {
            "error": "lỗi, error code, mã lỗi, sự cố, problem, thất bại, failed, hệ thống, system, maintenance, bảo trì, nâng cấp, upgrade",
            "procedure": "hướng dẫn, guide, instruction, bước, step, quy trình, process, nạp cước, topup, recharge, mua, buy, purchase, chọn, select, bấm, click",
            "definition": "định nghĩa, definition, nghĩa là, meaning, khái niệm, concept, giải thích, explain",
            "policy": "quy định, policy, rule, chính sách, regulation, hủy, cancel, phí, fee, chiết khấu, discount",
            "reference": "bảng, table, danh sách, list, thông tin, information, chi tiết, detail",
        }

        # Vietnamese stop words
        self.vietnamese_stop_words = self._load_vietnamese_stop_words()

        # Keep important domain terms even if they appear in stop words
        self.domain_important_terms = {
            "lỗi", "error", "mã", "code", "bước", "step", "hướng", "dẫn", "guide",
            "thanh", "toán", "payment", "nạp", "cước", "topup", "mua", "buy",
            "viettel", "viettelpay", "app", "ứng", "dụng", "mobile", "thẻ", "card",
            "tiền", "money", "rút", "withdraw", "chuyển", "transfer",
        }

    def _load_vietnamese_stop_words(self) -> Set[str]:
        """Load Vietnamese stop words"""
        # Common Vietnamese stop words
        stop_words = {
            "và", "của", "có", "là", "được", "các", "một", "này", "cho", "với",
            "trong", "từ", "tại", "về", "như", "sau", "trước", "khi", "nếu", "để",
            "đã", "sẽ", "đang", "bị", "bởi", "theo", "những", "nhưng", "mà", "thì",
            "cũng", "hay", "hoặc", "nên", "phải", "rất", "lại", "chỉ", "đó", "đây",
            "kia", "nào", "ai", "gì", "sao", "đâu", "bao", "nhiều", "lắm", "hơn",
            "nhất", "cả", "tất", "mọi", "toàn", "chưa", "không", "chẳng", "vẫn",
            "còn", "đều", "cùng", "nhau", "riêng", "luôn", "ngay", "liền", "thêm",
            "nữa", "lần", "cuối", "đầu", "giữa", "ngoài", "trên", "dưới", "bên",
            "cạnh", "gần", "xa", "cao", "thấp",
        }

        # Add English stop words that might appear
        english_stops = {
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
            "of", "with", "by", "is", "are", "was", "were", "be", "been", "have",
            "has", "had", "do", "does", "did", "will", "would", "could", "should",
            "may", "might", "can", "this", "that", "these", "those",
        }

        return stop_words.union(english_stops)

    def vietnamese_tokenize(self, text: str) -> List[str]:
        """Vietnamese word tokenization using underthesea or fallback"""
        if not text:
            return []

        if UNDERTHESEA_AVAILABLE:
            try:
                # Use underthesea for proper Vietnamese word segmentation;
                # format="text" joins compound words into single tokens
                tokenized_text = word_tokenize(text, format="text")
                return tokenized_text.split()
            except Exception as e:
                print(
                    f"[WARNING] underthesea tokenization failed: {e}, falling back to basic"
                )

        # Fallback: basic whitespace tokenization
        # (does not segment Vietnamese compound words)
        tokens = text.split()
        return [token.strip() for token in tokens if token.strip()]

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Remove Vietnamese stop words while preserving domain terms"""
        filtered_tokens = []
        for token in tokens:
            # Always keep domain-important terms
            if token.lower() in self.domain_important_terms:
                filtered_tokens.append(token)
            # Keep numbers and error codes (e.g. "606", "W02")
            elif re.match(r"^\d+$", token) or re.match(r"^[A-Z]\d+$", token):
                filtered_tokens.append(token)
            # Drop stop words, keep everything else
            elif token.lower() not in self.vietnamese_stop_words:
                filtered_tokens.append(token)
        return filtered_tokens

    def normalize_text_for_bm25(self, text: str) -> str:
        """Enhanced Vietnamese normalization for BM25"""
        if not text:
            return ""

        # Basic normalization
        normalized = text.lower().strip()

        # Vietnamese tokenization
        tokens = self.vietnamese_tokenize(normalized)

        # Remove stop words but keep domain terms
        tokens = self.remove_stop_words(tokens)

        # Filter out very short tokens (but keep numbers and codes)
        tokens = [
            token
            for token in tokens
            if len(token) >= 2
            or token.isdigit()
            or re.match(r"^[A-Z]\d+$", token.upper())
        ]

        # Join back into a single normalized string
        return " ".join(tokens)

    def bm25_tokenizer(self, text: str) -> List[str]:
        """Tokenize text into a BM25-ready token list (same pipeline as
        normalize_text_for_bm25, but returns the tokens instead of a string)"""
        if not text:
            return []

        # Basic normalization
        normalized = text.lower().strip()

        # Vietnamese tokenization
        tokens = self.vietnamese_tokenize(normalized)

        # Remove stop words but keep domain terms
        tokens = self.remove_stop_words(tokens)

        # Filter out very short tokens (but keep numbers and codes)
        tokens = [
            token
            for token in tokens
            if len(token) >= 2
            or token.isdigit()
            or re.match(r"^[A-Z]\d+$", token.upper())
        ]

        return tokens

    def enhance_for_bm25(
        self,
        content: str,
        doc_type: str,
        additional_keywords: str = "",
    ) -> str:
        """Enhanced content processing for BM25 with Vietnamese preprocessing"""
        # Only use document-type specific keywords (no generic base keywords)
        type_specific_keywords = self.keyword_mappings.get(doc_type, "")

        enhanced_content = f"""
        {type_specific_keywords}
        {additional_keywords}
        {content}
        """

        return self.normalize_text_for_bm25(enhanced_content)

    def extract_error_code_variations(self, error_code: str) -> str:
        """Generate variations of error codes for better BM25 matching"""
        if not error_code:
            return ""

        variations = [error_code]

        # Add common Vietnamese variations
        if error_code.isdigit():
            # For numeric codes like "606"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã {error_code}",
                    f"code {error_code}",
                    f"mã lỗi {error_code}",
                ]
            )
        else:
            # For alphanumeric codes like "W02", "BL2"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã lỗi {error_code}",
                    f"code {error_code}",
                ]
            )

        return " ".join(variations)

    def extract_steps_keywords(self, guide_text: str) -> str:
        """Extract step-related keywords from procedure text"""
        if not guide_text:
            return ""

        # Find step patterns such as "bước 1" or "b2"
        steps = re.findall(r"(?:bước|b)\s*\d+", guide_text, re.IGNORECASE)
        step_keywords = " ".join(steps)

        # Add common procedure keywords
        procedure_keywords = (
            "step bước instruction hướng dẫn guide quy trình process thao tác action"
        )

        return f"{step_keywords} {procedure_keywords}"

    def clean_column_name(self, column_name: str) -> str:
        """Clean column names by removing extra whitespace and newlines"""
        if not column_name:
            return ""

        # Collapse newlines and runs of whitespace into single spaces
        cleaned = re.sub(r"\s+", " ", column_name.strip())
        return cleaned
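

# ---------------------------------------------------------------------------
# Example: wiring the processor into a BM25 index. This is a minimal sketch,
# not part of the processor itself. It assumes the third-party `rank_bm25`
# package (BM25Okapi) is installed; `docs` and `doc_types` are illustrative
# placeholders, and this helper is hypothetical rather than part of the
# ViettelPay codebase.
# ---------------------------------------------------------------------------
def build_bm25_index(docs: List[str], doc_types: List[str], processor: "VietnameseTextProcessor"):
    """Hypothetical helper: enhance each document, then build a BM25Okapi index."""
    from rank_bm25 import BM25Okapi  # assumed dependency

    # enhance_for_bm25 returns a normalized string; BM25Okapi expects a
    # pre-tokenized corpus, so split each enhanced document into tokens
    corpus = [
        processor.enhance_for_bm25(content, doc_type).split()
        for content, doc_type in zip(docs, doc_types)
    ]
    return BM25Okapi(corpus)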
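

if __name__ == "__main__":
    # Minimal usage sketch; the sample strings below are illustrative only.
    processor = VietnameseTextProcessor()

    # Tokenization and BM25 normalization
    sample = "Hướng dẫn nạp cước điện thoại trên ứng dụng ViettelPay"
    print(processor.vietnamese_tokenize(sample))
    print(processor.normalize_text_for_bm25(sample))
    print(processor.bm25_tokenizer(sample))

    # Error-code expansion for better keyword matching
    print(processor.extract_error_code_variations("606"))
    print(processor.extract_error_code_variations("W02"))

    # Step keyword extraction from a procedure snippet
    print(processor.extract_steps_keywords("Bước 1: mở app. Bước 2: chọn nạp cước."))

    # Scoring a query against a tiny index (uncomment if rank_bm25 is
    # installed; see the build_bm25_index sketch above)
    # bm25 = build_bm25_index(
    #     ["Lỗi 606: hệ thống đang bảo trì", "Hướng dẫn mua thẻ cào"],
    #     ["error", "procedure"],
    #     processor,
    # )
    # print(bm25.get_scores(processor.bm25_tokenizer("lỗi 606")))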