Spaces:

minhan6559
/

viettelpay-chatbot

Running

File size: 10,239 Bytes

60d1d13

import re
from typing import List, Set

# Optional dependency: underthesea provides Vietnamese word segmentation.
# When it cannot be imported, UNDERTHESEA_AVAILABLE is set to False and a
# warning is printed once at import time.
try:
    from underthesea import word_tokenize, pos_tag

    UNDERTHESEA_AVAILABLE = True
except ImportError:
    # Record the missing dependency instead of failing the import of this module.
    UNDERTHESEA_AVAILABLE = False
    print("[WARNING] underthesea not available, falling back to basic tokenization")


class VietnameseTextProcessor:
    """Vietnamese text processing utilities for ViettelPay knowledge base.

    Provides tokenization, stop-word removal and keyword enrichment helpers
    that turn raw text into BM25-friendly strings or token lists.
    """

    # Matches purely numeric tokens ("606") or a single capital letter
    # followed by digits ("W02"); such tokens are never treated as stop words.
    _CODE_OR_NUMBER_RE = re.compile(r"^(?:\d+|[A-Z]\d+)$")

    # Matches step markers such as "Bước 1" or "b2". The leading \b prevents
    # matching a "b" that merely ends another word (e.g. "web 2.0").
    _STEP_RE = re.compile(r"\b(?:bước|b)\s*\d+", re.IGNORECASE)

    # Any run of whitespace, including newlines.
    _WHITESPACE_RE = re.compile(r"\s+")

    # Punctuation stripped from the edges of tokens by the fallback tokenizer.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}…“”‘’"

    # Prefixes used to build error-code variations (order is part of the output).
    _NUMERIC_CODE_PREFIXES = ("lỗi", "error", "mã", "code", "mã lỗi")
    _ALNUM_CODE_PREFIXES = ("lỗi", "error", "mã lỗi", "code")

    def __init__(self):
        # Keywords by document type
        self.keyword_mappings = {
            "error": "lỗi, error code, mã lỗi, sự cố, problem, thất bại, failed, hệ thống, system, maintenance, bảo trì, nâng cấp, upgrade",
            "procedure": "hướng dẫn, guide, instruction, bước, step, quy trình, process, nạp cước, topup, recharge, mua, buy, purchase, chọn, select, bấm, click",
            "definition": "định nghĩa, definition, nghĩa là, meaning, khái niệm, concept, giải thích, explain",
            "policy": "quy định, policy, rule, chính sách, regulation, hủy, cancel, phí, fee, chiết khấu, discount",
            "reference": "bảng, table, danh sách, list, thông tin, information, chi tiết, detail",
        }

        # Vietnamese (plus a few English) stop words
        self.vietnamese_stop_words = self._load_vietnamese_stop_words()

        # Keep important domain terms even if they appear in stop words
        self.domain_important_terms = {
            "lỗi", "error", "mã", "code",
            "bước", "step", "hướng", "dẫn", "guide",
            "thanh", "toán", "payment",
            "nạp", "cước", "topup",
            "mua", "buy",
            "viettel", "viettelpay",
            "app", "ứng", "dụng", "mobile",
            "thẻ", "card",
            "tiền", "money",
            "rút", "withdraw",
            "chuyển", "transfer",
        }

    def _load_vietnamese_stop_words(self) -> Set[str]:
        """Return the combined set of Vietnamese and English stop words."""
        # Common Vietnamese stop words
        vietnamese_stops = {
            "và", "của", "có", "là", "được", "các", "một", "này", "cho", "với",
            "trong", "từ", "tại", "về", "như", "sau", "trước", "khi", "nếu", "để",
            "đã", "sẽ", "đang", "bị", "bởi", "theo", "những", "nhưng", "mà", "thì",
            "cũng", "hay", "hoặc", "nên", "phải", "rất", "lại", "chỉ", "đó", "đây",
            "kia", "nào", "ai", "gì", "sao", "đâu", "bao", "nhiều", "lắm", "hơn",
            "nhất", "cả", "tất", "mọi", "toàn", "chưa", "không", "chẳng", "vẫn", "còn",
            "đều", "cùng", "nhau", "riêng", "luôn", "ngay", "liền", "thêm", "nữa", "lần",
            "cuối", "đầu", "giữa", "ngoài", "trên", "dưới", "bên", "cạnh", "gần", "xa",
            "cao", "thấp",
        }

        # English stop words that might appear in mixed-language content
        english_stops = {
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
            "for", "of", "with", "by", "is", "are", "was", "were", "be", "been",
            "have", "has", "had", "do", "does", "did", "will", "would", "could", "should",
            "may", "might", "can", "this", "that", "these", "those",
        }

        return vietnamese_stops | english_stops

    def _basic_tokenize(self, text: str) -> List[str]:
        """Split *text* on whitespace and strip punctuation from token edges.

        Used when underthesea is unavailable or fails. Punctuation inside a
        token (e.g. "10.000") is preserved; tokens that consist only of
        punctuation are dropped.
        """
        stripped = (token.strip(self._EDGE_PUNCTUATION) for token in text.split())
        return [token for token in stripped if token]

    def vietnamese_tokenize(self, text: str) -> List[str]:
        """Tokenize *text* with underthesea, falling back to basic splitting.

        Args:
            text: Raw text to tokenize.

        Returns:
            A list of tokens; an empty list for empty input.
        """
        if not text:
            return []

        if UNDERTHESEA_AVAILABLE:
            try:
                # NOTE(review): underthesea's "text" format appears to join the
                # syllables of a compound word with "_" — single-syllable
                # entries in the stop-word / domain-term sets would then not
                # match such tokens; confirm this is intended.
                return word_tokenize(text, format="text").split()
            except Exception as e:
                # Deliberate best-effort: any tokenizer failure degrades to
                # the basic tokenizer instead of aborting processing.
                print(
                    f"[WARNING] underthesea tokenization failed: {e}, falling back to basic"
                )

        return self._basic_tokenize(text)

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Remove stop words while preserving domain terms, numbers and codes.

        Args:
            tokens: Tokens to filter; their original casing is preserved.

        Returns:
            The tokens that are domain-important terms, numeric tokens or
            error codes, or are not stop words (matching is case-insensitive).
        """
        kept = []
        for token in tokens:
            lowered = token.lower()
            if (
                lowered in self.domain_important_terms
                or self._CODE_OR_NUMBER_RE.match(token)
                or lowered not in self.vietnamese_stop_words
            ):
                kept.append(token)
        return kept

    def normalize_text_for_bm25(self, text: str) -> str:
        """Return *text* normalized for BM25 as a single space-joined string.

        Applies the same pipeline as :meth:`bm25_tokenizer` and joins the
        resulting tokens with single spaces. Empty input yields "".
        """
        if not text:
            return ""
        return " ".join(self.bm25_tokenizer(text))

    def bm25_tokenizer(self, text: str) -> List[str]:
        """Tokenize *text* for BM25 indexing or querying.

        The text is lower-cased and stripped, tokenized, filtered for stop
        words, and finally stripped of single-character tokens unless they
        are digits.

        Args:
            text: Raw document or query text.

        Returns:
            The list of normalized tokens; an empty list for empty input.
        """
        if not text:
            return []

        lowered = text.lower().strip()
        tokens = self.remove_stop_words(self.vietnamese_tokenize(lowered))

        # Drop very short tokens but keep single digits. Letter+digit codes
        # are always at least two characters long, so they pass the length
        # check without a separate rule.
        return [token for token in tokens if len(token) >= 2 or token.isdigit()]

    def enhance_for_bm25(
        self,
        content: str,
        doc_type: str,
        additional_keywords: str = "",
    ) -> str:
        """Prepend type-specific keywords to *content* and normalize for BM25.

        Args:
            content: The document text.
            doc_type: Key into ``keyword_mappings``; unknown types add no
                type-specific keywords.
            additional_keywords: Extra keywords to include before the content.

        Returns:
            The normalized, space-joined text.
        """
        # Only use document-type specific keywords (no generic base keywords)
        type_specific_keywords = self.keyword_mappings.get(doc_type, "")

        enhanced_content = f"""
        {type_specific_keywords} {additional_keywords}
        {content}
        """

        return self.normalize_text_for_bm25(enhanced_content)

    def extract_error_code_variations(self, error_code: str) -> str:
        """Generate variations of an error code for better BM25 matching.

        Args:
            error_code: A numeric ("606") or alphanumeric ("W02") code.

        Returns:
            The code followed by prefixed variants (e.g. "lỗi 606",
            "error 606"), joined by spaces; "" for empty input.
        """
        if not error_code:
            return ""

        # Numeric codes additionally get the bare "mã <code>" variant.
        if error_code.isdigit():
            prefixes = self._NUMERIC_CODE_PREFIXES
        else:
            prefixes = self._ALNUM_CODE_PREFIXES

        variations = [error_code]
        variations.extend(f"{prefix} {error_code}" for prefix in prefixes)
        return " ".join(variations)

    def extract_steps_keywords(self, guide_text: str) -> str:
        """Extract step markers from procedure text and add generic keywords.

        Args:
            guide_text: Procedure / guide text that may contain markers such
                as "Bước 1" or "b2".

        Returns:
            The step markers found (space-joined), followed by a space and a
            fixed list of procedure keywords; "" for empty input.
        """
        if not guide_text:
            return ""

        step_keywords = " ".join(self._STEP_RE.findall(guide_text))

        # Common procedure keywords appended to every guide
        procedure_keywords = (
            "step bước instruction hướng dẫn guide quy trình process thao tác action"
        )

        return f"{step_keywords} {procedure_keywords}"

    def clean_column_name(self, column_name: str) -> str:
        """Collapse whitespace runs (including newlines) in a column name.

        Leading and trailing whitespace is removed and every internal run of
        whitespace becomes a single space. Empty input yields "".
        """
        if not column_name:
            return ""

        return self._WHITESPACE_RE.sub(" ", column_name.strip())