import re
from typing import List, Set

try:
    from underthesea import word_tokenize, pos_tag

    UNDERTHESEA_AVAILABLE = True
except ImportError:
    UNDERTHESEA_AVAILABLE = False
    print("[WARNING] underthesea not available, falling back to basic tokenization")


class VietnameseTextProcessor:
    """Vietnamese text processing utilities for ViettelPay knowledge base"""

    def __init__(self):
        # Keywords by document type
        self.keyword_mappings = {
            "error": "lỗi, error code, mã lỗi, sự cố, problem, thất bại, failed, hệ thống, system, maintenance, bảo trì, nâng cấp, upgrade",
            "procedure": "hướng dẫn, guide, instruction, bước, step, quy trình, process, nạp cước, topup, recharge, mua, buy, purchase, chọn, select, bấm, click",
            "definition": "định nghĩa, definition, nghĩa là, meaning, khái niệm, concept, giải thích, explain",
            "policy": "quy định, policy, rule, chính sách, regulation, hủy, cancel, phí, fee, chiết khấu, discount",
            "reference": "bảng, table, danh sách, list, thông tin, information, chi tiết, detail",
        }

        # Vietnamese stop words
        self.vietnamese_stop_words = self._load_vietnamese_stop_words()

        # Keep important domain terms even if they appear in stop words
        self.domain_important_terms = {
            "lỗi", "error", "mã", "code", "bước", "step", "hướng", "dẫn", "guide",
            "thanh", "toán", "payment", "nạp", "cước", "topup", "mua", "buy",
            "viettel", "viettelpay", "app", "ứng", "dụng", "mobile", "thẻ", "card",
            "tiền", "money", "rút", "withdraw", "chuyển", "transfer",
        }

    def _load_vietnamese_stop_words(self) -> Set[str]:
        """Load Vietnamese stop words"""
        # Common Vietnamese stop words (deduplicated)
        stop_words = {
            "và", "của", "có", "là", "được", "các", "một", "này", "cho", "với",
            "trong", "từ", "tại", "về", "như", "sau", "trước", "khi", "nếu", "để",
            "đã", "sẽ", "đang", "bị", "bởi", "theo", "những", "nhưng", "mà", "thì",
            "cũng", "hay", "hoặc", "nên", "phải", "rất", "lại", "chỉ", "đó", "đây",
            "kia", "nào", "ai", "gì", "sao", "đâu", "bao", "nhiều", "lắm", "hơn",
            "nhất", "cả", "tất", "mọi", "toàn", "chưa", "không", "chẳng", "vẫn",
            "còn", "đều", "cùng", "nhau", "riêng", "luôn", "ngay", "liền", "thêm",
            "nữa", "lần", "cuối", "đầu", "giữa", "ngoài", "trên", "dưới", "bên",
            "cạnh", "gần", "xa", "cao", "thấp",
        }

        # Add English stop words that might appear in mixed-language content
        english_stops = {
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
            "for", "of", "with", "by", "is", "are", "was", "were", "be", "been",
            "have", "has", "had", "do", "does", "did", "will", "would", "could",
            "should", "may", "might", "can", "this", "that", "these", "those",
        }

        return stop_words.union(english_stops)

    def vietnamese_tokenize(self, text: str) -> List[str]:
        """Vietnamese word tokenization using underthesea or fallback"""
        if not text:
            return []

        if UNDERTHESEA_AVAILABLE:
            try:
                # Use underthesea for proper Vietnamese tokenization
                tokenized_text = word_tokenize(text, format="text")

                return tokenized_text.split()
            except Exception as e:
                print(
                    f"[WARNING] underthesea tokenization failed: {e}, falling back to basic"
                )

        # Fallback: basic whitespace tokenization (no compound-word segmentation)
        tokens = text.split()
        return [token.strip() for token in tokens if token.strip()]
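
    # Illustrative note (typical underthesea behavior, not pinned to a specific
    # version): word_tokenize(text, format="text") joins multi-syllable words with
    # underscores, e.g. "nạp tiền điện thoại" -> "nạp tiền điện_thoại", so the
    # .split() above keeps compound Vietnamese words as single tokens.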

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Remove Vietnamese stop words while preserving domain terms"""
        filtered_tokens = []

        for token in tokens:
            # Always keep domain-important terms
            if token.lower() in self.domain_important_terms:
                filtered_tokens.append(token)
            # Keep numbers and error codes (e.g., "606", "W02", "BL2")
            elif re.match(r"^\d+$", token) or re.match(
                r"^[A-Z]+\d+$", token, re.IGNORECASE
            ):
                filtered_tokens.append(token)
            # Remove stop words
            elif token.lower() not in self.vietnamese_stop_words:
                filtered_tokens.append(token)

        return filtered_tokens

    def normalize_text_for_bm25(self, text: str) -> str:
        """Enhanced Vietnamese normalization for BM25: tokenized text joined into a string"""
        return " ".join(self.bm25_tokenizer(text))

    def bm25_tokenizer(self, text: str) -> List[str]:
        """Tokenize text for BM25: lowercase, segment Vietnamese words, drop stop words"""
        if not text:
            return []

        # Basic normalization
        normalized = text.lower().strip()

        # Vietnamese tokenization
        tokens = self.vietnamese_tokenize(normalized)

        # Remove stop words but keep domain terms
        tokens = self.remove_stop_words(tokens)

        # Filter out very short tokens (but keep numbers and error codes)
        tokens = [
            token
            for token in tokens
            if len(token) >= 2
            or token.isdigit()
            or re.match(r"^[A-Z]+\d+$", token.upper())
        ]

        return tokens

    def enhance_for_bm25(
        self,
        content: str,
        doc_type: str,
        additional_keywords: str = "",
    ) -> str:
        """Enhanced content processing for BM25 with Vietnamese preprocessing"""
        # Only use document-type specific keywords (no generic base keywords)
        type_specific_keywords = self.keyword_mappings.get(doc_type, "")

        enhanced_content = f"""
        {type_specific_keywords} {additional_keywords}
        {content}
        """

        return self.normalize_text_for_bm25(enhanced_content)

    def extract_error_code_variations(self, error_code: str) -> str:
        """Generate variations of error codes for better BM25 matching"""
        if not error_code:
            return ""

        variations = [error_code]

        # Add common Vietnamese variations
        if error_code.isdigit():
            # For numeric codes like "606"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã {error_code}",
                    f"code {error_code}",
                    f"mã lỗi {error_code}",
                ]
            )
        else:
            # For alphanumeric codes like "W02", "BL2"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã lỗi {error_code}",
                    f"code {error_code}",
                ]
            )

        return " ".join(variations)

    def extract_steps_keywords(self, guide_text: str) -> str:
        """Extract step-related keywords from procedure text"""
        if not guide_text:
            return ""

        # Find step patterns
        steps = re.findall(r"(?:bước|b)\s*\d+", guide_text, re.IGNORECASE)
        step_keywords = " ".join(steps)

        # Add common procedure keywords
        procedure_keywords = (
            "step bước instruction hướng dẫn guide quy trình process thao tác action"
        )

        return f"{step_keywords} {procedure_keywords}"

    def clean_column_name(self, column_name: str) -> str:
        """Clean column names by removing extra whitespace and newlines"""
        if not column_name:
            return ""

        # Remove newlines and extra spaces
        cleaned = re.sub(r"\s+", " ", column_name.strip())

        return cleaned
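

# Minimal usage sketch (assumptions: this module runs as-is; the sample strings
# below are illustrative, not taken from the actual ViettelPay knowledge base).
if __name__ == "__main__":
    processor = VietnameseTextProcessor()

    # Tokenize and normalize a sample query for BM25 indexing
    query = "Hướng dẫn nạp cước điện thoại bị lỗi 606"
    print(processor.bm25_tokenizer(query))
    print(processor.normalize_text_for_bm25(query))

    # Expand an error code into searchable variations
    print(processor.extract_error_code_variations("W02"))

    # Enhance a document chunk with type-specific keywords before indexing
    chunk = "Bước 1: chọn dịch vụ nạp cước. Bước 2: nhập số điện thoại."
    print(processor.enhance_for_bm25(chunk, doc_type="procedure"))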