fisherman611 committed
Commit 7c19a8b · verified · 1 Parent(s): e62c1c2

Update utils/text_processor.py

Files changed (1)
  1. utils/text_processor.py +44 -3
utils/text_processor.py CHANGED
@@ -78,10 +78,51 @@ class VietnameseTextProcessor:
         return [token for token in tokens if token.lower() not in self.stopwords]
 
     def preprocess_for_search(self, text: str) -> str:
-        """Preprocess text for search - tokenize and remove stopwords"""
+        """Preprocess text for search - tokenize and remove stopwords with legal term preservation"""
+        # First, preserve important legal patterns and identifiers
+        preserved_patterns = []
+
+        # Preserve legal document IDs (e.g., "47/2011/tt-bca", "159/2020/nđ-cp")
+        legal_id_pattern = r'\d+/\d+/[a-z\-]+'
+        legal_ids = re.findall(legal_id_pattern, text, re.IGNORECASE)
+        for legal_id in legal_ids:
+            placeholder = f"LEGALID_{len(preserved_patterns)}"
+            preserved_patterns.append((placeholder, legal_id))
+            text = text.replace(legal_id, placeholder)
+
+        # Preserve important legal terms and phrases
+        legal_terms = [
+            r'điều\s+\d+',    # "điều 15", "điều 20"
+            r'khoản\s+\d+',   # "khoản 1", "khoản 2"
+            r'điểm\s+[a-z]',  # "điểm a", "điểm b"
+            r'nghị\s+định',
+            r'thông\s+tư',
+            r'quyết\s+định',
+            r'luật\s+\w+',
+            r'vi\s+phạm',
+            r'xử\s+phạt',
+            r'mức\s+phạt',
+        ]
+
+        for pattern in legal_terms:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            for match in matches:
+                placeholder = f"LEGALTERM_{len(preserved_patterns)}"
+                preserved_patterns.append((placeholder, match))
+                text = text.replace(match, placeholder)
+
+        # Normal tokenization and stopword removal
         tokens = self.tokenize(text)
         filtered_tokens = self.remove_stopwords(tokens)
-        return " ".join(filtered_tokens)
+
+        # Reconstruct text
+        processed_text = " ".join(filtered_tokens)
+
+        # Restore preserved patterns
+        for placeholder, original in preserved_patterns:
+            processed_text = processed_text.replace(placeholder, original)
+
+        return processed_text
 
     def extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
         """Extract keywords from text"""
@@ -107,4 +148,4 @@ class VietnameseTextProcessor:
             if chunk_tokens:
                 chunks.append(" ".join(chunk_tokens))
 
-        return chunks
+        return chunks
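
What the commit does: preprocess_for_search now round-trips legal references through tokenization by swapping them for opaque placeholders (LEGALID_n, LEGALTERM_n), tokenizing and removing stopwords, and then swapping the originals back, so identifiers like "47/2011/tt-bca" and phrases like "điều 15" survive search preprocessing intact. The added code assumes `re` is imported at the top of utils/text_processor.py. Below is a minimal self-contained sketch of the same round-trip idea; the whitespace tokenizer and the tiny stopword set are stand-ins for the processor's real self.tokenize / self.remove_stopwords, which this diff does not show.

import re

# Stand-ins (assumptions, not from the diff): the real class uses its own
# Vietnamese tokenizer and stopword list.
STOPWORDS = {"là", "của", "và", "theo"}

def tokenize(text: str) -> list[str]:
    return text.split()

def preprocess_for_search(text: str) -> str:
    preserved = []

    # Swap legal document IDs (e.g., "47/2011/tt-bca") for opaque placeholders
    # so tokenization cannot split them on "/" or "-".
    for legal_id in re.findall(r'\d+/\d+/[a-z\-]+', text, re.IGNORECASE):
        placeholder = f"LEGALID_{len(preserved)}"
        preserved.append((placeholder, legal_id))
        text = text.replace(legal_id, placeholder)

    # Tokenize and drop stopwords; the placeholders pass through untouched.
    tokens = [t for t in tokenize(text) if t.lower() not in STOPWORDS]
    processed = " ".join(tokens)

    # Restore the original spans.
    for placeholder, original in preserved:
        processed = processed.replace(placeholder, original)
    return processed

print(preprocess_for_search("mức phạt theo 47/2011/tt-bca là của điều 15"))
# -> "mức phạt 47/2011/tt-bca điều 15"

One design note: the round trip only works if tokenize and remove_stopwords leave the LEGALID_/LEGALTERM_ tokens intact; a tokenizer that lowercases its output or splits on underscores would leave unrestored placeholders in the result. Pattern order also matters, since str.replace rewrites every occurrence of the matched substring, not just the span the regex found.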