# src/utils.py
import json
import re
from collections import Counter


def tokenize(text):
    """
    Simple tokenizer: lowercases the input and extracts runs of word
    characters (letters, digits, underscore), discarding whitespace
    and punctuation.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens
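

# Example: tokenize("Hello, World! 42") -> ['hello', 'world', '42']
# Note that \w matches letters, digits, and underscores, so numbers survive.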


def build_vocab(tokenized_texts, min_freq=2):
    """
    Builds a vocabulary dict mapping each token to a unique integer id.
    Ids 0 and 1 are reserved for the <PAD> and <UNK> special tokens;
    tokens appearing fewer than `min_freq` times are excluded.
    """
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)  # next unused id
    return vocab
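

# Illustrative sketch (not part of the original module): how a vocab built
# above is typically consumed, with <UNK> as the fallback id for tokens
# that were filtered out by `min_freq` or never seen at all.
def encode(tokens, vocab):
    """Map a list of tokens to integer ids, using <UNK> for unknown tokens."""
    unk_id = vocab['<UNK>']
    return [vocab.get(token, unk_id) for token in tokens]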


def save_vocab(vocab, filepath='vocab.json'):
    """
    Saves the vocabulary dictionary to a JSON file.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)
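

# On disk the vocabulary is plain, human-readable JSON, e.g.:
#     {"<PAD>": 0, "<UNK>": 1, "the": 2, ...}
# (ensure_ascii=False keeps non-ASCII tokens readable instead of \u-escaping them.)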


def load_vocab(filepath='vocab.json'):
    """
    Loads the vocabulary dictionary from a JSON file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
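

# Added usage sketch (not in the original file): exercises the full
# tokenize -> build_vocab -> save/load round trip on illustrative texts.
if __name__ == '__main__':
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "The dog barks at the quick fox.",
    ]
    tokenized = [tokenize(t) for t in texts]
    vocab = build_vocab(tokenized, min_freq=2)
    # With min_freq=2 only 'the', 'quick', 'fox', and 'dog' recur often
    # enough to be kept, so vocab is
    # {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'quick': 3, 'fox': 4, 'dog': 5}.
    save_vocab(vocab, 'vocab.json')
    assert load_vocab('vocab.json') == vocab  # JSON round trip is lossless here
    print(encode(tokenize("The purple fox"), vocab))  # [2, 1, 4]: 'purple' -> <UNK>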