"""
Word Similarity Engine using Dictionary Embeddings

Reads a dictionary of words from a CSV file, creates an embedding for every
word, and provides similarity search functionality.
"""

import csv
import os
from typing import List, Optional, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class WordSimilarityEngine:
    def __init__(self, cache_dir: Optional[str] = None):
        """Initialize the word similarity engine.

        Args:
            cache_dir: Directory in which to cache the embedding model.
        """
        if cache_dir is None:
            cache_dir = os.path.join(os.path.dirname(__file__), 'model_cache')

        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
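
        # all-mpnet-base-v2 is a general-purpose sentence-embedding model that
        # maps each input string to a 768-dimensional vector; single words are
        # simply treated as very short inputs.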
        print("Loading embedding model...")
        self.model = SentenceTransformer(
            'sentence-transformers/all-mpnet-base-v2',
            cache_folder=cache_dir
        )
        print("Model loaded successfully.")

        self.words = self._load_dictionary()
        print(f"Loaded {len(self.words)} words from dictionary.")

        print("Creating embeddings for all words...")
        self.embeddings = self._create_embeddings()
        print("Embeddings created successfully.")

    def _load_dictionary(self) -> List[str]:
        """Load words from the dictionary CSV file."""
        dict_path = os.path.join(os.path.dirname(__file__), 'dict-words', 'dict.csv')
        words = []

        try:
            with open(dict_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    word = row['word'].strip().lower()
                    # Skip empty entries and single characters.
                    if word and len(word) > 1:
                        words.append(word)
        except FileNotFoundError:
            raise FileNotFoundError(f"Dictionary file not found: {dict_path}") from None
        except Exception as e:
            raise RuntimeError(f"Error reading dictionary: {e}") from e

        return words

    def _create_embeddings(self) -> np.ndarray:
        """Create embeddings for all dictionary words."""
        batch_size = 256
        all_embeddings = []

        for i in range(0, len(self.words), batch_size):
            batch_words = self.words[i:i + batch_size]
            batch_embeddings = self.model.encode(
                batch_words,
                convert_to_tensor=False,
                # Only the first batch shows a progress bar, to keep output short.
                show_progress_bar=(i == 0)
            )
            all_embeddings.append(batch_embeddings)

        return np.vstack(all_embeddings)

    def find_similar_words(self, word: str, top_k: int = 10,
                           min_similarity: float = 0.3) -> List[Tuple[str, float]]:
        """Find words similar to the input word.

        Args:
            word: Input word to find similarities for.
            top_k: Maximum number of similar words to return.
            min_similarity: Minimum cosine similarity threshold.

        Returns:
            List of (word, similarity_score) tuples sorted by similarity,
            highest first.
        """
        word = word.strip().lower()

        if word not in self.words:
            print(f"Warning: '{word}' not found in dictionary. Computing similarity anyway...")

        input_embedding = self.model.encode([word])
        similarities = cosine_similarity(input_embedding, self.embeddings)[0]

        # Indices of dictionary words, ordered from most to least similar.
        similar_indices = np.argsort(similarities)[::-1]

        results = []
        for idx in similar_indices:
            similarity_score = float(similarities[idx])
            similar_word = self.words[idx]

            # Scores are in descending order, so nothing past this point can
            # clear the threshold.
            if similarity_score < min_similarity:
                break

            if similar_word != word:
                results.append((similar_word, similarity_score))

            if len(results) >= top_k:
                break

        return results

    def get_word_embedding(self, word: str) -> np.ndarray:
        """Get the embedding vector for a specific word."""
        return self.model.encode([word.strip().lower()])[0]


def main():
    """Demo the word similarity functionality."""
    engine = WordSimilarityEngine()

    test_words = ["cat", "science", "computer", "ocean", "music"]

    print("\n" + "="*60)
    print("WORD SIMILARITY DEMO")
    print("="*60)

    for test_word in test_words:
        print(f"\nWords similar to '{test_word}':")
        print("-" * 30)

        similar_words = engine.find_similar_words(test_word, top_k=8)

        if similar_words:
            for word, score in similar_words:
                print(f" {word:<15} (similarity: {score:.3f})")
        else:
            print(" No similar words found.")

    print("\n" + "="*60)
    print("INTERACTIVE MODE (type 'quit' to exit)")
    print("="*60)
    while True:
        try:
            user_word = input("\nEnter a word to find similar words: ").strip()

            if user_word.lower() == 'quit':
                break

            if not user_word:
                continue

            print(f"\nWords similar to '{user_word}':")
            print("-" * 30)

            similar_words = engine.find_similar_words(user_word, top_k=50)

            if similar_words:
                for word, score in similar_words:
                    print(f" {word:<15} (similarity: {score:.3f})")
            else:
                print(" No similar words found.")

        except KeyboardInterrupt:
            break
        except Exception as e:
            print(f"Error: {e}")

    print("\nGoodbye!")


if __name__ == "__main__":
    main()