Spaces:

GroNLP
/

agalma

Running

App Files Community

Mark7549 committed on Jun 17, 2024

Commit

17c5755

1 Parent(s): 74e30c6

improved code quality

Browse files

Files changed (1) hide show

word2vec.py +99 -168

word2vec.py CHANGED Viewed

@@ -1,16 +1,9 @@
 from gensim.models import Word2Vec
 from collections import defaultdict
 import os
-import pickle
 import tempfile
 import pandas as pd
-from sklearn.preprocessing import StandardScaler
-from sklearn.manifold import TSNE
-import plotly.express as px
 from collections import Counter
-import streamlit as st
 def load_all_models():
@@ -30,6 +23,8 @@ def load_all_models():
 def load_selected_models(selected_models):
     '''
         Load the selected word2vec models
     '''
     models = []
     for model in selected_models:
@@ -48,6 +43,8 @@ def load_selected_models(selected_models):
 def load_word2vec_model(model_path):
     '''
         Load a word2vec model from a file
     '''
     return Word2Vec.load(model_path)
@@ -55,6 +52,9 @@ def load_word2vec_model(model_path):
 def get_word_vector(model, word):
     '''
         Return the word vector of a word
     '''
     return model.wv[word]
@@ -62,6 +62,8 @@ def get_word_vector(model, word):
 def iterate_over_words(model):
     '''
         Iterate over all words in the vocabulary and print their vectors
     '''
     index = 0
     for word, index in model.wv.key_to_index.items():
@@ -74,6 +76,8 @@ def model_dictionary(model):
     '''
         Return the dictionary of the word2vec model
         Key is the word and value is the vector of the word
     '''
     dict = defaultdict(list)
     for word, index in model.wv.key_to_index.items():
@@ -86,13 +90,24 @@ def model_dictionary(model):
 def dot_product(vector_a, vector_b):
     '''
         Return the dot product of two vectors
     '''
     return sum(a * b for a, b in zip(vector_a, vector_b))
 def magnitude(vector):
     '''
-        Return the magnitude of a vector
     '''
     return sum(x**2 for x in vector) ** 0.5
@@ -100,6 +115,13 @@ def magnitude(vector):
 def cosine_similarity(vector_a, vector_b):
     '''
         Return the cosine similarity of two vectors
     '''
     dot_prod = dot_product(vector_a, vector_b)
     mag_a = magnitude(vector_a)
@@ -116,10 +138,16 @@ def cosine_similarity(vector_a, vector_b):
 def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
     '''
         Return the cosine similarity of two words
     '''
-    # TO DO: MOET NETTER
-    # Return if path does not exist
     time_slice_1 = convert_time_name_to_model(time_slice_1)
     time_slice_2 = convert_time_name_to_model(time_slice_2)
@@ -139,6 +167,14 @@ def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
 def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
     '''
         Return the cosine similarity of one word in two different time slices
     '''
     # Return if path does not exist
@@ -158,6 +194,14 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
 def validate_nearest_neighbours(word, n, models):
     '''
         Validate the input of the nearest neighbours function
     '''
     if word == '' or n == '' or models == []:
         return False
@@ -167,6 +211,11 @@ def validate_nearest_neighbours(word, n, models):
 def convert_model_to_time_name(model_name):
     '''
         Convert the model name to the time slice name
     '''
     if model_name == 'archaic_cbow' or model_name == 'archaic':
         return 'Archaic'
@@ -183,6 +232,12 @@ def convert_model_to_time_name(model_name):
 def convert_time_name_to_model(time_name):
     '''
         Convert the time slice name to the model name
     '''
     if time_name == 'Archaic':
         return 'archaic_cbow'
@@ -205,52 +260,6 @@ def convert_time_name_to_model(time_name):
     elif time_name == 'archaic':
         return 'Archaic'
-def get_nearest_neighbours2(word, n=10, models=load_all_models()):
-    '''
-        Return the nearest neighbours of a word
-        word: the word for which the nearest neighbours are calculated
-        time_slice_model: the word2vec model of the time slice of the input word
-        models: list of tuples with the name of the time slice and the word2vec model (default: all in ./models)
-        n: the number of nearest neighbours to return (default: 10)
-        Return: list of tuples with the word, the time slice and
-                the cosine similarity of the nearest neighbours
-    '''
-    time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
-    vector_1 = get_word_vector(time_slice_model, word)
-    nearest_neighbours = []
-    # Iterate over all models
-    for model in models:
-        model_name = model[0]
-        time_name = convert_model_to_time_name(model_name)
-        model = model[1]
-        # Iterate over all words of the model
-        for word, index in model.wv.key_to_index.items():
-            # Vector of the current word
-            vector_2 = get_word_vector(model, word)
-            # Calculate the cosine similarity between current word and input word
-            cosine_similarity_vectors = cosine_similarity(vector_1, vector_2)
-            # If the list of nearest neighbours is not full yet, add the current word
-            if len(nearest_neighbours) < n:
-                nearest_neighbours.append((word, time_name, cosine_similarity_vectors))
-            # If the list of nearest neighbours is full, replace the word with the smallest cosine similarity
-            else:
-                smallest_neighbour = min(nearest_neighbours, key=lambda x: x[2])
-                if cosine_similarity_vectors > smallest_neighbour[2]:
-                    nearest_neighbours.remove(smallest_neighbour)
-                    nearest_neighbours.append((word, time_name, cosine_similarity_vectors))
-    return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
 def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
     """
@@ -298,9 +307,16 @@ def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
 def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
-    """
-        Returns the vectors of the nearest neighbours of a word
-    """
     model_name = convert_model_to_time_name(time_slice_model)
     time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
     vector_1 = get_word_vector(time_slice_model, word)
@@ -327,6 +343,10 @@ def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
 def write_to_file(data):
     '''
         Write the data to a file
     '''
     # Create random tmp file name
     temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")
@@ -342,7 +362,11 @@ def write_to_file(data):
 def store_df_in_temp_file(all_dfs):
     '''
-        Store the dataframe in a temporary file
     '''
     # Define directory for temporary files
     temp_dir = "./downloads/nn"
@@ -350,37 +374,34 @@ def store_df_in_temp_file(all_dfs):
     # Create the directory if it doesn't exist
     os.makedirs(temp_dir, exist_ok=True)
-    # Create random tmp file name
     _, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".xlsx", dir=temp_dir)
     # Concatenate all dataframes
     df = pd.concat([df for _, df in all_dfs], axis=1, keys=[model for model, _ in all_dfs])
     # Create an ExcelWriter object
     with pd.ExcelWriter(temp_file_path, engine='xlsxwriter') as writer:
         # Create a new sheet
         worksheet = writer.book.add_worksheet('Results')
-        # Write text before DataFrames
         start_row = 0
         for model, df in all_dfs:
-            # Write model name as text
             worksheet.write(start_row, 0, f"Model: {model}")
-            # Write DataFrame
             df.to_excel(writer, sheet_name='Results', index=False, startrow=start_row + 1, startcol=0)
-            # Update start_row for the next model
             start_row += df.shape[0] + 3  # Add some space between models
     return temp_file_path
 def check_word_in_models(word):
-    """
-        Check in which models a word occurs.
-    """
     all_models = load_all_models()
     eligible_models = []
@@ -393,75 +414,16 @@ def check_word_in_models(word):
     return eligible_models
-def reduce_dimensions_tSNE():
-    '''
-        Reduce the dimensions of the data using t-SNE
-    '''
-    all_models = load_all_models()
-    for model in all_models:
-        model_name = model[0]
-        model = model[1]
-        model_dict = model_dictionary(model)
-        # Extract vectors and names from model_dict
-        all_vector_names = list(model_dict.keys())
-        all_vectors = list(model_dict.values())
-        print('Scaling', model_name)
-        # Scale vectors
-        scaler = StandardScaler()
-        vectors_scaled = scaler.fit_transform(all_vectors)
-        print('Fitting', model_name)
-        # Make t-SNE model and fit it to the scaled vectors
-        tsne_model = TSNE(n_components=3, random_state=42)
-        tsne_result = tsne_model.fit_transform(vectors_scaled)
-        print('Done fitting')
-        # Associate the names with the 3D representations
-        result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
-        # Store all vectors in /3d_models/{model_name}.model
-        store_3d_model(result_with_names, model_name)
-def store_3d_model(result_with_names, model_name):
-    """
-    Store the 3D model data to a file.
-    """
-    output_dir = './3d_models'
-    os.makedirs(output_dir, exist_ok=True)
-    file_path = os.path.join(output_dir, f'{model_name}.model')
-    with open(file_path, 'wb') as f:
-        pickle.dump(result_with_names, f)
-    print(f"3D model for {model_name} stored at {file_path}")
-def print_3d_model(model_name):
-    """
-    Print the 3D model data.
-    """
-    file_path = f'./3d_models/{model_name}.model'
-    with open(file_path, 'rb') as f:
-        result_with_names = pickle.load(f)
-    for word, vector in result_with_names:
-        print(f'{word}: {vector}')
-def count_lemmas(directory):
-    """
-        Create a Counter with all words and their occurences for all models
-    """
     lemma_count_dict = {}
     for file in os.listdir(directory):
         model_name = file.split('.')[0].replace('_', ' ').capitalize()
@@ -475,34 +437,3 @@ def count_lemmas(directory):
                 lemma_count_dict[model_name] = Counter(words)
     return lemma_count_dict
-def main():
-    # model = load_word2vec_model('models/archaic_cbow.model')
-    # archaic_cbow_dict = model_dictionary(model)
-    # score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
-    # print(score)
-    # archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
-    # classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
-    # early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
-    # hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
-    # late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
-    # models = [archaic, classical, early_roman, hellen, late_roman]
-    # nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
-    # print(nearest_neighbours)
-    # vector = get_word_vector(model, 'ἀνήρ')
-    # print(vector)
-    # Iterate over all words and print their vectors
-    # iterate_over_words(model)
-    print(count_lemmas('lemma_list_raw'))
-if __name__ == "__main__":
-    main()

 from gensim.models import Word2Vec
 from collections import defaultdict
 import os
 import tempfile
 import pandas as pd
 from collections import Counter
 def load_all_models():
 def load_selected_models(selected_models):
     '''
         Load the selected word2vec models
+        selected_models: a list of models that should be loaded
     '''
     models = []
     for model in selected_models:
 def load_word2vec_model(model_path):
     '''
         Load a word2vec model from a file
+        model_path: relative path to model files
     '''
     return Word2Vec.load(model_path)
 def get_word_vector(model, word):
     '''
         Return the word vector of a word
+        model: word2vec model object
+        word: word to extract vector from
     '''
     return model.wv[word]
 def iterate_over_words(model):
     '''
         Iterate over all words in the vocabulary and print their vectors
+        model: word2vec model object
     '''
     index = 0
     for word, index in model.wv.key_to_index.items():
     '''
         Return the dictionary of the word2vec model
         Key is the word and value is the vector of the word
+        model: word2vec model object
     '''
     dict = defaultdict(list)
     for word, index in model.wv.key_to_index.items():
 def dot_product(vector_a, vector_b):
     '''
         Return the dot product of two vectors
+        vector_a: A list of numbers representing the first vector
+        vector_b: A list of numbers representing the second vector
+        Returns:
+        A single number representing the dot product of the two vectors
     '''
     return sum(a * b for a, b in zip(vector_a, vector_b))
 def magnitude(vector):
     '''
+        Returns the magnitude of a vector
+        vector: A list of numbers representing the vector
+        Returns:
+        A single number representing the magnitude of the vector.
     '''
     return sum(x**2 for x in vector) ** 0.5
 def cosine_similarity(vector_a, vector_b):
     '''
         Return the cosine similarity of two vectors
+        vector_a: A list of numbers representing the first vector
+        vector_b: A list of numbers representing the second vector
+        Returns:
+        A String representing the cosine similarity of the two vectors \
+        formatted to two decimals.
     '''
     dot_prod = dot_product(vector_a, vector_b)
     mag_a = magnitude(vector_a)
 def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
     '''
         Return the cosine similarity of two words
+        word1: The first word as a string.
+        time_slice_1: The time slice for the first word as a string.
+        word2: The second word as a string.
+        time_slice_2: The time slice for the second word as a string.
+        Returns:
+        A string representing the cosine similarity of the two words formatted to two decimal places.
     '''
     time_slice_1 = convert_time_name_to_model(time_slice_1)
     time_slice_2 = convert_time_name_to_model(time_slice_2)
 def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
     '''
         Return the cosine similarity of one word in two different time slices
+        word: The word as a string.
+        time_slice1: The first time slice as a string.
+        time_slice2: The second time slice as a string.
+        Returns:
+        A string representing the cosine similarity of the word in two different time slices formatted to two decimal places.
     '''
     # Return if path does not exist
 def validate_nearest_neighbours(word, n, models):
     '''
         Validate the input of the nearest neighbours function
+        word: The word as a string.
+        n: The number of nearest neighbours to find as an integer.
+        models: A list of model names as strings.
+        Returns:
+        A boolean value. True if inputs are valid, False otherwise.
     '''
     if word == '' or n == '' or models == []:
         return False
 def convert_model_to_time_name(model_name):
     '''
         Convert the model name to the time slice name
+        model_name: The model name as a string.
+        Returns:
+        A string representing the corresponding time slice name.
     '''
     if model_name == 'archaic_cbow' or model_name == 'archaic':
         return 'Archaic'
 def convert_time_name_to_model(time_name):
     '''
         Convert the time slice name to the model name
+        time_name: The time slice name as a string.
+        Returns:
+        A string representing the corresponding model name.
     '''
     if time_name == 'Archaic':
         return 'archaic_cbow'
     elif time_name == 'archaic':
         return 'Archaic'
 def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
     """
 def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
+    '''
+        Return the vectors of the nearest neighbours of a word
+        word: the word for which the nearest neighbours are calculated
+        time_slice_model: the word2vec model of the time slice of the input word
+        n: the number of nearest neighbours to return (default: 15)
+        Return: list of tuples with the word, the time slice, the vector, and the cosine similarity
+                of the nearest neighbours
+    '''
     model_name = convert_model_to_time_name(time_slice_model)
     time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
     vector_1 = get_word_vector(time_slice_model, word)
 def write_to_file(data):
     '''
         Write the data to a file
+        data: the data to be written to the file
+        Return: the path to the temporary file
     '''
     # Create random tmp file name
     temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")
 def store_df_in_temp_file(all_dfs):
     '''
+        Store the dataframes in a temporary file
+        all_dfs: list of tuples with the name of the time slice and the dataframe
+        Return: the path to the temporary Excel file
     '''
     # Define directory for temporary files
     temp_dir = "./downloads/nn"
     # Create the directory if it doesn't exist
     os.makedirs(temp_dir, exist_ok=True)
+    # Create random temporary file name
     _, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".xlsx", dir=temp_dir)
     # Concatenate all dataframes
     df = pd.concat([df for _, df in all_dfs], axis=1, keys=[model for model, _ in all_dfs])
     # Create an ExcelWriter object
     with pd.ExcelWriter(temp_file_path, engine='xlsxwriter') as writer:
         # Create a new sheet
         worksheet = writer.book.add_worksheet('Results')
         start_row = 0
         for model, df in all_dfs:
             worksheet.write(start_row, 0, f"Model: {model}")
             df.to_excel(writer, sheet_name='Results', index=False, startrow=start_row + 1, startcol=0)
             start_row += df.shape[0] + 3  # Add some space between models
     return temp_file_path
 def check_word_in_models(word):
+    '''
+        Check in which models a word occurs
+        word: the word to check
+        Return: list of model names where the word occurs
+    '''
     all_models = load_all_models()
     eligible_models = []
     return eligible_models
+def count_lemmas(directory):
+    '''
+        Create a Counter with all words and their occurrences for all models
+        directory: the directory containing the text files for the models
+        Return: a dictionary where keys are model names and values are Counters of word occurrences
+    '''
     lemma_count_dict = {}
     for file in os.listdir(directory):
         model_name = file.split('.')[0].replace('_', ' ').capitalize()
                 lemma_count_dict[model_name] = Counter(words)
     return lemma_count_dict