Sped up 3D plot creation by using pretrained vectors stored in the ./3d_models directory
- 3d_models/archaic_cbow.model  +3 -0
- 3d_models/classical_cbow.model  +3 -0
- 3d_models/early_roman_cbow.model  +3 -0
- 3d_models/hellen_cbow.model  +3 -0
- 3d_models/late_roman_cbow.model  +3 -0
- app.py  +2 -4
- plots.py  +17 -19
- word2vec.py  +82 -8
3d_models/archaic_cbow.model  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce261e66010d55466a312dec46a0eb0eefed49158932599bfc45345d47e5d7c2
+size 231604
3d_models/classical_cbow.model  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:052b888d4678c06e41ac8f7d6a8e9ffd441178b7481230c8fcab287c38140d40
+size 911163
3d_models/early_roman_cbow.model  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f39dd99b02d0bc39f28bf0df12bd81a155b9df1a38b8634032887c5302b7650
+size 1238889
3d_models/hellen_cbow.model  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2e9ac6e2bd5107f376cc831bc5a571b0b25a28fee4f45418d5f5b7fe2df7f78
+size 794386
3d_models/late_roman_cbow.model  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4846279207474ff1feab84f05e0802020b9b4ed46b3f4cead259e0c99ea4c4
+size 532145
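
Note: the five new files under 3d_models/ are tracked with Git LFS, so the diff above only shows pointer blobs (version, oid, size). The actual payload is a pickle written by the new store_3d_model() helper in word2vec.py below, assumed to hold a list of (word, 3d_vector) pairs per time slice. A minimal sketch for inspecting one of them once the LFS objects have been pulled:

    import pickle

    # Hypothetical spot-check of one precomputed file; the list structure
    # [(word, 3d_vector), ...] follows store_3d_model() in word2vec.py.
    with open('./3d_models/archaic_cbow.model', 'rb') as f:
        result_with_names = pickle.load(f)

    print(len(result_with_names))       # number of stored vocabulary items
    word, vector = result_with_names[0]
    print(word, vector)                 # a word and its 3-component t-SNE vector
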
app.py  CHANGED
@@ -216,11 +216,9 @@ elif active_tab == "3D graph":
 
     if graph_button:
         time_slice_model = convert_time_name_to_model(time_slice)
-        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
+        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
 
-        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
-
-        # st.dataframe(df)
+        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
 
         st.plotly_chart(fig)
 
plots.py  CHANGED
@@ -10,33 +10,30 @@ import plotly.express as px
 from sklearn.manifold import TSNE
 
 
-def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
+def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     """
     Turn list of 100D vectors into a 3D plot using t-SNE and Plotly.
     List structure: [(word, model_name, vector, cosine_sim)]
     """
+    word = target_word
+
     # Load model
     model = load_word2vec_model(f'models/{time_slice_model}.model')
-    vectors_scaled = scaler.fit_transform(all_vectors)
-
-    # Make t-SNE model and fit it to the scaled vectors
-    tsne_model = TSNE(n_components=3, random_state=0)
-    tsne_result = tsne_model.fit_transform(vectors_scaled)
-
-    # Associate the names with the 3D representations
-    result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
+
+    # Extract vectors and names from ./3d_models/{time_slice_model}.model
+    all_vectors = {}
+    with open(f'./3d_models/{time_slice_model}.model', 'rb') as f:
+        result_with_names = pickle.load(f)
+
+    for word, vector in result_with_names:
+        all_vectors[word] = vector
 
     # Only keep the vectors that are in vectors_list and their cosine similarities
-    result_with_names = [
+    result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
+
 
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

@@ -44,14 +41,15 @@ def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
     # Sort dataframe by cosine_sim
     df = df.sort_values(by='cosine_sim', ascending=False)
 
+
     x = df['3d_vector'].apply(lambda v: v[0])
     y = df['3d_vector'].apply(lambda v: v[1])
     z = df['3d_vector'].apply(lambda v: v[2])
-
+
     # Plot
     fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
     fig.update_traces(marker=dict(size=5))
-    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+    fig.update_layout(title=f'3D plot of nearest neighbours to {target_word}')
 
     return fig, df
 
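
Note: the rewritten make_3d_plot_tSNE no longer scales vectors and fits t-SNE per request; it only looks up precomputed 3D vectors from the pickle and plots them (it also relies on pickle being imported in plots.py, which is not visible in this hunk). The plotting step can be exercised in isolation; a standalone sketch with made-up words and coordinates (all values hypothetical):

    import pandas as pd
    import plotly.express as px

    # Toy stand-in for the (word, 3d_vector, cosine_sim) rows built from the pickle.
    result_with_names = [
        ('λόγος', [0.1, 0.5, -0.3], 1.00),
        ('μῦθος', [0.4, 0.2, -0.1], 0.83),
        ('ἔπος', [0.3, 0.6, 0.0], 0.79),
    ]
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split the 3D vectors into plotable coordinate columns, as in plots.py.
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word',
                        color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.show()
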
word2vec.py  CHANGED
@@ -1,9 +1,15 @@
 from gensim.models import Word2Vec
 from collections import defaultdict
 import os
+import pickle
 import tempfile
 import pandas as pd
 import xlsxwriter
+from sklearn.preprocessing import StandardScaler
+from sklearn.manifold import TSNE
+import plotly.express as px
+
+
 
 
 def load_all_models():

@@ -302,6 +308,7 @@ def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
 
 
     for word, index in time_slice_model.wv.key_to_index.items():
+        print(word)
         vector_2 = get_word_vector(time_slice_model, word)
         cosine_sim = cosine_similarity(vector_1, vector_2)
 

@@ -386,6 +393,71 @@ def check_word_in_models(word):
     return eligible_models
 
 
+
+def reduce_dimensions_tSNE():
+    '''
+    Reduce the dimensions of the data using t-SNE
+    '''
+    all_models = load_all_models()
+
+    for model in all_models:
+        model_name = model[0]
+        model = model[1]
+        model_dict = model_dictionary(model)
+
+        # Extract vectors and names from model_dict
+        all_vector_names = list(model_dict.keys())
+        all_vectors = list(model_dict.values())
+
+        print('Scaling', model_name)
+
+        # Scale vectors
+        scaler = StandardScaler()
+        vectors_scaled = scaler.fit_transform(all_vectors)
+
+        print('Fitting', model_name)
+
+        # Make t-SNE model and fit it to the scaled vectors
+        tsne_model = TSNE(n_components=3, random_state=42)
+        tsne_result = tsne_model.fit_transform(vectors_scaled)
+
+        print('Done fitting')
+
+        # Associate the names with the 3D representations
+        result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
+
+        # Store all vectors in /3d_models/{model_name}.model
+        store_3d_model(result_with_names, model_name)
+
+
+def store_3d_model(result_with_names, model_name):
+    """
+    Store the 3D model data to a file.
+    """
+    output_dir = './3d_models'
+    os.makedirs(output_dir, exist_ok=True)
+    file_path = os.path.join(output_dir, f'{model_name}.model')
+
+    with open(file_path, 'wb') as f:
+        pickle.dump(result_with_names, f)
+    print(f"3D model for {model_name} stored at {file_path}")
+
+
+def print_3d_model(model_name):
+    """
+    Print the 3D model data.
+    """
+    file_path = f'./3d_models/{model_name}.model'
+
+    with open(file_path, 'rb') as f:
+        result_with_names = pickle.load(f)
+
+    for word, vector in result_with_names:
+        print(f'{word}: {vector}')
+
+
+
 def main():
     # model = load_word2vec_model('models/archaic_cbow.model')
     # archaic_cbow_dict = model_dictionary(model)

@@ -394,20 +466,22 @@ def main():
     # print(score)
 
 
-    archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
-    classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
-    early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
-    hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
-    late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
+    # archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
+    # classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
+    # early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
+    # hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
+    # late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
 
-    models = [archaic, classical, early_roman, hellen, late_roman]
-    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
-    print(nearest_neighbours)
+    # models = [archaic, classical, early_roman, hellen, late_roman]
+    # nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
+    # print(nearest_neighbours)
     # vector = get_word_vector(model, 'ἀνήρ')
     # print(vector)
 
     # Iterate over all words and print their vectors
    # iterate_over_words(model)
+
+    print_3d_model('archaic')
 
 
 if __name__ == "__main__":
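
Note: the expensive work now happens offline. reduce_dimensions_tSNE() scales all vectors returned by model_dictionary() for each time slice, fits a 3-component t-SNE once per slice, and pickles the result into ./3d_models, so make_3d_plot_tSNE only reads the pickle back at plot time. A minimal sketch of regenerating the files, assuming word2vec.py is importable and the original Word2Vec models exist under ./models:

    # Offline regeneration of the precomputed 3D vectors (one pickle per time slice).
    from word2vec import reduce_dimensions_tSNE

    if __name__ == '__main__':
        reduce_dimensions_tSNE()

Because t-SNE is fitted once over a slice's full vocabulary rather than refitted per query, every requested word is plotted against the same global 3D embedding, which is what makes the 3D graph tab faster.
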