Spaces: Sleeping
Deepak Sahu committed on
Commit · dc4b86a
1 Parent(s): 481a63a
adding other files
Browse files
- app.py +7 -5
- books_summary.csv +0 -0
- clean_books_summary.csv +0 -0
- unique_titles_books_summary.csv +0 -0
- z_clean_data.py +42 -0
- z_embedding.py +74 -0
- z_evaluate.py +65 -0
- z_finetune_gpt.py +77 -0
- z_hypothetical_summary.py +46 -0
- z_similarity.py +30 -0
- z_utils.py +30 -0
app.py
CHANGED
@@ -21,16 +21,18 @@
 # Reference: https://huggingface.co/learn/nlp-course/en/chapter9/2
 
 import gradio as gr
+from z_similarity import computes_similarity_w_hypothetical
+from z_hypothetical_summary import generate_summaries
 
-
-
-
-
+def get_recommendation(book_title: str):
+    # Generate hypothetical summary
+    fake_summaries = generate_summaries(book_title=book_title, n_samples=5)  # other parameters are set to default in the function
+    return fake_summaries[0]
 
 # We instantiate the Textbox class
 textbox = gr.Textbox(label="Write the truth you wanna know:", placeholder="John Doe", lines=2)
 
 
-demo = gr.Interface(fn=
+demo = gr.Interface(fn=get_recommendation, inputs=textbox, outputs="text")
 
 demo.launch()
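Note: the new app.py imports computes_similarity_w_hypothetical but get_recommendation only returns the first generated summary. A possible follow-up, sketched below and not part of this commit, would rank the stored titles against the hypothetical summaries (it assumes unique_titles_books_summary.csv and the z_* helpers added in this commit):

# Sketch only: return ranked book titles instead of a raw generated summary.
from z_utils import get_dataframe
from z_similarity import computes_similarity_w_hypothetical
from z_hypothetical_summary import generate_summaries

books_df = get_dataframe("unique_titles_books_summary.csv")

def get_recommendation(book_title: str, top_n: int = 5) -> str:
    # HyDE-style retrieval: generate fake summaries for the query title, then rank the real ones.
    fake_summaries = generate_summaries(book_title=book_title, n_samples=5)
    similarity, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)
    top_titles = books_df.iloc[ranks[:top_n]]["book_name"].tolist()
    return "\n".join(top_titles)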
books_summary.csv
ADDED
The diff for this file is too large to render.
See raw diff
clean_books_summary.csv
ADDED
The diff for this file is too large to render.
See raw diff
unique_titles_books_summary.csv
ADDED
The diff for this file is too large to render.
See raw diff
z_clean_data.py
ADDED
@@ -0,0 +1,42 @@
from z_utils import get_dataframe

# Const
ORIGINAL_DF = "books_summary.csv"
CLEAN_DF = "clean_" + ORIGINAL_DF
CLEAN_DF_UNIQUE_TITLES = "unique_titles_" + ORIGINAL_DF

# Load dataset
books_df = get_dataframe(ORIGINAL_DF)

# Original stats
print(f"Original Shape: {books_df.shape}")

# Keep only the required columns (another way would be .drop(...) on the rest)
req_columns = ['book_name', 'summaries', 'categories']
books_df = books_df[req_columns]

# Check for nulls
print(f"\n\nNulls Count=== \n{books_df.isna().sum()}")
# Drop null rows entirely; their remaining attributes don't contribute
books_df.dropna(axis=0, inplace=True)

# Check & remove duplicates
print(f"\n\nDuplicate Records: {books_df.duplicated().sum()}")
books_df.drop_duplicates(inplace=True)

# Final stats
print(f"\n\nCleaned Shape: {books_df.shape}")

# Save the cleaned DF (this still includes the same title under different categories)
print("Storing cleaned dataset as: " + CLEAN_DF)
books_df.to_csv(CLEAN_DF, index=False)

# ==== Now store the unique titles only ====
books_df = books_df[["book_name", "summaries"]]
books_df.drop_duplicates(inplace=True)
print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")

# Save the DF with unique titles & summaries only
print("Storing dataset w/ unique titles & summaries only: " + CLEAN_DF_UNIQUE_TITLES)
books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)
z_embedding.py
ADDED
@@ -0,0 +1,74 @@
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from z_utils import get_dataframe
from tqdm import tqdm

# CONST
EMB_MODEL = "all-MiniLM-L6-v2"
INP_DATASET_CSV = "unique_titles_books_summary.csv"
CACHE_SUMMARY_EMB_NPY = "app_cache/summary_vectors.npy"

# Load model
# Set at module level because the entire runtime keeps using this model.
model = SentenceTransformer(EMB_MODEL)


def dataframe_compute_summary_vector(books_df: pd.DataFrame) -> np.ndarray:
    '''Takes book summaries and computes embedding vectors.

    WARNING: The output follows the row order of the DataFrame; delta/incremental updates are not supported.

    Args:
        books_df: The input DataFrame.

    Returns:
        np.ndarray: Embedding vectors, one row per summary.
    '''
    global model

    if 'summaries' not in books_df.columns:
        raise ValueError("DataFrame must contain a 'summaries' column.")

    # Progress bar
    pbar = tqdm(total=books_df.shape[0])

    def encode(text: str):
        pbar.update(1)
        try:
            return model.encode(text)
        except TypeError:  # Handle NaN
            return np.zeros(384)  # HARDCODED embedding size of all-MiniLM-L6-v2

    summary_vectors = books_df['summaries'].map(encode).to_numpy()

    pbar.close()

    # Stack into an array of shape (N_ROWS, 384)
    summary_vectors = np.stack(summary_vectors)

    return summary_vectors


def get_embeddings(summaries: list[str]) -> np.ndarray:
    '''Utility function that takes hypothetical document(s) and returns their embedding(s).'''
    global model
    if isinstance(summaries, str):
        summaries = [summaries, ]
    return model.encode(summaries)


def cache_create_embeddings(books_csv_path: str, output_path: str) -> None:
    '''Read the books CSV, embed the `summaries` column, and store the vectors in `output_path`.'''
    books_df = get_dataframe(books_csv_path)
    vectors = dataframe_compute_summary_vector(books_df)
    np.save(file=output_path, arr=vectors)
    print(f"Vectors saved to {output_path}")


if __name__ == "__main__":
    print("Generating vectors of the summaries")
    cache_create_embeddings(books_csv_path=INP_DATASET_CSV, output_path=CACHE_SUMMARY_EMB_NPY)
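The NaN fallback hard-codes 384, the embedding size of all-MiniLM-L6-v2. A small sketch, not part of this commit, that derives the size from the loaded model instead, so a future model swap cannot silently break the fallback:

# Sketch: read the embedding dimension from the model rather than hard-coding 384.
dim = model.get_sentence_embedding_dimension()  # 384 for all-MiniLM-L6-v2
fallback_vector = np.zeros(dim)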
z_evaluate.py
ADDED
@@ -0,0 +1,65 @@
import random
from z_utils import get_dataframe
from z_similarity import computes_similarity_w_hypothetical
from z_hypothetical_summary import generate_summaries
from tqdm import tqdm
import numpy as np

# CONST
random.seed(53)
CLEAN_DF_UNIQUE_TITLES = "unique_titles_books_summary.csv"
N_SAMPLES_EVAL = 2
TOP_K = 50
TOP_P = 0.85

books_df = get_dataframe(CLEAN_DF_UNIQUE_TITLES)

# Sample row ids
random_values: list = random.sample(range(0, books_df.shape[0]), N_SAMPLES_EVAL)

reciprocal_ranks: list[float] = list()

pbar = tqdm(total=N_SAMPLES_EVAL)

for idx in random_values:
    # Sample a book
    book = books_df.iloc[idx]

    # Generate hypothetical summaries
    fake_summaries = generate_summaries(book_title=book["book_name"], n_samples=5, top_k=TOP_K, top_p=TOP_P)

    # Compute similarity
    similarity, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)

    # Get reciprocal rank
    df_ranked = books_df.iloc[ranks]
    df_ranked = df_ranked.reset_index()
    df_ranked.drop(columns=["index"], inplace=True)
    rank = df_ranked[df_ranked["book_name"] == book["book_name"]].index.values[0] + 1  # rank index starts at 0, hence offset by 1

    # Update list
    reciprocal_ranks.append(1 / rank)
    pbar.update(1)

pbar.close()

print(f"USING Parameters: TOP_K={TOP_K} TOP_P={TOP_P}")
print("MRR: ", sum(reciprocal_ranks) / len(reciprocal_ranks))

# Calculate five-number summary
values = reciprocal_ranks
minimum = np.min(values)
q1 = np.percentile(values, 25)  # First quartile
median = np.median(values)
q3 = np.percentile(values, 75)  # Third quartile
maximum = np.max(values)

# Print the five-number summary
print("Five-Number Summary:")
print(f"Min: {minimum}")
print(f"Q1 : {q1}")
print(f"Med: {median}")
print(f"Q3 : {q3}")
print(f"Max: {maximum}")
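For reference, MRR here is the mean of 1/rank over the sampled books. A quick sanity check of the arithmetic, using made-up ranks rather than real results:

# If the true book ranks 1st in one evaluation sample and 4th in the other:
reciprocal_ranks = [1 / 1, 1 / 4]
print(sum(reciprocal_ranks) / len(reciprocal_ranks))  # MRR = 0.625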
z_finetune_gpt.py
ADDED
@@ -0,0 +1,77 @@
# This file is meant to be run once, hence sequential code rather than functions.
import pandas as pd
from transformers import AutoTokenizer, set_seed
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from z_utils import get_dataframe

# CONST
INP_DATASET_CSV = "clean_books_summary.csv"
BASE_CAUSAL_MODEL = "openai-community/gpt2"
# TRAINED_MODEL_OUTPUT_DIR = "gpt2-book-summary-generator"  # same name for HF Hub
TRAINED_MODEL_OUTPUT_DIR = "content"  # same name for HF Hub

set_seed(42)
EPOCHS = 1
LR = 2e-5

# Load dataset
books: pd.DataFrame = get_dataframe(INP_DATASET_CSV)

# Create HF dataset, easier to perform preprocessing at scale
dataset_books = Dataset.from_pandas(books, split="train")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_CAUSAL_MODEL)

# Data preprocessing
def preprocess_function(book):
    '''Convert a dataset row into prompt form.
    '''
    # It's a multiline string; don't indent these lines in the editor or the whitespace ends up inside the string.
    text = f'''Genre: {book['categories']}
Book Title: {book['book_name']}
Description: {book['book_name']} {book['summaries']}
'''
    return tokenizer(text)

# Apply preprocessing
tokenized_dataset_books = dataset_books.map(
    preprocess_function,
    # batched=True,
    num_proc=4,
    remove_columns=dataset_books.column_names,
)

# Data collator, required for causal LM
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# Load causal LM
model = AutoModelForCausalLM.from_pretrained(BASE_CAUSAL_MODEL)
training_args = TrainingArguments(
    output_dir=TRAINED_MODEL_OUTPUT_DIR,
    eval_strategy="no",
    learning_rate=LR,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=EPOCHS,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_books,
    # eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Push the model files to the HF Hub
trainer.push_to_hub()
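To make the prompt template concrete, this is roughly what one training example looks like before tokenization; the row values below are made up, only the field names come from clean_books_summary.csv:

# Illustration only: one rendered training example for a hypothetical row.
book = {"categories": "Fantasy", "book_name": "The Hidden Tower", "summaries": "A young mage guards a forgotten library."}
text = f'''Genre: {book['categories']}
Book Title: {book['book_name']}
Description: {book['book_name']} {book['summaries']}
'''
print(text)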
z_hypothetical_summary.py
ADDED
@@ -0,0 +1,46 @@
# This code generates superficial book summaries based on a title.
# The idea is that they share semantic meaning with the stored book summaries,
# because the generator model is fine-tuned on those summaries.
from typing import Optional
from transformers import pipeline, set_seed

# CONST
set_seed(42)
TRAINED_CAUSAL_MODEL = "LunaticMaestro/gpt2-book-summary-generator"


generator_model = pipeline('text-generation', model=TRAINED_CAUSAL_MODEL)


def generate_summaries(book_title: str, genre: Optional[str] = None, n_samples=2, top_k=50, top_p=0.85) -> list[str]:
    '''Generate hypothetical summaries based on a book title.

    Args:
        book_title: a 2-3 word descriptive name of the book
        genre: optional genre to prepend to the prompt
        n_samples: (default=2) count of hypothetical summaries
        top_k: (default=50)
        top_p: (default=0.85)
    Returns:
        summaries: list of hypothetical summaries.
    '''
    global generator_model

    # Basic prompt, very similar to the one used during fine-tuning
    prompt = f'''Book Title: {book_title}
Description: {book_title}'''

    # Use genre if provided, in the same order as during training
    if genre:
        prompt = f"Genre: {genre}\n" + prompt

    # Generate summaries
    output: list[dict] = generator_model(prompt, max_length=100, num_return_sequences=n_samples, top_k=top_k, top_p=top_p)

    # Post-process the summaries to strip the prompt that was provided
    summaries = list()
    for summary in output:
        summaries.append(
            summary['generated_text'].removeprefix(prompt)
        )

    return summaries
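A minimal usage sketch; the title below is made up and the model download from the Hub is assumed to succeed:

from z_hypothetical_summary import generate_summaries

fake_summaries = generate_summaries(book_title="The Silent Forest", n_samples=2)
for s in fake_summaries:
    print(s, end="\n\n")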
z_similarity.py
ADDED
@@ -0,0 +1,30 @@
from z_utils import load_cache_embeddings
from z_embedding import model, get_embeddings
import torch
import numpy as np

books_summaries_embs = load_cache_embeddings()

def computes_similarity_w_hypothetical(hypothetical_summaries: list[str]) -> tuple[np.ndarray, np.ndarray]:
    '''Computes cosine similarity between the stored book summaries and all hypothetical summaries.

    Returns:
        Average cosine similarity between the actual book summaries' embeddings and the hypothetical summaries.

        Ranks of the book summaries based on that cosine similarity; a lower rank means more similar.
    '''
    global books_summaries_embs, model
    hypothetical_summaries_embs = get_embeddings(hypothetical_summaries)
    similarity: torch.Tensor = model.similarity(books_summaries_embs, hypothetical_summaries_embs)

    # Average out the similarity across all hypothetical embeddings
    similarity = torch.mean(similarity, dim=1)

    # Get the order
    ranks = torch.argsort(similarity, descending=True)

    return similarity.detach().numpy(), ranks.detach().numpy()
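Note: SentenceTransformer.similarity is only available in recent sentence-transformers releases (3.x). On older versions an equivalent fallback, sketched here under the assumption of the same embedding arrays, would be:

# Fallback sketch for sentence-transformers versions without model.similarity:
from sentence_transformers import util
similarity = util.cos_sim(books_summaries_embs, hypothetical_summaries_embs)  # torch.Tensor, shape (n_books, n_hypothetical)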
z_utils.py
ADDED
@@ -0,0 +1,30 @@
import pandas as pd
import numpy as np

def get_dataframe(file_path: str) -> pd.DataFrame:
    '''Reads a CSV file and returns a Pandas DataFrame.

    Args:
        file_path: The path to the CSV file.

    Returns:
        pd.DataFrame: The loaded DataFrame.
    '''
    try:
        df = pd.read_csv(file_path)
        # Minor tweak to strip the non-breaking space character (\xa0) from summaries
        df['summaries'] = df['summaries'].str.replace('\xa0', '', regex=False)
        return df
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file at {file_path} is empty.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    # NOTE: if any exception is caught above, the function implicitly returns None.


def load_cache_embeddings(embedding_path: str = "app_cache/summary_vectors.npy") -> np.ndarray:
    '''Returns embeddings of the book summaries.'''
    emb = np.load(embedding_path)
    emb = emb.astype(np.float32)
    return emb