Deepak Sahu committed
Commit dc4b86a · 1 Parent(s): 481a63a

adding other files

app.py CHANGED
@@ -21,16 +21,18 @@
 # Reference: https://huggingface.co/learn/nlp-course/en/chapter9/2
 
 import gradio as gr
+from z_similarity import computes_similarity_w_hypothetical
+from z_hypothetical_summary import generate_summaries
 
-
-def greet(name):
-    return "Hello " + name
-
+def get_recommendation(book_title: str):
+    # Generate hypothetical summary
+    fake_summaries = generate_summaries(book_title=book_title, n_samples=5)  # other parameters are set to default in the function
+    return fake_summaries[0]
 
 # We instantiate the Textbox class
 textbox = gr.Textbox(label="Write truth you wana Know:", placeholder="John Doe", lines=2)
 
 
-demo = gr.Interface(fn=greet, inputs=textbox, outputs="text")
+demo = gr.Interface(fn=get_recommendation, inputs=textbox, outputs="text")
 
 demo.launch()
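A minimal sketch (not part of this commit) of how get_recommendation could return actual stored titles instead of the first generated summary, wired through the helpers added below; unique_titles_books_summary.csv and the book_name column are taken from those files, while the top-3 join is an illustrative choice.

# Hypothetical extension of app.py: rank stored books against the generated summaries.
import gradio as gr
from z_utils import get_dataframe
from z_similarity import computes_similarity_w_hypothetical
from z_hypothetical_summary import generate_summaries

books_df = get_dataframe("unique_titles_books_summary.csv")

def get_recommendation(book_title: str) -> str:
    fake_summaries = generate_summaries(book_title=book_title, n_samples=5)
    _, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)
    # ranks orders the stored summaries from most to least similar
    top_titles = books_df.iloc[ranks[:3]]["book_name"].tolist()
    return ", ".join(top_titles)

demo = gr.Interface(fn=get_recommendation, inputs="text", outputs="text")
demo.launch()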
books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
clean_books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
unique_titles_books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
z_clean_data.py ADDED
@@ -0,0 +1,42 @@
+from z_utils import get_dataframe
+
+# Const
+ORIGNAL_DF = "books_summary.csv"
+CLEAN_DF = "clean_" + ORIGNAL_DF
+CLEAN_DF_UNIQUE_TITLES = "unique_titles_" + ORIGNAL_DF
+
+# Load dataset
+books_df = get_dataframe(ORIGNAL_DF)
+
+# Original stats
+print(f"Original Shape: {books_df.shape}")
+
+# Keep only the required columns
+req_columns = ['book_name', 'summaries', 'categories']
+books_df = books_df[req_columns]  # another way could be .drop(...)
+
+# Check for nulls
+print(f"\n\nNulls Count=== \n{books_df.isna().sum()}")
+# Drop null rows: their remaining attributes don't contribute anything useful
+books_df.dropna(axis=0, inplace=True)
+
+
+# Check & remove duplicates
+print(f"\n\nDuplicate Records: {books_df.duplicated().sum()}")
+books_df.drop_duplicates(inplace=True)
+
+
+# Final stats
+print(f"\n\nCleaned Shape: {books_df.shape}")
+
+# Save the cleaned DF (this still includes the same title under different categories)
+print("Storing cleaned dataset (includes same titles with different categories): " + CLEAN_DF)
+books_df.to_csv(CLEAN_DF, index=False)
+
+# ==== Now store the unique titles ====
+books_df = books_df[["book_name", "summaries"]]
+books_df.drop_duplicates(inplace=True)
+print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")
+# Save the unique-titles DF
+print("Storing dataset w/ unique titles & summaries only: " + CLEAN_DF_UNIQUE_TITLES)
+books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)
z_embedding.py ADDED
@@ -0,0 +1,74 @@
+from sentence_transformers import SentenceTransformer
+import pandas as pd
+import numpy as np
+from z_utils import get_dataframe
+from tqdm import tqdm
+
+# CONST
+EMB_MODEL = "all-MiniLM-L6-v2"
+INP_DATASET_CSV = "unique_titles_books_summary.csv"
+CACHE_SUMMARY_EMB_NPY = "app_cache/summary_vectors.npy"
+
+# Load Model
+# Set at module level because the entire runtime keeps reusing this model.
+model = SentenceTransformer(EMB_MODEL)
+
+
+def dataframe_compute_summary_vector(books_df: pd.DataFrame) -> np.ndarray:
+    '''Takes book summaries and computes embedding vectors.
+
+    WARNING: the output follows the row order of the DataFrame; delta (incremental) updates are not supported.
+
+    Args:
+        books_df: The input DataFrame.
+
+    Returns:
+        np.ndarray: one embedding vector per row of `books_df`.
+    '''
+    global model
+
+    if 'summaries' not in books_df.columns:
+        raise ValueError("DataFrame must contain a 'summaries' column.")
+
+    # Progress bar
+    pbar = tqdm(total=books_df.shape[0])
+
+    def encode(text: str):
+        pbar.update(1)
+        try:
+            return model.encode(text)
+        except TypeError:  # Handle NaN values
+            return np.zeros(384)  # hard-coded embedding size of all-MiniLM-L6-v2
+
+    summary_vectors = books_df['summaries'].map(encode).to_numpy()
+
+    pbar.close()
+
+    # stack into an array of shape (N_ROWS, 384)
+    summary_vectors = np.stack(summary_vectors)
+
+    return summary_vectors
+
+
+def get_embeddings(summaries: list[str]) -> np.ndarray:
+    '''Utility function that takes hypothetical document(s) and returns their embedding(s).'''
+    global model
+    if isinstance(summaries, str):
+        summaries = [summaries, ]
+    return model.encode(summaries)
+
+
+def cache_create_embeddings(books_csv_path: str, output_path: str) -> None:
+    '''Reads the books CSV, generates vectors for the `summaries` column and stores them in `output_path`.'''
+    books_df = get_dataframe(books_csv_path)
+    vectors = dataframe_compute_summary_vector(books_df)
+    np.save(file=output_path, arr=vectors)
+    print(f"Vectors saved to {output_path}")
+
+
+if __name__ == "__main__":
+    print("Generating vectors of the summaries")
+    cache_create_embeddings(books_csv_path=INP_DATASET_CSV, output_path=CACHE_SUMMARY_EMB_NPY)
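A quick sanity check (not part of the commit), assuming the cache file has already been produced by running z_embedding.py; the query string is made up and the manual cosine computation is only illustrative.

# Hypothetical sanity check for the cached embeddings.
import numpy as np
from z_embedding import get_embeddings

vectors = np.load("app_cache/summary_vectors.npy")
print(vectors.shape)  # expected: (number_of_unique_titles, 384)

query_vec = get_embeddings("a murder mystery set in a small coastal town")
# cosine similarity of the query against every stored summary vector
norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec, axis=1)
scores = (vectors @ query_vec.T).squeeze() / norms
print(scores.argsort()[::-1][:5])  # indices of the 5 closest summaries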
z_evaluate.py ADDED
@@ -0,0 +1,65 @@
+import random
+from z_utils import get_dataframe
+from z_similarity import computes_similarity_w_hypothetical
+from z_hypothetical_summary import generate_summaries
+from tqdm import tqdm
+import numpy as np
+
+# CONST
+random.seed(53)
+CLEAN_DF_UNIQUE_TITLES = "unique_titles_books_summary.csv"
+N_SAMPLES_EVAL = 2
+TOP_K = 50
+TOP_P = 0.85
+
+books_df = get_dataframe(CLEAN_DF_UNIQUE_TITLES)
+
+# Sample row ids
+random_values: list = random.sample(range(0, books_df.shape[0]), N_SAMPLES_EVAL)
+
+reciprocal_ranks: list[float] = list()
+
+pbar = tqdm(total=N_SAMPLES_EVAL)
+
+for idx in random_values:
+    # Sample a book
+    book = books_df.iloc[idx]
+
+    # Generate hypothetical summaries
+    fake_summaries = generate_summaries(book_title=book["book_name"], n_samples=5, top_k=TOP_K, top_p=TOP_P)
+
+    # Compute similarity
+    similarity, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)
+
+    # Get reciprocal rank
+    df_ranked = books_df.iloc[ranks]
+    df_ranked = df_ranked.reset_index()
+    df_ranked.drop(columns=["index"], inplace=True)
+    rank = df_ranked[df_ranked["book_name"] == book["book_name"]].index.values[0] + 1  # index starts at 0, hence the +1 offset
+
+    # Update list
+    reciprocal_ranks.append(1 / rank)
+    pbar.update(1)
+
+pbar.close()
+
+print(f"USING Parameters: TOP_K={TOP_K} TOP_P={TOP_P}")
+print("MRR: ", sum(reciprocal_ranks) / len(reciprocal_ranks))
+
+# Calculate five-number summary
+values = reciprocal_ranks
+minimum = np.min(values)
+q1 = np.percentile(values, 25)  # First quartile
+median = np.median(values)
+q3 = np.percentile(values, 75)  # Third quartile
+maximum = np.max(values)
+
+# Print the five-number summary
+print("Five-Number Summary:")
+print(f"Min: {minimum}")
+print(f"Q1 : {q1}")
+print(f"Med: {median}")
+print(f"Q3 : {q3}")
+print(f"Max: {maximum}")
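For reference, the mean reciprocal rank printed above is just the average of 1/rank of the true title in each ranked list; a tiny worked example with made-up ranks (not actual results):

# Illustrative MRR calculation with invented ranks.
ranks = [1, 3, 2]                          # position of the true title in each ranked list
reciprocal_ranks = [1 / r for r in ranks]  # [1.0, 0.333..., 0.5]
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
print(mrr)  # ~0.611; closer to 1.0 means the true title usually ranks first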
z_finetune_gpt.py ADDED
@@ -0,0 +1,77 @@
+# This file is meant to be run once, hence sequential code rather than functions.
+import pandas as pd
+from transformers import AutoTokenizer, set_seed
+from transformers import DataCollatorForLanguageModeling
+from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
+from datasets import Dataset
+from z_utils import get_dataframe
+
+# CONST
+INP_DATASET_CSV = "clean_books_summary.csv"
+BASE_CASUAL_MODEL = "openai-community/gpt2"
+# TRAINED_MODEL_OUTPUT_DIR = "gpt2-book-summary-generator"  # same name for HF Hub
+TRAINED_MODEL_OUTPUT_DIR = "content"  # same name for HF Hub
+
+set_seed(42)
+EPOCHS = 1
+LR = 2e-5
+
+# Load dataset
+books: pd.DataFrame = get_dataframe(INP_DATASET_CSV)
+
+# Create HF dataset, easier to perform preprocessing at scale
+dataset_books = Dataset.from_pandas(books, split="train")
+
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(BASE_CASUAL_MODEL)
+
+# Data preprocessing
+def preprocess_function(book):
+    '''Converts a dataset record into prompt form.'''
+    # Multi-line f-string: keep the continuation lines unindented so no stray whitespace ends up in the prompt.
+    text = f'''Genre: {book['categories']}
+Book Title: {book['book_name']}
+Description: {book['book_name']} {book['summaries']}
+'''
+    return tokenizer(text)
+
+# Apply preprocessing
+tokenized_dataset_books = dataset_books.map(
+    preprocess_function,
+    # batched=True,
+    num_proc=4,
+    remove_columns=dataset_books.column_names,
+)
+
+# Data collator, required for causal LM
+tokenizer.pad_token = tokenizer.eos_token
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+
+# Load causal LM
+model = AutoModelForCausalLM.from_pretrained(BASE_CASUAL_MODEL)
+training_args = TrainingArguments(
+    output_dir=TRAINED_MODEL_OUTPUT_DIR,
+    eval_strategy="no",
+    learning_rate=LR,
+    weight_decay=0.01,
+    push_to_hub=True,
+    num_train_epochs=EPOCHS,
+    report_to="none"
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset_books,
+    # eval_dataset=lm_dataset["test"],
+    data_collator=data_collator,
+    tokenizer=tokenizer,
+)
+
+# Start training
+trainer.train()
+
+# Commit model files to HF
+trainer.push_to_hub()
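To make the training prompt concrete, this is roughly what preprocess_function renders for a single record before tokenization; the genre, title, and summary below are invented for illustration.

# Illustrative rendering of the training prompt for one made-up record.
book = {
    "categories": "fantasy",
    "book_name": "The Glass Citadel",
    "summaries": "A young cartographer maps a city that rearranges itself every night.",
}
text = f'''Genre: {book['categories']}
Book Title: {book['book_name']}
Description: {book['book_name']} {book['summaries']}
'''
print(text)
# Genre: fantasy
# Book Title: The Glass Citadel
# Description: The Glass Citadel A young cartographer maps a city that rearranges itself every night.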
z_hypothetical_summary.py ADDED
@@ -0,0 +1,46 @@
+# This code generates hypothetical book summaries based only on a title.
+# The idea is that they share a similar semantic meaning with the stored book
+# summaries, because the generator model is fine-tuned on those summaries.
+from typing import Optional
+from transformers import pipeline, set_seed
+
+# CONST
+set_seed(42)
+TRAINED_CASUAL_MODEL = "LunaticMaestro/gpt2-book-summary-generator"
+
+
+generator_model = pipeline('text-generation', model=TRAINED_CASUAL_MODEL)
+
+
+def generate_summaries(book_title: str, genre: Optional[str] = None, n_samples=2, top_k=50, top_p=0.85) -> list[str]:
+    '''Generate hypothetical summaries based on a book title.
+
+    Args:
+        book_title: a 2-3 word descriptive name of the book
+        genre: optional genre to prepend to the prompt
+        n_samples: (default=2) number of hypothetical summaries
+        top_k: (default=50)
+        top_p: (default=0.85)
+    Returns:
+        summaries: list of hypothetical summaries.
+    '''
+    global generator_model
+
+    # Basic prompt, very similar to the one used during fine-tuning
+    prompt = f'''Book Title: {book_title}
+Description: {book_title}'''
+
+    # Use genre if provided, in the same order as during training
+    if genre:
+        prompt = f"Genre: {genre}\n" + prompt
+
+    # Summary generation
+    output: list[dict] = generator_model(prompt, max_length=100, num_return_sequences=n_samples, top_k=top_k, top_p=top_p)
+
+    # Post-process the summaries to strip the prompt that was provided
+    summaries = list()
+    for summary in output:
+        summaries.append(
+            summary['generated_text'].removeprefix(prompt)  # str.lstrip() would strip a character set, not the prefix
+        )
+
+    return summaries
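A minimal usage sketch, assuming the fine-tuned model can be pulled from the Hub; the title is made up and the generated text will vary with sampling.

# Hypothetical usage of generate_summaries (output text will vary).
from z_hypothetical_summary import generate_summaries

fake_summaries = generate_summaries(book_title="The Glass Citadel", n_samples=2)
for i, summary in enumerate(fake_summaries, start=1):
    print(f"--- hypothetical summary {i} ---")
    print(summary)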
z_similarity.py ADDED
@@ -0,0 +1,30 @@
+from z_utils import load_cache_embeddings
+from z_embedding import model, get_embeddings
+import torch
+import numpy as np
+
+books_summaries_embs = load_cache_embeddings()
+
+
+def computes_similarity_w_hypothetical(hypothetical_summaries: list[str]) -> tuple[np.ndarray, np.ndarray]:
+    '''Computes cosine similarity between the stored book summaries and all hypothetical summaries.
+
+    Returns:
+        similarity: average cosine similarity between each stored book summary embedding and the hypothetical summaries.
+        ranks: indices of the stored book summaries ordered from most to least similar.
+    '''
+    global books_summaries_embs, model
+    hypothetical_summaries_embs = get_embeddings(hypothetical_summaries)
+    similarity: torch.Tensor = model.similarity(books_summaries_embs, hypothetical_summaries_embs)
+
+    # Average the similarity across all hypothetical embeddings
+    similarity = torch.mean(similarity, dim=1)
+
+    # Get the ordering (most similar first)
+    ranks = torch.argsort(similarity, descending=True)
+
+    return similarity.detach().numpy(), ranks.detach().numpy()
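A small sketch (not in the commit) of what the two return values look like, assuming the embedding cache exists; the summary strings are placeholders.

# Hypothetical check of the two return values.
from z_similarity import computes_similarity_w_hypothetical, books_summaries_embs

similarity, ranks = computes_similarity_w_hypothetical(
    hypothetical_summaries=["placeholder summary one", "placeholder summary two"]
)
print(books_summaries_embs.shape)  # (N_BOOKS, 384)
print(similarity.shape)            # (N_BOOKS,): one averaged score per stored summary
print(ranks[:3])                   # indices of the three most similar stored summaries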
z_utils.py ADDED
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+
+
+def get_dataframe(file_path: str) -> pd.DataFrame:
+    '''Reads a CSV file and returns a Pandas DataFrame.
+
+    Args:
+        file_path: The path to the CSV file.
+
+    Returns:
+        pd.DataFrame: The loaded DataFrame.
+    '''
+    try:
+        df = pd.read_csv(file_path)
+        # Minor tweak: strip the non-breaking space character (\xa0) from summaries
+        df['summaries'] = df['summaries'].str.replace('\xa0', '', regex=False)
+        return df
+    except FileNotFoundError:
+        print(f"Error: The file at {file_path} was not found.")
+    except pd.errors.EmptyDataError:
+        print(f"Error: The file at {file_path} is empty.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+
+
+def load_cache_embeddings(embedding_path: str = "app_cache/summary_vectors.npy") -> np.ndarray:
+    '''Returns embeddings of the book summaries.'''
+    emb = np.load(embedding_path)
+    emb = emb.astype(np.float32)
+    return emb