Deepak Sahu committed
Commit dc4b86a · 1 Parent(s): 481a63a

adding other files

app.py CHANGED
@@ -21,16 +21,18 @@
 # Reference: https://huggingface.co/learn/nlp-course/en/chapter9/2
 
 import gradio as gr
+from z_similarity import computes_similarity_w_hypothetical
+from z_hypothetical_summary import generate_summaries
 
-
-def greet(name):
-    return "Hello " + name
-
+def get_recommendation(book_title: str):
+    # Generate hypothetical summary
+    fake_summaries = generate_summaries(book_title=book_title, n_samples=5)  # other parameters are set to default in the function
+    return fake_summaries[0]
 
 # We instantiate the Textbox class
 textbox = gr.Textbox(label="Write truth you wana Know:", placeholder="John Doe", lines=2)
 
 
-demo = gr.Interface(fn=greet, inputs=textbox, outputs="text")
+demo = gr.Interface(fn=get_recommendation, inputs=textbox, outputs="text")
 
 demo.launch()
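A minimal sketch (not part of this commit) of how get_recommendation could return actual stored titles instead of the first generated summary, wired through the helpers added below; unique_titles_books_summary.csv and the book_name column are taken from those files, while the top-3 join is an illustrative choice.

# Hypothetical extension of app.py: rank stored books against the generated summaries.
import gradio as gr
from z_utils import get_dataframe
from z_similarity import computes_similarity_w_hypothetical
from z_hypothetical_summary import generate_summaries

books_df = get_dataframe("unique_titles_books_summary.csv")

def get_recommendation(book_title: str) -> str:
    fake_summaries = generate_summaries(book_title=book_title, n_samples=5)
    _, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)
    # ranks orders the stored summaries from most to least similar
    top_titles = books_df.iloc[ranks[:3]]["book_name"].tolist()
    return ", ".join(top_titles)

demo = gr.Interface(fn=get_recommendation, inputs="text", outputs="text")
demo.launch()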
books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
clean_books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
unique_titles_books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
z_clean_data.py ADDED
@@ -0,0 +1,42 @@
+from z_utils import get_dataframe
+
+# Const
+ORIGNAL_DF = "books_summary.csv"
+CLEAN_DF = "clean_" + ORIGNAL_DF
+CLEAN_DF_UNIQUE_TITLES = "unique_titles_" + ORIGNAL_DF
+
+# Load dataset
+books_df = get_dataframe(ORIGNAL_DF)
+
+# Original stats
+print(f"Original Shape: {books_df.shape}")
+
+# Keep only the required columns
+req_columns = ['book_name', 'summaries', 'categories']
+books_df = books_df[req_columns]  # another way could be .drop(...)
+
+# Check for nulls
+print(f"\n\nNulls Count=== \n{books_df.isna().sum()}")
+# Drop null rows: their remaining attributes don't contribute anything useful
+books_df.dropna(axis=0, inplace=True)
+
+
+# Check & remove duplicates
+print(f"\n\nDuplicate Records: {books_df.duplicated().sum()}")
+books_df.drop_duplicates(inplace=True)
+
+
+# Final stats
+print(f"\n\nCleaned Shape: {books_df.shape}")
+
+# Save the cleaned DF (this still includes the same title under different categories)
+print("Storing cleaned dataset (includes same titles with different categories): " + CLEAN_DF)
+books_df.to_csv(CLEAN_DF, index=False)
+
+# ==== Now store the unique titles ====
+books_df = books_df[["book_name", "summaries"]]
+books_df.drop_duplicates(inplace=True)
+print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")
+# Save the unique-titles DF
+print("Storing dataset w/ unique titles & summaries only: " + CLEAN_DF_UNIQUE_TITLES)
+books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)
z_embedding.py ADDED
@@ -0,0 +1,74 @@
+from sentence_transformers import SentenceTransformer
+import pandas as pd
+import numpy as np
+from z_utils import get_dataframe
+from tqdm import tqdm
+
+# CONST
+EMB_MODEL = "all-MiniLM-L6-v2"
+INP_DATASET_CSV = "unique_titles_books_summary.csv"
+CACHE_SUMMARY_EMB_NPY = "app_cache/summary_vectors.npy"
+
+# Load Model
+# Set at module level because the entire runtime keeps reusing this model.
+model = SentenceTransformer(EMB_MODEL)
+
+
+def dataframe_compute_summary_vector(books_df: pd.DataFrame) -> np.ndarray:
+    '''Takes book summaries and computes embedding vectors.
+
+    WARNING: the output follows the row order of the DataFrame; delta (incremental) updates are not supported.
+
+    Args:
+        books_df: The input DataFrame.
+
+    Returns:
+        np.ndarray: one embedding vector per row of `books_df`.
+    '''
+    global model
+
+    if 'summaries' not in books_df.columns:
+        raise ValueError("DataFrame must contain a 'summaries' column.")
+
+    # Progress bar
+    pbar = tqdm(total=books_df.shape[0])
+
+    def encode(text: str):
+        pbar.update(1)
+        try:
+            return model.encode(text)
+        except TypeError:  # Handle NaN values
+            return np.zeros(384)  # hard-coded embedding size of all-MiniLM-L6-v2
+
+    summary_vectors = books_df['summaries'].map(encode).to_numpy()
+
+    pbar.close()
+
+    # stack into an array of shape (N_ROWS, 384)
+    summary_vectors = np.stack(summary_vectors)
+
+    return summary_vectors
+
+
+def get_embeddings(summaries: list[str]) -> np.ndarray:
+    '''Utility function that takes hypothetical document(s) and returns their embedding(s).'''
+    global model
+    if isinstance(summaries, str):
+        summaries = [summaries, ]
+    return model.encode(summaries)
+
+
+def cache_create_embeddings(books_csv_path: str, output_path: str) -> None:
+    '''Reads the books CSV, generates vectors for the `summaries` column and stores them in `output_path`.'''
+    books_df = get_dataframe(books_csv_path)
+    vectors = dataframe_compute_summary_vector(books_df)
+    np.save(file=output_path, arr=vectors)
+    print(f"Vectors saved to {output_path}")
+
+
+if __name__ == "__main__":
+    print("Generating vectors of the summaries")
+    cache_create_embeddings(books_csv_path=INP_DATASET_CSV, output_path=CACHE_SUMMARY_EMB_NPY)
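A quick sanity check (not part of the commit), assuming the cache file has already been produced by running z_embedding.py; the query string is made up and the manual cosine computation is only illustrative.

# Hypothetical sanity check for the cached embeddings.
import numpy as np
from z_embedding import get_embeddings

vectors = np.load("app_cache/summary_vectors.npy")
print(vectors.shape)  # expected: (number_of_unique_titles, 384)

query_vec = get_embeddings("a murder mystery set in a small coastal town")
# cosine similarity of the query against every stored summary vector
norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec, axis=1)
scores = (vectors @ query_vec.T).squeeze() / norms
print(scores.argsort()[::-1][:5])  # indices of the 5 closest summaries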
z_evaluate.py ADDED
@@ -0,0 +1,65 @@
+import random
+from z_utils import get_dataframe
+from z_similarity import computes_similarity_w_hypothetical
+from z_hypothetical_summary import generate_summaries
+from tqdm import tqdm
+import numpy as np
+
+# CONST
+random.seed(53)
+CLEAN_DF_UNIQUE_TITLES = "unique_titles_books_summary.csv"
+N_SAMPLES_EVAL = 2
+TOP_K = 50
+TOP_P = 0.85
+
+books_df = get_dataframe(CLEAN_DF_UNIQUE_TITLES)
+
+# Sample row ids
+random_values: list = random.sample(range(0, books_df.shape[0]), N_SAMPLES_EVAL)
+
+reciprocal_ranks: list[float] = list()
+
+pbar = tqdm(total=N_SAMPLES_EVAL)
+
+for idx in random_values:
+    # Sample a book
+    book = books_df.iloc[idx]
+
+    # Generate hypothetical summaries
+    fake_summaries = generate_summaries(book_title=book["book_name"], n_samples=5, top_k=TOP_K, top_p=TOP_P)
+
+    # Compute similarity
+    similarity, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)
+
+    # Get reciprocal rank
+    df_ranked = books_df.iloc[ranks]
+    df_ranked = df_ranked.reset_index()
+    df_ranked.drop(columns=["index"], inplace=True)
+    rank = df_ranked[df_ranked["book_name"] == book["book_name"]].index.values[0] + 1  # index starts at 0, hence the +1 offset
+
+    # Update list
+    reciprocal_ranks.append(1 / rank)
+    pbar.update(1)
+
+pbar.close()
+
+print(f"USING Parameters: TOP_K={TOP_K} TOP_P={TOP_P}")
+print("MRR: ", sum(reciprocal_ranks) / len(reciprocal_ranks))
+
+# Calculate five-number summary
+values = reciprocal_ranks
+minimum = np.min(values)
+q1 = np.percentile(values, 25)  # First quartile
+median = np.median(values)
+q3 = np.percentile(values, 75)  # Third quartile
+maximum = np.max(values)
+
+# Print the five-number summary
+print("Five-Number Summary:")
+print(f"Min: {minimum}")
+print(f"Q1 : {q1}")
+print(f"Med: {median}")
+print(f"Q3 : {q3}")
+print(f"Max: {maximum}")
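For reference, the mean reciprocal rank printed above is just the average of 1/rank of the true title in each ranked list; a tiny worked example with made-up ranks (not actual results):

# Illustrative MRR calculation with invented ranks.
ranks = [1, 3, 2]                          # position of the true title in each ranked list
reciprocal_ranks = [1 / r for r in ranks]  # [1.0, 0.333..., 0.5]
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
print(mrr)  # ~0.611; closer to 1.0 means the true title usually ranks first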
z_finetune_gpt.py ADDED
@@ -0,0 +1,77 @@
+# This file is meant to be run once, hence sequential code rather than functions.
+import pandas as pd
+from transformers import AutoTokenizer, set_seed
+from transformers import DataCollatorForLanguageModeling
+from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
+from datasets import Dataset
+from z_utils import get_dataframe
+
+# CONST
+INP_DATASET_CSV = "clean_books_summary.csv"
+BASE_CASUAL_MODEL = "openai-community/gpt2"
+# TRAINED_MODEL_OUTPUT_DIR = "gpt2-book-summary-generator"  # same name for HF Hub
+TRAINED_MODEL_OUTPUT_DIR = "content"  # same name for HF Hub
+
+set_seed(42)
+EPOCHS = 1
+LR = 2e-5
+
+# Load dataset
+books: pd.DataFrame = get_dataframe(INP_DATASET_CSV)
+
+# Create HF dataset, easier to perform preprocessing at scale
+dataset_books = Dataset.from_pandas(books, split="train")
+
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(BASE_CASUAL_MODEL)
+
+# Data preprocessing
+def preprocess_function(book):
+    '''Converts a dataset record into prompt form.'''
+    # Multi-line f-string: keep the continuation lines unindented so no stray whitespace ends up in the prompt.
+    text = f'''Genre: {book['categories']}
+Book Title: {book['book_name']}
+Description: {book['book_name']} {book['summaries']}
+'''
+    return tokenizer(text)
+
+# Apply preprocessing
+tokenized_dataset_books = dataset_books.map(
+    preprocess_function,
+    # batched=True,
+    num_proc=4,
+    remove_columns=dataset_books.column_names,
+)
+
+# Data collator, required for causal LM
+tokenizer.pad_token = tokenizer.eos_token
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+
+# Load causal LM
+model = AutoModelForCausalLM.from_pretrained(BASE_CASUAL_MODEL)
+training_args = TrainingArguments(
+    output_dir=TRAINED_MODEL_OUTPUT_DIR,
+    eval_strategy="no",
+    learning_rate=LR,
+    weight_decay=0.01,
+    push_to_hub=True,
+    num_train_epochs=EPOCHS,
+    report_to="none"
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset_books,
+    # eval_dataset=lm_dataset["test"],
+    data_collator=data_collator,
+    tokenizer=tokenizer,
+)
+
+# Start training
+trainer.train()
+
+# Commit model files to HF
+trainer.push_to_hub()
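To make the training prompt concrete, this is roughly what preprocess_function renders for a single record before tokenization; the genre, title, and summary below are invented for illustration.

# Illustrative rendering of the training prompt for one made-up record.
book = {
    "categories": "fantasy",
    "book_name": "The Glass Citadel",
    "summaries": "A young cartographer maps a city that rearranges itself every night.",
}
text = f'''Genre: {book['categories']}
Book Title: {book['book_name']}
Description: {book['book_name']} {book['summaries']}
'''
print(text)
# Genre: fantasy
# Book Title: The Glass Citadel
# Description: The Glass Citadel A young cartographer maps a city that rearranges itself every night.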
z_hypothetical_summary.py ADDED
@@ -0,0 +1,46 @@
+# This code generates hypothetical book summaries based only on a title.
+# The idea is that they share a similar semantic meaning with the stored book
+# summaries, because the generator model is fine-tuned on those summaries.
+from typing import Optional
+from transformers import pipeline, set_seed
+
+# CONST
+set_seed(42)
+TRAINED_CASUAL_MODEL = "LunaticMaestro/gpt2-book-summary-generator"
+
+
+generator_model = pipeline('text-generation', model=TRAINED_CASUAL_MODEL)
+
+
+def generate_summaries(book_title: str, genre: Optional[str] = None, n_samples=2, top_k=50, top_p=0.85) -> list[str]:
+    '''Generate hypothetical summaries based on a book title.
+
+    Args:
+        book_title: a 2-3 word descriptive name of the book
+        genre: optional genre to prepend to the prompt
+        n_samples: (default=2) number of hypothetical summaries
+        top_k: (default=50)
+        top_p: (default=0.85)
+    Returns:
+        summaries: list of hypothetical summaries.
+    '''
+    global generator_model
+
+    # Basic prompt, very similar to the one used during fine-tuning
+    prompt = f'''Book Title: {book_title}
+Description: {book_title}'''
+
+    # Use genre if provided, in the same order as during training
+    if genre:
+        prompt = f"Genre: {genre}\n" + prompt
+
+    # Summary generation
+    output: list[dict] = generator_model(prompt, max_length=100, num_return_sequences=n_samples, top_k=top_k, top_p=top_p)
+
+    # Post-process the summaries to strip the prompt that was provided
+    summaries = list()
+    for summary in output:
+        summaries.append(
+            summary['generated_text'].removeprefix(prompt)  # str.lstrip() would strip a character set, not the prefix
+        )
+
+    return summaries
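A minimal usage sketch, assuming the fine-tuned model can be pulled from the Hub; the title is made up and the generated text will vary with sampling.

# Hypothetical usage of generate_summaries (output text will vary).
from z_hypothetical_summary import generate_summaries

fake_summaries = generate_summaries(book_title="The Glass Citadel", n_samples=2)
for i, summary in enumerate(fake_summaries, start=1):
    print(f"--- hypothetical summary {i} ---")
    print(summary)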
z_similarity.py ADDED
@@ -0,0 +1,30 @@
+from z_utils import load_cache_embeddings
+from z_embedding import model, get_embeddings
+import torch
+import numpy as np
+
+books_summaries_embs = load_cache_embeddings()
+
+
+def computes_similarity_w_hypothetical(hypothetical_summaries: list[str]) -> tuple[np.ndarray, np.ndarray]:
+    '''Computes cosine similarity between the stored book summaries and all hypothetical summaries.
+
+    Returns:
+        similarity: average cosine similarity between each stored book summary embedding and the hypothetical summaries.
+        ranks: indices of the stored book summaries ordered from most to least similar.
+    '''
+    global books_summaries_embs, model
+    hypothetical_summaries_embs = get_embeddings(hypothetical_summaries)
+    similarity: torch.Tensor = model.similarity(books_summaries_embs, hypothetical_summaries_embs)
+
+    # Average the similarity across all hypothetical embeddings
+    similarity = torch.mean(similarity, dim=1)
+
+    # Get the ordering (most similar first)
+    ranks = torch.argsort(similarity, descending=True)
+
+    return similarity.detach().numpy(), ranks.detach().numpy()
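A small sketch (not in the commit) of what the two return values look like, assuming the embedding cache exists; the summary strings are placeholders.

# Hypothetical check of the two return values.
from z_similarity import computes_similarity_w_hypothetical, books_summaries_embs

similarity, ranks = computes_similarity_w_hypothetical(
    hypothetical_summaries=["placeholder summary one", "placeholder summary two"]
)
print(books_summaries_embs.shape)  # (N_BOOKS, 384)
print(similarity.shape)            # (N_BOOKS,): one averaged score per stored summary
print(ranks[:3])                   # indices of the three most similar stored summaries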
z_utils.py ADDED
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+
+
+def get_dataframe(file_path: str) -> pd.DataFrame:
+    '''Reads a CSV file and returns a Pandas DataFrame.
+
+    Args:
+        file_path: The path to the CSV file.
+
+    Returns:
+        pd.DataFrame: The loaded DataFrame.
+    '''
+    try:
+        df = pd.read_csv(file_path)
+        # Minor tweak: strip the non-breaking space character (\xa0) from summaries
+        df['summaries'] = df['summaries'].str.replace('\xa0', '', regex=False)
+        return df
+    except FileNotFoundError:
+        print(f"Error: The file at {file_path} was not found.")
+    except pd.errors.EmptyDataError:
+        print(f"Error: The file at {file_path} is empty.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+
+
+def load_cache_embeddings(embedding_path: str = "app_cache/summary_vectors.npy") -> np.ndarray:
+    '''Returns embeddings of the book summaries.'''
+    emb = np.load(embedding_path)
+    emb = emb.astype(np.float32)
+    return emb