lu-ny committed on
Commit b7fa6f0 · 1 Parent(s): f007a0a

Update app.py


made an embeddings class, switched to RoBERTa, and fixed the semantic comparison function

Files changed (1)
  1. app.py +84 -56
app.py CHANGED
@@ -15,72 +15,100 @@ writing_tones = ["Formal","Informal","Humorous","Serious","Sarcastic","Satirical
 
 # initialize client
 # we could try something larger, I need to check the models
+# using zephyr for now because it's pretty quick
 client = InferenceClient(
-    "v1olet/v1olet_marcoroni-go-bruins-merge-7B"
+    'HuggingFaceH4/zephyr-7b-beta'
+    # "v1olet/v1olet_marcoroni-go-bruins-merge-7B"
+
 )
-
-# Load pre-trained tokenizer and model (replace with your desired model if you want, but it needs to be small)
-model_id = "sentence-transformers/all-MiniLM-L6-v2"  # small embeddings model
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModel.from_pretrained(model_id)  # can't use load_in_4bit=True since HF free tier runs on CPU
-
-# Function to calculate cosine similarity between two embeddings
-def calculate_cosine_similarity(embedding1, embedding2):
-    return cosine_similarity(embedding1, embedding2)[0][0]
-
-# Function to convert text items into embeddings
-def get_embeddings(text_items):
-    embeddings = []
-    for item in text_items:
-        inputs = tokenizer(item, return_tensors="pt", padding=True, truncation=True)
-        with torch.no_grad():
-            outputs = model(**inputs)
-        pooled_output = outputs['pooler_output']
-        embeddings.append(pooled_output)
-    return embeddings
-
-# Helper function to select values with low enough cosine similarity and concatenate them into a string
-def select_values_with_low_similarity(embeddings, original_values, num_values_to_select, max_similarity):
-    selected_values = []
-    selected_indices = set()
-
-    while len(selected_values) < num_values_to_select:
-        index1, index2 = random.sample(range(len(embeddings)), 2)
-        embedding1, embedding2 = embeddings[index1], embeddings[index2]
-
-        if index1 != index2 and calculate_cosine_similarity(embedding1, embedding2) < max_similarity:
-            if index1 not in selected_indices:
-                selected_values.append(original_values[index1])
-                selected_indices.add(index1)
-            if index2 not in selected_indices:
-                selected_values.append(original_values[index2])
-                selected_indices.add(index2)
-
-    # Concatenate the selected values into a single string
-    selected_string = ', '.join(selected_values)
-    return selected_string
-
-
-# Convert text items into embeddings
-genre_embeddings = get_embeddings(book_genres)
-theme_embeddings = get_embeddings(book_themes)
-tone_embeddings = get_embeddings(writing_tones)
-# clear memory
-del model
-#torch.cuda.empty_cache()
+######################################
+########## Embeddings Class ##########
+######################################
+class EmbeddingGenerator:
+    def __init__(self, model_id):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.model = AutoModel.from_pretrained(model_id)
+
+    def calculate_cosine_similarity(self, embedding1, embedding2):
+        return cosine_similarity(embedding1, embedding2)[0][0]
+
+    def get_embeddings(self, text_items):
+        embeddings = []
+        for item in text_items:
+            inputs = self.tokenizer(item, return_tensors="pt", padding=True, truncation=True)
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+            pooled_output = outputs['pooler_output']
+            embeddings.append((pooled_output, item))  # Store the embedding along with the original string
+        return embeddings
+
+    def select_values_with_medium_similarity(self, embeddings, num_values_to_select, min_similarity, max_similarity):
+        selected_values = []
+        selected_indices = set()
+
+        # Randomly select an initial embedding
+        initial_index = random.randint(0, len(embeddings) - 1)
+        initial_embedding, initial_item = embeddings[initial_index]
+        selected_values.append(initial_item)
+        selected_indices.add(initial_index)
+
+        while len(selected_values) < num_values_to_select:
+            # Keep only the embeddings whose similarity to the initial one falls within the desired range
+            candidate_indices = [
+                i for i, (embedding, _) in enumerate(embeddings)
+                if i not in selected_indices and min_similarity < self.calculate_cosine_similarity(embedding, initial_embedding) < max_similarity
+            ]
+
+            if candidate_indices:
+                # Randomly select an embedding from the filtered candidates
+                index_to_select = random.choice(candidate_indices)
+                selected_embedding, selected_item = embeddings[index_to_select]
+                selected_values.append(selected_item)
+                selected_indices.add(index_to_select)
+            else:
+                break
+
+        # Concatenate the selected values into a single string
+        selected_string = ', '.join(selected_values)
+        return selected_string
+
+# testing different embedding models that can fit in Colab;
+# we need something smallish that still produces semantic embeddings good enough for cosine similarity to work well
+#model_id = "sentence-transformers/all-MiniLM-L6-v2"
+#model_id = "BAAI/bge-small-en-v1.5"
+# not sure this will work on CPU; it may be too slow or too big
+model_id = 'roberta-base'
+
+# instantiate our class
+embedding_generator = EmbeddingGenerator(model_id)
+
+# generate embeddings
+genre_embeddings = embedding_generator.get_embeddings(book_genres)
+theme_embeddings = embedding_generator.get_embeddings(book_themes)
+tone_embeddings = embedding_generator.get_embeddings(writing_tones)
+
+# Clear memory
+del embedding_generator
+# torch.cuda.empty_cache()
 
 # helper function to format the prompt appropriately.
 # For this creative writing tool, the user doesn't design the prompt itself
 # but rather genres, tones, & themes of a book to include
-def format_prompt(message, genres, tones, themes):
-    # pick random ones if user leaves it blank but make sure they aren't opposites
+def format_prompt(genres, tones, themes):
+    # reinstantiate our embeddings class so we can compare the embeddings
+    embedding_generator = EmbeddingGenerator("roberta-base")
+    # pick 3-5 random ones if the user leaves the field blank;
+    # the lower threshold avoids selecting synonyms while the upper threshold avoids antonyms
     if not genres:
-        selected_genres = select_values_with_low_similarity(genre_embeddings, book_genres, random.randint(3, 5), 0.2)  # Adjust threshold as needed
+        genres = embedding_generator.select_values_with_medium_similarity(genre_embeddings, random.randint(3, 5), 0.01, 0.7)  # Adjust thresholds as needed
     if not tones:
-        selected_tones = select_values_with_low_similarity(tone_embeddings, writing_tones, random.randint(3, 5), 0.2)  # Adjust threshold as needed
+        tones = embedding_generator.select_values_with_medium_similarity(tone_embeddings, random.randint(3, 5), 0.01, 0.7)  # Adjust thresholds as needed
     if not themes:
-        selected_themes = select_values_with_low_similarity(theme_embeddings, book_themes, random.randint(3, 5), 0.2)  # Adjust threshold as needed
+        themes = embedding_generator.select_values_with_medium_similarity(theme_embeddings, random.randint(3, 5), 0.01, 0.7)  # Adjust thresholds as needed
 
+    # we won't need our embeddings generator after this step
+    del embedding_generator
+
     #Alpaca format since we can't use mixtral on free CPU settings
     prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
     #prompt we are using for now
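
Reviewer note: here is a minimal standalone sketch (not part of the commit) of what `get_embeddings` plus `calculate_cosine_similarity` compute, for sanity-checking the new pooling path outside the Space. It assumes the dependencies app.py already imports (`torch`, `transformers`, scikit-learn); the four tone words are illustrative stand-ins for the real `writing_tones` list.

```python
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

model_id = "roberta-base"  # the model this commit switches to
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
model.eval()  # inference only

def embed(text):
    # Mirrors EmbeddingGenerator.get_embeddings for a single string:
    # tokenize, forward pass without gradients, keep the pooler output.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.numpy()  # shape (1, hidden_size)

# Illustrative stand-ins; the real lists live earlier in app.py
tones = ["Formal", "Informal", "Humorous", "Serious"]
vectors = {tone: embed(tone) for tone in tones}
for i, a in enumerate(tones):
    for b in tones[i + 1:]:
        print(f"{a} vs {b}: {cosine_similarity(vectors[a], vectors[b])[0][0]:.3f}")
```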
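And a toy check of the selection band itself, on hand-made 2-D vectors so the 0.01/0.7 thresholds can be verified by hand. `select_medium` is a hypothetical rename that mirrors the loop in `select_values_with_medium_similarity`, except the seed is fixed to the first item instead of chosen at random so the expected output is deterministic.

```python
import random

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def select_medium(embeddings, num_to_select, min_sim, max_sim):
    # Same filtering loop as select_values_with_medium_similarity, but the
    # seed is embeddings[0] so the similarity comments below stay accurate.
    seed_vec, seed_label = embeddings[0]
    selected, used = [seed_label], {0}
    while len(selected) < num_to_select:
        candidates = [
            i for i, (vec, _) in enumerate(embeddings)
            if i not in used and min_sim < cosine_similarity(vec, seed_vec)[0][0] < max_sim
        ]
        if not candidates:
            break  # nothing left inside the similarity band
        pick = random.choice(candidates)
        selected.append(embeddings[pick][1])
        used.add(pick)
    return ", ".join(selected)

items = [
    (np.array([[1.0, 0.0]]), "seed"),
    (np.array([[1.0, 0.5]]), "near-synonym"),  # sim to seed ~0.89: above 0.7, skipped
    (np.array([[0.5, 1.0]]), "related"),       # sim to seed ~0.45: inside the band, eligible
    (np.array([[-1.0, 0.0]]), "antonym"),      # sim to seed -1.0: below 0.01, skipped
]
print(select_medium(items, 3, 0.01, 0.7))  # -> "seed, related"
```

A trade-off worth flagging in review: `format_prompt` now re-instantiates `EmbeddingGenerator` (reloading roberta-base) on every call and deletes it right after, which keeps steady-state memory low on the free CPU tier at the cost of per-request latency.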