The plot shows, but I am not certain it is showing the right data. The main point of confusion: to visualize 512-dimensional vector data, you have to reduce it to 2 dimensions for a scatter plot. The function given here does not work as written. First, np.concatenate does not accept lists of embeddings that still carry grad; it asks for the tensors to be detached and converted to NumPy. The second problem is MinMaxScaler, which raises a dimension error: it expects a 2D array but is given a 1D one. Not entirely clear on this yet.
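A minimal repro of the two failures described above, with their usual fixes. This is a sketch, not the repo's code; 512 matches the BridgeTower embedding size mentioned in the README.

# Minimal repro of the two failures from the commit message.
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

t = torch.randn(512, requires_grad=True)

# Failure 1: np.concatenate([t, t]) raises
#   "Can't call numpy() on Tensor that requires grad"
# -> detach from the autograd graph first:
v = t.detach().numpy()

# Failure 2: MinMaxScaler().fit_transform(v) raises
#   "Expected 2D array, got 1D array instead"
# -> hand it an (n_samples, n_features) matrix, not a flat vector:
X = np.stack([v, v])                        # shape (2, 512)
X_scaled = MinMaxScaler().fit_transform(X)  # scales each feature column
print(X_scaled.shape)                       # (2, 512), ready for UMAP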
Files changed:
- README.md +7 -3
- s5-how-to-umap.py +30 -70
README.md CHANGED
@@ -1,6 +1,6 @@
 # Journey into Learning/Disecting
 
-[**Interactive Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
+[**Interactive Video Chat Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
 
 ### A multimodal AI system should be able to understand both text and video content.
 
@@ -104,4 +104,8 @@ _ = MultimodalLanceDB.from_text_image_pairs(
 Model Selection: Using BridgeTowerForContrastiveLearning instead of PredictionGuard due to API access limitations
 Model Size: BridgeTower model requires ~3.5GB download
 Image Downloads: Some Flickr images may be unavailable; implement robust error handling
 Token Decoding: BridgeTower contrastive learning model works with embeddings, not token predictions
+
+#Future
+Stream model from Hugging Face, instead of download
+- Example: (https://medium.com/@arthur.lagacherie/two-easy-ways-to-stream-output-from-any-huggingface-model-4c70d6a0cf88)
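The "Future" note above is about streaming instead of downloading up front; the linked article covers streaming a model's output. On the data side, which s5-how-to-umap.py already hints at in a commented-out line, a minimal sketch with the datasets library might look like this (dataset name taken from the script):

# Sketch: streaming a Hugging Face dataset instead of downloading it.
# streaming=True yields an IterableDataset; examples arrive lazily.
from itertools import islice
from datasets import load_dataset

stream = load_dataset("yashikota/cat-image-dataset", split="train", streaming=True)
for example in islice(stream, 5):   # first 5 examples, nothing else fetched
    print(type(example))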
s5-how-to-umap.py CHANGED
@@ -22,24 +22,15 @@ templates = [
 ]
 # function helps to prepare list image-text pairs from the first [test_size] data
 def data_prep(hf_dataset_name, templates=templates, test_size=1000):
-    # load Huggingface dataset
-
-    dataset = load_dataset(hf_dataset_name, trust_remote_code=True)
-    #dataset = load_data_from_huggingface(hf_dataset_name)
-    def display_list(lst, indent=0):
-        for item in lst:
-            if isinstance(item, list):
-                display_list(item, indent + 2)
-            else:
-                print(' ' * indent + str(item))
-
-    # Example usage:
-    display_list(dataset)
-    # split dataset with specific test_size
-    train_test_dataset = train_test_split(dataset, test_size=test_size)
-
+    # load Huggingface dataset by streaming the dataset which doesn't download anything, and lets you use it instantly
+    #dataset = load_dataset(hf_dataset_name, trust_remote_code=True, split='train', streaming=True)
+
+    dataset = load_dataset(hf_dataset_name)
+    # split dataset with specific test_size
+    train_test_dataset = dataset['train'].train_test_split(test_size=test_size)
     test_dataset = train_test_dataset['test']
+    print(test_dataset)
+    # get the test dataset
     img_txt_pairs = []
     for i in range(len(test_dataset)):
         img_txt_pairs.append({
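The hunk above swaps the bare train_test_split(dataset, ...) call, which fails on a DatasetDict, for the datasets-native Dataset.train_test_split. A quick illustrative check (dataset name reused from load_all_dataset further down):

# Illustrative check of the datasets-native split used above.
from datasets import load_dataset

dataset = load_dataset("yashikota/cat-image-dataset")    # a DatasetDict
split = dataset["train"].train_test_split(test_size=50)  # new DatasetDict
print(split["test"].num_rows)                            # -> 50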
@@ -48,40 +39,12 @@ def data_prep(hf_dataset_name, templates=templates, test_size=1000):
     })
     return img_txt_pairs
 
-# load cat and car image-text pairs
-def load_pairs_from_dataset(dataset_name, file_name):
-
-    def load_dataset_locally(file_name):
-        with open(file_name, 'r') as f:
-            dataset = f.readlines()
-        return dataset
-
-    def save_dataset_locally(dataset_list, file_name):
-        with open(file_name, 'w') as f:
-            for item in dataset_list:
-                f.write("%s\n" % item)
-
-    def check_dataset_locally(file_name):
-        if (path.exists(file_name)):
-            return True
-        return False
-
-    if (check_dataset_locally(file_name)):
-        print('Dataset already exists')
-        img_txt_pairs = load_dataset_locally(file_name)
-    else:
-        print('Downloading dataset')
-
-        img_txt_pairs = data_prep(dataset_name, test_size=50)
-        save_dataset_locally(img_txt_pairs, file_name)
-    return img_txt_pairs
 
 
 def load_all_dataset():
 
-
-
+    car_img_txt_pairs = data_prep("tanganke/stanford_cars", test_size=50)
+    cat_img_txt_pairs = data_prep("yashikota/cat-image-dataset", test_size=50)
 
     return cat_img_txt_pairs, car_img_txt_pairs
 # compute BridgeTower embeddings for cat image-text pairs
@@ -102,36 +65,31 @@ def load_cat_and_car_embeddings():
         img_txt_pairs,
         total=len(img_txt_pairs)
     ):
-
-        embedding
-
-
-
+
+        embedding = load_embeddings(img_txt_pair)
+        print(embedding)
+        cross_modal_embeddings = embedding['cross_modal_embeddings'][0].detach().numpy() #this is not the right way to convert tensor to numpy
+        #print(cross_modal_embeddings.shape) #<class 'torch.Tensor'>
+        #save_embeddings(cross_modal_embeddings, file_name)
+        embeddings.append(cross_modal_embeddings)
+        return cross_modal_embeddings
 
 
-    cat_embeddings =
-    car_embeddings =
-    if (path.exists('./shared_data/cat_embeddings.pt')):
-        cat_embeddings = torch.load('./shared_data/cat_embeddings.pt')
-    else:
-        cat_embeddings = load_all_embeddings_from_image_text_pairs(cat_img_txt_pairs, './shared_data/cat_embeddings.pt')
-
-    if (path.exists('./shared_data/car_embeddings.pt')):
-        car_embeddings = torch.load('./shared_data/car_embeddings.pt')
-    else:
-        car_embeddings = load_all_embeddings_from_image_text_pairs(car_img_txt_pairs, './shared_data/car_embeddings.pt')
+    cat_embeddings = load_all_embeddings_from_image_text_pairs(cat_img_txt_pairs, './shared_data/cat_embeddings.pt')
+    car_embeddings = load_all_embeddings_from_image_text_pairs(car_img_txt_pairs, './shared_data/car_embeddings.pt')
 
     return cat_embeddings, car_embeddings
 
 
 # function transforms high-dimension vectors to 2D vectors using UMAP
-def dimensionality_reduction(
-
-
+def dimensionality_reduction(embeddings, labels):
+
+    print(embeddings)
+    X_scaled = MinMaxScaler().fit_transform(embeddings.reshape(-1, 1)) # This is not the right way to scale the data
     mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
     df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
-    df_emb["label"] =
+    df_emb["label"] = labels
     print(df_emb)
     return df_emb
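Both "not the right way" comments in this hunk flag real issues, and there is a third: the return sits inside the loop, so only the first pair ever gets embedded. A hedged corrected sketch of the two functions; load_embeddings is the script's own helper, stubbed here so the sketch runs standalone, and the .pt save/load caching is omitted. It is assumed to return a dict whose 'cross_modal_embeddings' entry is a (1, 512) torch tensor, as the hunk suggests.

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP

def load_embeddings(img_txt_pair):  # stub for the repo helper
    return {'cross_modal_embeddings': torch.randn(1, 512, requires_grad=True)}

def load_all_embeddings_from_image_text_pairs(img_txt_pairs):
    embeddings = []
    for img_txt_pair in img_txt_pairs:
        embedding = load_embeddings(img_txt_pair)
        # detach() drops the autograd graph so .numpy() is legal
        vec = embedding['cross_modal_embeddings'][0].detach().cpu().numpy()
        embeddings.append(vec)
    return np.stack(embeddings)      # (n_pairs, 512), returned after the loop

def dimensionality_reduction(embeddings, labels):
    # Scale the whole (n_samples, 512) matrix; reshape(-1, 1) would
    # collapse it to one column and scale each scalar in isolation.
    X_scaled = MinMaxScaler().fit_transform(embeddings)
    mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
    df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
    df_emb["label"] = labels
    return df_emb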
@@ -139,7 +97,7 @@ def show_umap_visualization():
 def reduce_dimensions():
     cat_embeddings, car_embeddings = load_cat_and_car_embeddings()
     # stacking embeddings of cat and car examples into one numpy array
-    all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
+    all_embeddings = np.concatenate([cat_embeddings, car_embeddings]) # This is not the right way to scale the data
 
     # prepare labels for the 3 examples
     labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
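np.concatenate itself is not the problem the trailing comment suggests: once both inputs are plain (n, 512) NumPy arrays it stacks them fine along axis 0; the earlier failure came from tensors that still required grad. A tiny check with dummy data:

import numpy as np

cat_embeddings = np.random.rand(50, 512)
car_embeddings = np.random.rand(50, 512)
all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
print(all_embeddings.shape)  # (100, 512)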
@@ -164,7 +122,7 @@ def show_umap_visualization():
     plt.ylabel('Y')
     plt.show()
 
-def
+def an_example_of_cat_and_car_pair_data():
     cat_img_txt_pairs, car_img_txt_pairs = load_all_dataset()
     # display an example of a cat image-text pair data
     display(cat_img_txt_pairs[0]['caption'])
@@ -174,4 +132,6 @@ def run():
     display(car_img_txt_pairs[0]['caption'])
     display(car_img_txt_pairs[0]['pil_img'])
 
-
+
+if __name__ == '__main__':
+    show_umap_visualization()