88hours committed
Commit 7c6d6c4 · 1 Parent(s): d1fd97e

The plot shows, but I am not certain it is showing the right data. The main confusion: to visualize 512-dimensional vector data, you have to reduce it to 2 dimensions for a scatter plot. The function given here does not work. First, np.concatenate does not like lists of embeddings that still carry grad; it wants me to detach them to NumPy first. The second problem is MinMaxScaler, which complains about dimensions: it expects a 2-D array but gets a 1-D one. Not very clear on this yet.
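A minimal sketch of the two fixes described above, assuming each entry in the list is a 1-D, 512-dimensional torch tensor that still requires grad (the names here are placeholders, not the script's actual variables):

import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

# stand-in for the list the script builds; each entry is a 512-dim tensor with grad
embeddings = [torch.randn(512, requires_grad=True) for _ in range(4)]

# 1) detach each tensor from the autograd graph before converting to NumPy
vectors = [emb.detach().numpy() for emb in embeddings]

# 2) stack into an (n_samples, 512) matrix so MinMaxScaler gets the 2-D input it expects
X = np.stack(vectors)
X_scaled = MinMaxScaler().fit_transform(X)
print(X_scaled.shape)  # (4, 512)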

Files changed (2)
  1. README.md +7 -3
  2. s5-how-to-umap.py +30 -70
README.md CHANGED
@@ -1,6 +1,6 @@
- # Journey into Learning/Dissecting - 4:00 PM
+ # Journey into Learning/Dissecting
  
- [**Interactive Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
+ [**Interactive Video Chat Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
  
  ### A multimodal AI system should be able to understand both text and video content.
  
@@ -104,4 +104,8 @@ _ = MultimodalLanceDB.from_text_image_pairs(
  Model Selection: Using BridgeTowerForContrastiveLearning instead of PredictionGuard due to API access limitations
  Model Size: BridgeTower model requires ~3.5GB download
  Image Downloads: Some Flickr images may be unavailable; implement robust error handling
- Token Decoding: BridgeTower contrastive learning model works with embeddings, not token predictions
+ Token Decoding: BridgeTower contrastive learning model works with embeddings, not token predictions
+ 
+ # Future
+ Stream the model from Hugging Face instead of downloading it
+ - Example: https://medium.com/@arthur.lagacherie/two-easy-ways-to-stream-output-from-any-huggingface-model-4c70d6a0cf88
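On the # Future item: the linked article is about streaming a model's output token by token, which transformers supports through TextStreamer. A minimal sketch, using a small causal LM purely as an illustration (gpt2 is an assumption here, not the BridgeTower model this repo uses):

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model, not part of this repo
model = AutoModelForCausalLM.from_pretrained("gpt2")
streamer = TextStreamer(tokenizer)

inputs = tokenizer("A cat next to a car", return_tensors="pt")
# tokens are printed to stdout as they are generated, instead of after the full sequence
model.generate(**inputs, max_new_tokens=40, streamer=streamer)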
s5-how-to-umap.py CHANGED
@@ -22,24 +22,15 @@ templates = [
  ]
  # function helps to prepare list image-text pairs from the first [test_size] data
  def data_prep(hf_dataset_name, templates=templates, test_size=1000):
-     # load Huggingface dataset (download if needed)
- 
-     dataset = load_dataset(hf_dataset_name, trust_remote_code=True)
-     #dataset = load_data_from_huggingface(hf_dataset_name)
-     def display_list(lst, indent=0):
-         for item in lst:
-             if isinstance(item, list):
-                 display_list(item, indent + 2)
-             else:
-                 print(' ' * indent + str(item))
- 
-     # Example usage:
-     display_list(dataset)
-     # split dataset with specific test_size
-     train_test_dataset = train_test_split(dataset, test_size=test_size)
+     # load Huggingface dataset by streaming the dataset, which doesn't download anything and lets you use it instantly
+     #dataset = load_dataset(hf_dataset_name, trust_remote_code=True, split='train', streaming=True)
  
-     # get the test dataset
+     dataset = load_dataset(hf_dataset_name)
+     # split dataset with specific test_size
+     train_test_dataset = dataset['train'].train_test_split(test_size=test_size)
      test_dataset = train_test_dataset['test']
+     print(test_dataset)
+     # get the test dataset
      img_txt_pairs = []
      for i in range(len(test_dataset)):
          img_txt_pairs.append({
@@ -48,40 +39,12 @@ def data_prep(hf_dataset_name, templates=templates, test_size=1000):
          })
      return img_txt_pairs
  
- # load cat and car image-text pairs
- def load_pairs_from_dataset(dataset_name, file_name):
- 
-     def load_dataset_locally(file_name):
-         with open(file_name, 'r') as f:
-             dataset = f.readlines()
-         return dataset
- 
-     def save_dataset_locally(dataset_list, file_name):
-         with open(file_name, 'w') as f:
-             for item in dataset_list:
-                 f.write("%s\n" % item)
- 
- 
-     def check_dataset_locally(file_name):
-         if (path.exists(file_name)):
-             return True
-         return False
- 
-     if (check_dataset_locally(file_name)):
-         print('Dataset already exists')
-         img_txt_pairs = load_dataset_locally(file_name)
-     else:
-         print('Downloading dataset')
- 
-         img_txt_pairs = data_prep(dataset_name, test_size=50)
-         save_dataset_locally(img_txt_pairs, file_name)
-     return img_txt_pairs
  
  
  def load_all_dataset():
  
-     cat_img_txt_pairs = load_pairs_from_dataset("yashikota/cat-image-dataset", './shared_data/cat_img_txt_pairs.txt')
-     car_img_txt_pairs = load_pairs_from_dataset("tanganke/stanford_cars", './shared_data/car_img_txt_pairs.txt')
+     car_img_txt_pairs = data_prep("tanganke/stanford_cars", test_size=50)
+     cat_img_txt_pairs = data_prep("yashikota/cat-image-dataset", test_size=50)
  
      return cat_img_txt_pairs, car_img_txt_pairs
  # compute BridgeTower embeddings for cat image-text pairs
@@ -102,36 +65,31 @@ def load_cat_and_car_embeddings():
          img_txt_pairs,
          total=len(img_txt_pairs)
      ):
-         pil_img = img_txt_pair['pil_img']
-         caption = img_txt_pair['caption']
-         embedding = load_embeddings(caption, pil_img)
-         embeddings.append(embedding)
-         save_embeddings(cat_embeddings, file_name)
-     return embeddings
+ 
+         embedding = load_embeddings(img_txt_pair)
+         print(embedding)
+         cross_modal_embeddings = embedding['cross_modal_embeddings'][0].detach().numpy() #this is not the right way to convert tensor to numpy
+         #print(cross_modal_embeddings.shape) #<class 'torch.Tensor'>
+         #save_embeddings(cross_modal_embeddings, file_name)
+         embeddings.append(cross_modal_embeddings)
+     return cross_modal_embeddings
  
  
-     cat_embeddings = []
-     car_embeddings = []
-     if (path.exists('./shared_data/cat_embeddings.pt')):
-         cat_embeddings = torch.load('./shared_data/cat_embeddings.pt')
-     else:
-         cat_embeddings = load_all_embeddings_from_image_text_pairs(cat_img_txt_pairs, './shared_data/cat_embeddings.pt')
- 
-     if (path.exists('./shared_data/car_embeddings.pt')):
-         car_embeddings = torch.load('./shared_data/car_embeddings.pt')
-     else:
-         car_embeddings = load_all_embeddings_from_image_text_pairs(car_img_txt_pairs, './shared_data/car_embeddings.pt')
+     cat_embeddings = load_all_embeddings_from_image_text_pairs(cat_img_txt_pairs, './shared_data/cat_embeddings.pt')
+     car_embeddings = load_all_embeddings_from_image_text_pairs(car_img_txt_pairs, './shared_data/car_embeddings.pt')
  
      return cat_embeddings, car_embeddings
  
  
  # function transforms high-dimension vectors to 2D vectors using UMAP
- def dimensionality_reduction(embed_arr, label):
-     X_scaled = MinMaxScaler().fit_transform(embed_arr)
-     print(X_scaled)
+ def dimensionality_reduction(embeddings, labels):
+ 
+ 
+     print(embeddings)
+     X_scaled = MinMaxScaler().fit_transform(embeddings.reshape(-1, 1)) # This is not the right way to scale the data
      mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
      df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
-     df_emb["label"] = label
+     df_emb["label"] = labels
      print(df_emb)
      return df_emb
  
@@ -139,7 +97,7 @@ def show_umap_visualization():
      def reduce_dimensions():
          cat_embeddings, car_embeddings = load_cat_and_car_embeddings()
          # stacking embeddings of cat and car examples into one numpy array
-         all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
+         all_embeddings = np.concatenate([cat_embeddings, car_embeddings]) # This is not the right way to scale the data
  
          # prepare labels for the 3 examples
          labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
@@ -164,7 +122,7 @@ def show_umap_visualization():
      plt.ylabel('Y')
      plt.show()
  
- def run():
+ def an_example_of_cat_and_car_pair_data():
      cat_img_txt_pairs, car_img_txt_pairs = load_all_dataset()
      # display an example of a cat image-text pair data
      display(cat_img_txt_pairs[0]['caption'])
@@ -174,4 +132,6 @@ def run():
      display(car_img_txt_pairs[0]['caption'])
      display(car_img_txt_pairs[0]['pil_img'])
  
- run()
+ 
+ if __name__ == '__main__':
+     show_umap_visualization()
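The two "# This is not the right way …" notes above point at the same shape problem: reshape(-1, 1) turns every 512-dimensional vector into a column of single scalars, so the scaler and UMAP see one feature instead of 512; likewise, np.concatenate on a list of 1-D vectors yields one long 1-D array, where np.stack would build the (n_samples, 512) matrix. A sketch of what was presumably intended, assuming the embedding loader returns lists of equal-length 1-D NumPy vectors:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP

def dimensionality_reduction(embeddings, labels):
    # stack the list of (512,) vectors into one (n_samples, 512) matrix
    X = np.stack(embeddings)
    # scale each of the 512 feature columns to [0, 1]; the input is already 2-D
    X_scaled = MinMaxScaler().fit_transform(X)
    # project down to 2 dimensions for the scatter plot
    mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
    df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
    df_emb["label"] = labels
    return df_emb

# usage with dummy vectors standing in for the real cat/car embeddings
cat = [np.random.rand(512) for _ in range(30)]
car = [np.random.rand(512) for _ in range(30)]
df = dimensionality_reduction(cat + car, ['cat'] * 30 + ['car'] * 30)
print(df.head())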