The plot shows, but I am not certain it is showing the right data. The main point of confusion: to visualize 512-dimensional vector data, you have to reduce it to 2 dimensions for a scatter plot. The function given here does not work as written. First, np.concatenate does not accept lists of embeddings that still carry grad; it asks for the tensors to be detached and converted to NumPy. The second problem is MinMaxScaler, which raises a dimension error: it expects a 2D array but is given a 1D one. Not entirely clear on this yet.
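A minimal repro of the two failures described above, with their usual fixes. This is a sketch, not the repo's code; 512 matches the BridgeTower embedding size mentioned in the README.

# Minimal repro of the two failures from the commit message.
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

t = torch.randn(512, requires_grad=True)

# Failure 1: np.concatenate([t, t]) raises
#   "Can't call numpy() on Tensor that requires grad"
# -> detach from the autograd graph first:
v = t.detach().numpy()

# Failure 2: MinMaxScaler().fit_transform(v) raises
#   "Expected 2D array, got 1D array instead"
# -> hand it an (n_samples, n_features) matrix, not a flat vector:
X = np.stack([v, v])                        # shape (2, 512)
X_scaled = MinMaxScaler().fit_transform(X)  # scales each feature column
print(X_scaled.shape)                       # (2, 512), ready for UMAP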
Files changed:
- README.md +7 -3
- s5-how-to-umap.py +30 -70
README.md CHANGED
@@ -1,6 +1,6 @@
 # Journey into Learning/Disecting
 
-[**Interactive Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
+[**Interactive Video Chat Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
 
 ### A multimodal AI system should be able to understand both text and video content.
 
@@ -104,4 +104,8 @@ _ = MultimodalLanceDB.from_text_image_pairs(
 Model Selection: Using BridgeTowerForContrastiveLearning instead of PredictionGuard due to API access limitations
 Model Size: BridgeTower model requires ~3.5GB download
 Image Downloads: Some Flickr images may be unavailable; implement robust error handling
 Token Decoding: BridgeTower contrastive learning model works with embeddings, not token predictions
+
+#Future
+Stream model from Hugging Face, instead of download
+- Example: (https://medium.com/@arthur.lagacherie/two-easy-ways-to-stream-output-from-any-huggingface-model-4c70d6a0cf88)
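The "Future" note above is about streaming instead of downloading up front; the linked article covers streaming a model's output. On the data side, which s5-how-to-umap.py already hints at in a commented-out line, a minimal sketch with the datasets library might look like this (dataset name taken from the script):

# Sketch: streaming a Hugging Face dataset instead of downloading it.
# streaming=True yields an IterableDataset; examples arrive lazily.
from itertools import islice
from datasets import load_dataset

stream = load_dataset("yashikota/cat-image-dataset", split="train", streaming=True)
for example in islice(stream, 5):   # first 5 examples, nothing else fetched
    print(type(example))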
s5-how-to-umap.py CHANGED
@@ -22,24 +22,15 @@ templates = [
 ]
 # function helps to prepare list image-text pairs from the first [test_size] data
 def data_prep(hf_dataset_name, templates=templates, test_size=1000):
-    # load Huggingface dataset
-
-    dataset = load_dataset(hf_dataset_name, trust_remote_code=True)
-    #dataset = load_data_from_huggingface(hf_dataset_name)
-    def display_list(lst, indent=0):
-        for item in lst:
-            if isinstance(item, list):
-                display_list(item, indent + 2)
-            else:
-                print(' ' * indent + str(item))
-
-    # Example usage:
-    display_list(dataset)
-    # split dataset with specific test_size
-    train_test_dataset = train_test_split(dataset, test_size=test_size)
-
+    # load Huggingface dataset by streaming the dataset which doesn't download anything, and lets you use it instantly
+    #dataset = load_dataset(hf_dataset_name, trust_remote_code=True, split='train', streaming=True)
+
+    dataset = load_dataset(hf_dataset_name)
+    # split dataset with specific test_size
+    train_test_dataset = dataset['train'].train_test_split(test_size=test_size)
     test_dataset = train_test_dataset['test']
+    print(test_dataset)
+    # get the test dataset
     img_txt_pairs = []
     for i in range(len(test_dataset)):
         img_txt_pairs.append({
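The hunk above swaps the bare train_test_split(dataset, ...) call, which fails on a DatasetDict, for the datasets-native Dataset.train_test_split. A quick illustrative check (dataset name reused from load_all_dataset further down):

# Illustrative check of the datasets-native split used above.
from datasets import load_dataset

dataset = load_dataset("yashikota/cat-image-dataset")    # a DatasetDict
split = dataset["train"].train_test_split(test_size=50)  # new DatasetDict
print(split["test"].num_rows)                            # -> 50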
@@ -48,40 +39,12 @@ def data_prep(hf_dataset_name, templates=templates, test_size=1000):
     })
     return img_txt_pairs
 
-# load cat and car image-text pairs
-def load_pairs_from_dataset(dataset_name, file_name):
-
-    def load_dataset_locally(file_name):
-        with open(file_name, 'r') as f:
-            dataset = f.readlines()
-        return dataset
-
-    def save_dataset_locally(dataset_list, file_name):
-        with open(file_name, 'w') as f:
-            for item in dataset_list:
-                f.write("%s\n" % item)
-
-    def check_dataset_locally(file_name):
-        if (path.exists(file_name)):
-            return True
-        return False
-
-    if (check_dataset_locally(file_name)):
-        print('Dataset already exists')
-        img_txt_pairs = load_dataset_locally(file_name)
-    else:
-        print('Downloading dataset')
-
-        img_txt_pairs = data_prep(dataset_name, test_size=50)
-        save_dataset_locally(img_txt_pairs, file_name)
-    return img_txt_pairs
 
 
 def load_all_dataset():
 
-
-
+    car_img_txt_pairs = data_prep("tanganke/stanford_cars", test_size=50)
+    cat_img_txt_pairs = data_prep("yashikota/cat-image-dataset", test_size=50)
 
     return cat_img_txt_pairs, car_img_txt_pairs
 # compute BridgeTower embeddings for cat image-text pairs
@@ -102,36 +65,31 @@ def load_cat_and_car_embeddings():
         img_txt_pairs,
         total=len(img_txt_pairs)
     ):
-
-        embedding
-
-
-
+
+        embedding = load_embeddings(img_txt_pair)
+        print(embedding)
+        cross_modal_embeddings = embedding['cross_modal_embeddings'][0].detach().numpy() #this is not the right way to convert tensor to numpy
+        #print(cross_modal_embeddings.shape) #<class 'torch.Tensor'>
+        #save_embeddings(cross_modal_embeddings, file_name)
+        embeddings.append(cross_modal_embeddings)
+        return cross_modal_embeddings
 
 
-    cat_embeddings =
-    car_embeddings =
-    if (path.exists('./shared_data/cat_embeddings.pt')):
-        cat_embeddings = torch.load('./shared_data/cat_embeddings.pt')
-    else:
-        cat_embeddings = load_all_embeddings_from_image_text_pairs(cat_img_txt_pairs, './shared_data/cat_embeddings.pt')
-
-    if (path.exists('./shared_data/car_embeddings.pt')):
-        car_embeddings = torch.load('./shared_data/car_embeddings.pt')
-    else:
-        car_embeddings = load_all_embeddings_from_image_text_pairs(car_img_txt_pairs, './shared_data/car_embeddings.pt')
+    cat_embeddings = load_all_embeddings_from_image_text_pairs(cat_img_txt_pairs, './shared_data/cat_embeddings.pt')
+    car_embeddings = load_all_embeddings_from_image_text_pairs(car_img_txt_pairs, './shared_data/car_embeddings.pt')
 
     return cat_embeddings, car_embeddings
 
 
 # function transforms high-dimension vectors to 2D vectors using UMAP
-def dimensionality_reduction(
-
-
+def dimensionality_reduction(embeddings, labels):
+
+    print(embeddings)
+    X_scaled = MinMaxScaler().fit_transform(embeddings.reshape(-1, 1)) # This is not the right way to scale the data
     mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
     df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
-    df_emb["label"] =
+    df_emb["label"] = labels
     print(df_emb)
     return df_emb
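Both "not the right way" comments in this hunk flag real issues, and there is a third: the return sits inside the loop, so only the first pair ever gets embedded. A hedged corrected sketch of the two functions; load_embeddings is the script's own helper, stubbed here so the sketch runs standalone, and the .pt save/load caching is omitted. It is assumed to return a dict whose 'cross_modal_embeddings' entry is a (1, 512) torch tensor, as the hunk suggests.

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP

def load_embeddings(img_txt_pair):  # stub for the repo helper
    return {'cross_modal_embeddings': torch.randn(1, 512, requires_grad=True)}

def load_all_embeddings_from_image_text_pairs(img_txt_pairs):
    embeddings = []
    for img_txt_pair in img_txt_pairs:
        embedding = load_embeddings(img_txt_pair)
        # detach() drops the autograd graph so .numpy() is legal
        vec = embedding['cross_modal_embeddings'][0].detach().cpu().numpy()
        embeddings.append(vec)
    return np.stack(embeddings)      # (n_pairs, 512), returned after the loop

def dimensionality_reduction(embeddings, labels):
    # Scale the whole (n_samples, 512) matrix; reshape(-1, 1) would
    # collapse it to one column and scale each scalar in isolation.
    X_scaled = MinMaxScaler().fit_transform(embeddings)
    mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
    df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
    df_emb["label"] = labels
    return df_emb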
@@ -139,7 +97,7 @@ def show_umap_visualization():
 def reduce_dimensions():
     cat_embeddings, car_embeddings = load_cat_and_car_embeddings()
     # stacking embeddings of cat and car examples into one numpy array
-    all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
+    all_embeddings = np.concatenate([cat_embeddings, car_embeddings]) # This is not the right way to scale the data
 
     # prepare labels for the 3 examples
     labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
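np.concatenate itself is not the problem the trailing comment suggests: once both inputs are plain (n, 512) NumPy arrays it stacks them fine along axis 0; the earlier failure came from tensors that still required grad. A tiny check with dummy data:

import numpy as np

cat_embeddings = np.random.rand(50, 512)
car_embeddings = np.random.rand(50, 512)
all_embeddings = np.concatenate([cat_embeddings, car_embeddings])
labels = ['cat'] * len(cat_embeddings) + ['car'] * len(car_embeddings)
print(all_embeddings.shape)  # (100, 512)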
@@ -164,7 +122,7 @@ def show_umap_visualization():
     plt.ylabel('Y')
     plt.show()
 
-def
+def an_example_of_cat_and_car_pair_data():
     cat_img_txt_pairs, car_img_txt_pairs = load_all_dataset()
     # display an example of a cat image-text pair data
     display(cat_img_txt_pairs[0]['caption'])
@@ -174,4 +132,6 @@ def run():
     display(car_img_txt_pairs[0]['caption'])
     display(car_img_txt_pairs[0]['pil_img'])
 
-
+
+if __name__ == '__main__':
+    show_umap_visualization()