William9999 committed on
Commit 84b407f · verified · 1 Parent(s): 6ef3896

Delete Mini_Project_1_Part_1.py

Files changed (1)
  1. Mini_Project_1_Part_1.py +0 -402
Mini_Project_1_Part_1.py DELETED
@@ -1,402 +0,0 @@
- ### Import the necessary libraries. This demo uses the Streamlit library to run a text-search app, so make sure it is installed.
- # !pip install streamlit sentence-transformers gdown matplotlib
- # !pip install pyngrok
- import subprocess
-
- # Install the dependencies at runtime (equivalent to the pip commands above).
- subprocess.run([
-     "pip", "install",
-     "streamlit",
-     "sentence-transformers",
-     "gdown",
-     "matplotlib",
-     "pyngrok",
-     "tf-keras",  # add tf-keras to the dependency list
- ], check=True)
-
- import streamlit as st
- import numpy as np
- import numpy.linalg as la
- import pickle
- import os
- import gdown
- from sentence_transformers import SentenceTransformer
- import matplotlib.pyplot as plt
- import math
- from pyngrok import ngrok
-
- ### Some predefined utility functions for you to load the text embeddings
-
- # Function to load GloVe embeddings
- def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
-     with open(glove_path, "rb") as f:
-         embeddings_dict = pickle.load(f, encoding="latin1")
-
-     return embeddings_dict
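-
- # Note: encoding="latin1" lets pickle files created under Python 2 load cleanly
- # under Python 3; without it, pickle.load can raise a UnicodeDecodeError here.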
-
- def get_model_id_gdrive(model_type):
-     if model_type == "25d":
-         word_index_id = "13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8"
-         embeddings_id = "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2"
-     elif model_type == "50d":
-         embeddings_id = "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ"
-         word_index_id = "1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9"
-     elif model_type == "100d":
-         word_index_id = "1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq"
-         embeddings_id = "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp"
-
-     return word_index_id, embeddings_id
-
- def download_glove_embeddings_gdrive(model_type):
-     # Get GloVe embeddings from Google Drive
-     word_index_id, embeddings_id = get_model_id_gdrive(model_type)
-
-     # Use gdown to fetch the files from Google Drive
-     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
-     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
-
-     # Download the word_index pickle file
-     print("Downloading word index dictionary....\n")
-     gdown.download(id=word_index_id, output=word_index_temp, quiet=False)
-
-     # Download the embeddings numpy file
-     print("Downloading embeddings...\n\n")
-     gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
-
- # @st.cache_data()
- def load_glove_embeddings_gdrive(model_type):
-     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
-     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
-
-     # Load the word index dictionary
-     word_index_dict = pickle.load(open(word_index_temp, "rb"), encoding="latin")
-
-     # Load the embeddings numpy array
-     embeddings = np.load(embeddings_temp)
-
-     return word_index_dict, embeddings
-
- @st.cache_resource()
- def load_sentence_transformer_model(model_name):
-     sentenceTransformer = SentenceTransformer(model_name)
-     return sentenceTransformer
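-
- # st.cache_resource caches the loaded model across Streamlit reruns, so the
- # transformer weights are downloaded and initialized only once per session.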
-
- def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
-     """
-     Get sentence transformer embeddings for a sentence
-     """
-     # 384-dimensional embedding
-     # Default model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
-
-     sentenceTransformer = load_sentence_transformer_model(model_name)
-
-     try:
-         return sentenceTransformer.encode(sentence)
-     except Exception:
-         # Fall back to a zero vector of the model's embedding size
-         if model_name == "all-MiniLM-L6-v2":
-             return np.zeros(384)
-         else:
-             return np.zeros(512)
-
- def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
-     """
-     Get the GloVe embedding for a single word
-     """
-     if word.lower() in word_index_dict:
-         return embeddings[word_index_dict[word.lower()]]
-     else:
-         return np.zeros(int(model_type.split("d")[0]))
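-
- # Example: with model_type="50d", a vocabulary word returns its 50-dimensional
- # GloVe vector, while an out-of-vocabulary word returns np.zeros(50).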
-
- def get_category_embeddings(embeddings_metadata):
-     """
-     Get embeddings for each category
-     1. Split categories into words
-     2. Get embeddings for each word
-     """
-     model_name = embeddings_metadata["model_name"]
-     st.session_state["cat_embed_" + model_name] = {}
-     for category in st.session_state.categories.split(" "):
-         if model_name:
-             if category not in st.session_state["cat_embed_" + model_name]:
-                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category, model_name=model_name)
-         else:
-             if category not in st.session_state["cat_embed_" + model_name]:
-                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category)
-
- def update_category_embeddings(embeddings_metadata):
-     """
-     Update embeddings for each category
-     """
-     get_category_embeddings(embeddings_metadata)
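-
- # The per-category embeddings live in st.session_state under "cat_embed_<model_name>";
- # get_sorted_cosine_similarity below only calls this when that key is missing.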
-
- ### Plotting utility functions
-
- def plot_piechart(sorted_cosine_scores_items):
-     sorted_cosine_scores = np.array([
-         sorted_cosine_scores_items[index][1]
-         for index in range(len(sorted_cosine_scores_items))
-     ])
-     categories = st.session_state.categories.split(" ")
-     categories_sorted = [
-         categories[sorted_cosine_scores_items[index][0]]
-         for index in range(len(sorted_cosine_scores_items))
-     ]
-     fig, ax = plt.subplots()
-     ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct="%1.1f%%")
-     st.pyplot(fig)
-
- def plot_piechart_helper(sorted_cosine_scores_items):
-     sorted_cosine_scores = np.array([
-         sorted_cosine_scores_items[index][1]
-         for index in range(len(sorted_cosine_scores_items))
-     ])
-     categories = st.session_state.categories.split(" ")
-     categories_sorted = [
-         categories[sorted_cosine_scores_items[index][0]]
-         for index in range(len(sorted_cosine_scores_items))
-     ]
-     fig, ax = plt.subplots(figsize=(3, 3))
-     my_explode = np.zeros(len(categories_sorted))
-     my_explode[0] = 0.2  # pull the top-scoring slice out the furthest
-     if len(categories_sorted) == 3:
-         my_explode[1] = 0.1
-     elif len(categories_sorted) > 3:
-         my_explode[2] = 0.05
-     ax.pie(
-         sorted_cosine_scores,
-         labels=categories_sorted,
-         autopct="%1.1f%%",
-         explode=my_explode,
-     )
-
-     return fig
-
- def plot_piecharts(sorted_cosine_scores_models):
-     scores_list = []
-     categories = st.session_state.categories.split(" ")
-     for model in sorted_cosine_scores_models:
-         scores_list.append(sorted_cosine_scores_models[model])
-
-     if len(sorted_cosine_scores_models) == 2:
-         fig, (ax1, ax2) = plt.subplots(2)
-
-         categories_sorted = [
-             categories[scores_list[0][index][0]] for index in range(len(scores_list[0]))
-         ]
-         sorted_scores = np.array(
-             [scores_list[0][index][1] for index in range(len(scores_list[0]))]
-         )
-         ax1.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
-
-         categories_sorted = [
-             categories[scores_list[1][index][0]] for index in range(len(scores_list[1]))
-         ]
-         sorted_scores = np.array(
-             [scores_list[1][index][1] for index in range(len(scores_list[1]))]
-         )
-         ax2.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
-
-     st.pyplot(fig)
-
- def plot_alatirchart(sorted_cosine_scores_models):
-     models = list(sorted_cosine_scores_models.keys())
-     tabs = st.tabs(models)
-     figs = {}
-     for model in models:
-         figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
-
-     for index in range(len(tabs)):
-         with tabs[index]:
-             st.pyplot(figs[models[index]])
-
- ### Your Part To Complete: Follow the instructions in each function below to complete the similarity calculation between text embeddings
-
- # Task I: Compute Cosine Similarity
- def cosine_similarity(x, y):
-     """
-     Exponentiated cosine similarity
-     1. Compute cosine similarity
-     2. Exponentiate cosine similarity
-     3. Return exponentiated cosine similarity
-     (20 pts)
-     """
-     cosine_sim = np.dot(x, y) / (la.norm(x) * la.norm(y))
-     return np.exp(cosine_sim)
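-
- # Quick sanity check: identical vectors have cosine 1, so this returns e ≈ 2.718;
- # orthogonal vectors have cosine 0 and return e**0 = 1.
- # e.g. cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> ~2.718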
-
- # Task II: Average GloVe Embedding Calculation
- def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type="50d"):
-     """
-     Get averaged GloVe embeddings for a sentence
-     1. Split sentence into words
-     2. Get embeddings for each word
-     3. Add embeddings for each word
-     4. Divide by number of words
-     5. Return averaged embeddings
-     (30 pts)
-     """
-     words = sentence.split()
-     embedding = np.zeros(int(model_type.split("d")[0]))
-     for word in words:
-         embedding += get_glove_embeddings(word, word_index_dict, embeddings, model_type)
-     return embedding / len(words)
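-
- # Example: averaged_glove_embeddings_gdrive("roses are red", word_index_dict,
- # embeddings, "50d") returns the mean of the three 50-d word vectors (shape (50,)).
- # Callers pass non-empty text, so the division by len(words) is safe here.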
-
- # Task III: Sort the cosine similarity
- def get_sorted_cosine_similarity(embeddings_metadata):
-     """
-     Get sorted cosine similarity between input sentence and categories
-     Steps:
-     1. Get embeddings for input sentence
-     2. Get embeddings for categories (if not found, update category embeddings)
-     3. Compute cosine similarity between input sentence and categories
-     4. Sort cosine similarity
-     5. Return sorted cosine similarity
-     (50 pts)
-     """
-     categories = st.session_state.categories.split(" ")
-     cosine_sim = {}
-     if embeddings_metadata["embedding_model"] == "glove":
-         word_index_dict = embeddings_metadata["word_index_dict"]
-         embeddings = embeddings_metadata["embeddings"]
-         model_type = embeddings_metadata["model_type"]
-
-         input_embedding = averaged_glove_embeddings_gdrive(st.session_state.text_search,
-                                                            word_index_dict,
-                                                            embeddings, model_type)
-
-         for index, category in enumerate(categories):
-             category_embedding = averaged_glove_embeddings_gdrive(category, word_index_dict, embeddings, model_type)
-             cosine_sim[index] = cosine_similarity(input_embedding, category_embedding)
-
-     else:
-         model_name = embeddings_metadata["model_name"]
-         if "cat_embed_" + model_name not in st.session_state:
-             get_category_embeddings(embeddings_metadata)
-
-         category_embeddings = st.session_state["cat_embed_" + model_name]
-
-         input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
-         for index, category in enumerate(categories):
-             cosine_sim[index] = cosine_similarity(input_embedding, category_embeddings[category])
-
-     sorted_cosine_sim = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)
-     return sorted_cosine_sim
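-
- # Note: exp() is strictly increasing, so this ranking is identical to sorting the
- # raw cosine similarities; exponentiation also keeps every score positive, which
- # the pie charts require.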
-
- ### Below is the main function, creating the app demo for the text search engine using the text embeddings.
-
- if __name__ == "__main__":
-     # Initialize session state variables
-     if "categories" not in st.session_state:
-         st.session_state["categories"] = "Flowers Colors Cars Weather Food"
-
-     if "text_search" not in st.session_state:
-         st.session_state["text_search"] = "Roses are red, trucks are blue, and Seattle is grey right now"
-
-     st.sidebar.title("GloVe Twitter")
-     st.sidebar.markdown(
-         """
-         GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on
-         2 billion tweets with a vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).
-
-         Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
-         """
-     )
-
-     model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d", "100d"), index=1)
-
-     st.title("Search Based Retrieval Demo")
-     st.subheader(
-         "Pass in space-separated categories you want this search demo to be about."
-     )
-     st.text_input(
-         label="Categories", key="categories", value=st.session_state["categories"]
-     )
-
-     st.subheader("Pass in an input word or even a sentence")
-     st.text_input(
-         label="Input your sentence",
-         key="text_search",
-         value=st.session_state["text_search"],
-     )
-
-     embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
-     word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl"
-     if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path):
-         with st.spinner("Downloading GloVe embeddings..."):
-             download_glove_embeddings_gdrive(model_type)
-
-     word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type)
-
-     if st.session_state.text_search:
-         embeddings_metadata = {
-             "embedding_model": "glove",
-             "word_index_dict": word_index_dict,
-             "embeddings": embeddings,
-             "model_type": model_type,
-         }
-         with st.spinner("Obtaining cosine similarity for GloVe..."):
-             sorted_cosine_sim_glove = get_sorted_cosine_similarity(embeddings_metadata)
-
-         embeddings_metadata = {"embedding_model": "transformers", "model_name": "all-MiniLM-L6-v2"}
-         with st.spinner("Obtaining cosine similarity for the 384d sentence transformer..."):
-             sorted_cosine_sim_transformer = get_sorted_cosine_similarity(embeddings_metadata)
-
-         st.subheader(
-             "Closest match I have between: "
-             + st.session_state.categories
-             + " as per different embeddings"
-         )
-
-         plot_alatirchart(
-             {
-                 "glove_" + str(model_type): sorted_cosine_sim_glove,
-                 "sentence_transformer_384": sorted_cosine_sim_transformer,
-             }
-         )
-
-         st.write("")
-         st.write(
-             "Demo developed by [Your Name](https://www.linkedin.com/in/your_id/ - Optional)"
-         )
-
- ngrok.set_auth_token("2sEcAp5puu8NYKh4cjBKmlEPLkj_77HPkRNQNMx4dcTUGuLJS")
-
- # Create the app.py file
- # with open('app.py', 'w') as f:
- #     f.write("""YOUR_FULL_STREAMLIT_CODE_HERE""")
-
- # # Start ngrok
- # public_url = ngrok.connect(port=8501)
- # print(f"Streamlit App URL: {public_url}")
-
- # # Start Streamlit
- # !streamlit run app.py --server.port 8501
-
- # Open an ngrok tunnel with a custom configuration
- tunnel_config = {
-     "addr": 8501,     # local port
-     "proto": "http",  # use the HTTP protocol
- }
- public_url = ngrok.connect(**tunnel_config)
- print(f"Streamlit App URL: {public_url}")
-
- # Launch Streamlit on this script (note: the absolute path is machine-specific)
- subprocess.run(["streamlit", "run", "/Users/williamren/Downloads/Mini_Project_1_Part_1.py", "--server.port", "8501"])
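-
- # pyngrok's ngrok.connect() returns an NgrokTunnel object; printing it (or reading
- # its public_url attribute) gives the shareable address of the locally served app.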