William9999 committed on
Commit 84b407f · verified · 1 Parent(s): 6ef3896

Delete Mini_Project_1_Part_1.py

Files changed (1)
  1. Mini_Project_1_Part_1.py +0 -402
Mini_Project_1_Part_1.py DELETED
@@ -1,402 +0,0 @@
- ### Import the necessary libraries. This demo uses the Streamlit library to run a text-search app, so make sure it is installed.
- # !pip install streamlit sentence-transformers gdown matplotlib
- # !pip install pyngrok
- import subprocess
-
- # Install the dependencies at runtime (equivalent to the pip commands above).
- subprocess.run([
-     "pip", "install",
-     "streamlit",
-     "sentence-transformers",
-     "gdown",
-     "matplotlib",
-     "pyngrok",
-     "tf-keras",  # add tf-keras to the dependency list
- ], check=True)
-
- import streamlit as st
- import numpy as np
- import numpy.linalg as la
- import pickle
- import os
- import gdown
- from sentence_transformers import SentenceTransformer
- import matplotlib.pyplot as plt
- import math
- from pyngrok import ngrok
-
- ### Some predefined utility functions for you to load the text embeddings
-
- # Function to load GloVe embeddings
- def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
-     with open(glove_path, "rb") as f:
-         embeddings_dict = pickle.load(f, encoding="latin1")
-
-     return embeddings_dict
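-
- # Note: encoding="latin1" lets pickle files created under Python 2 load cleanly
- # under Python 3; without it, pickle.load can raise a UnicodeDecodeError here.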
-
- def get_model_id_gdrive(model_type):
-     if model_type == "25d":
-         word_index_id = "13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8"
-         embeddings_id = "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2"
-     elif model_type == "50d":
-         embeddings_id = "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ"
-         word_index_id = "1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9"
-     elif model_type == "100d":
-         word_index_id = "1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq"
-         embeddings_id = "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp"
-
-     return word_index_id, embeddings_id
-
- def download_glove_embeddings_gdrive(model_type):
-     # Get GloVe embeddings from Google Drive
-     word_index_id, embeddings_id = get_model_id_gdrive(model_type)
-
-     # Use gdown to fetch the files from Google Drive
-     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
-     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
-
-     # Download the word_index pickle file
-     print("Downloading word index dictionary....\n")
-     gdown.download(id=word_index_id, output=word_index_temp, quiet=False)
-
-     # Download the embeddings numpy file
-     print("Downloading embeddings...\n\n")
-     gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
-
- # @st.cache_data()
- def load_glove_embeddings_gdrive(model_type):
-     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
-     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
-
-     # Load the word index dictionary
-     word_index_dict = pickle.load(open(word_index_temp, "rb"), encoding="latin")
-
-     # Load the embeddings numpy array
-     embeddings = np.load(embeddings_temp)
-
-     return word_index_dict, embeddings
-
- @st.cache_resource()
- def load_sentence_transformer_model(model_name):
-     sentenceTransformer = SentenceTransformer(model_name)
-     return sentenceTransformer
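-
- # st.cache_resource caches the loaded model across Streamlit reruns, so the
- # transformer weights are downloaded and initialized only once per session.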
-
- def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
-     """
-     Get sentence transformer embeddings for a sentence
-     """
-     # 384-dimensional embedding
-     # Default model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
-
-     sentenceTransformer = load_sentence_transformer_model(model_name)
-
-     try:
-         return sentenceTransformer.encode(sentence)
-     except Exception:
-         # Fall back to a zero vector of the model's embedding size
-         if model_name == "all-MiniLM-L6-v2":
-             return np.zeros(384)
-         else:
-             return np.zeros(512)
-
- def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
-     """
-     Get the GloVe embedding for a single word
-     """
-     if word.lower() in word_index_dict:
-         return embeddings[word_index_dict[word.lower()]]
-     else:
-         return np.zeros(int(model_type.split("d")[0]))
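-
- # Example: with model_type="50d", a vocabulary word returns its 50-dimensional
- # GloVe vector, while an out-of-vocabulary word returns np.zeros(50).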
-
- def get_category_embeddings(embeddings_metadata):
-     """
-     Get embeddings for each category
-     1. Split categories into words
-     2. Get embeddings for each word
-     """
-     model_name = embeddings_metadata["model_name"]
-     st.session_state["cat_embed_" + model_name] = {}
-     for category in st.session_state.categories.split(" "):
-         if model_name:
-             if category not in st.session_state["cat_embed_" + model_name]:
-                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category, model_name=model_name)
-         else:
-             if category not in st.session_state["cat_embed_" + model_name]:
-                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category)
-
- def update_category_embeddings(embeddings_metadata):
-     """
-     Update embeddings for each category
-     """
-     get_category_embeddings(embeddings_metadata)
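-
- # The per-category embeddings live in st.session_state under "cat_embed_<model_name>";
- # get_sorted_cosine_similarity below only calls this when that key is missing.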
-
- ### Plotting utility functions
-
- def plot_piechart(sorted_cosine_scores_items):
-     sorted_cosine_scores = np.array([
-         sorted_cosine_scores_items[index][1]
-         for index in range(len(sorted_cosine_scores_items))
-     ])
-     categories = st.session_state.categories.split(" ")
-     categories_sorted = [
-         categories[sorted_cosine_scores_items[index][0]]
-         for index in range(len(sorted_cosine_scores_items))
-     ]
-     fig, ax = plt.subplots()
-     ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct="%1.1f%%")
-     st.pyplot(fig)
-
- def plot_piechart_helper(sorted_cosine_scores_items):
-     sorted_cosine_scores = np.array([
-         sorted_cosine_scores_items[index][1]
-         for index in range(len(sorted_cosine_scores_items))
-     ])
-     categories = st.session_state.categories.split(" ")
-     categories_sorted = [
-         categories[sorted_cosine_scores_items[index][0]]
-         for index in range(len(sorted_cosine_scores_items))
-     ]
-     fig, ax = plt.subplots(figsize=(3, 3))
-     my_explode = np.zeros(len(categories_sorted))
-     my_explode[0] = 0.2  # pull the top-scoring slice out the furthest
-     if len(categories_sorted) == 3:
-         my_explode[1] = 0.1
-     elif len(categories_sorted) > 3:
-         my_explode[2] = 0.05
-     ax.pie(
-         sorted_cosine_scores,
-         labels=categories_sorted,
-         autopct="%1.1f%%",
-         explode=my_explode,
-     )
-
-     return fig
-
- def plot_piecharts(sorted_cosine_scores_models):
-     scores_list = []
-     categories = st.session_state.categories.split(" ")
-     for model in sorted_cosine_scores_models:
-         scores_list.append(sorted_cosine_scores_models[model])
-
-     if len(sorted_cosine_scores_models) == 2:
-         fig, (ax1, ax2) = plt.subplots(2)
-
-         categories_sorted = [
-             categories[scores_list[0][index][0]] for index in range(len(scores_list[0]))
-         ]
-         sorted_scores = np.array(
-             [scores_list[0][index][1] for index in range(len(scores_list[0]))]
-         )
-         ax1.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
-
-         categories_sorted = [
-             categories[scores_list[1][index][0]] for index in range(len(scores_list[1]))
-         ]
-         sorted_scores = np.array(
-             [scores_list[1][index][1] for index in range(len(scores_list[1]))]
-         )
-         ax2.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
-
-     st.pyplot(fig)
-
- def plot_alatirchart(sorted_cosine_scores_models):
-     models = list(sorted_cosine_scores_models.keys())
-     tabs = st.tabs(models)
-     figs = {}
-     for model in models:
-         figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
-
-     for index in range(len(tabs)):
-         with tabs[index]:
-             st.pyplot(figs[models[index]])
-
- ### Your Part To Complete: Follow the instructions in each function below to complete the similarity calculation between text embeddings
-
- # Task I: Compute Cosine Similarity
- def cosine_similarity(x, y):
-     """
-     Exponentiated cosine similarity
-     1. Compute cosine similarity
-     2. Exponentiate cosine similarity
-     3. Return exponentiated cosine similarity
-     (20 pts)
-     """
-     cosine_sim = np.dot(x, y) / (la.norm(x) * la.norm(y))
-     return np.exp(cosine_sim)
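-
- # Quick sanity check: identical vectors have cosine 1, so this returns e ≈ 2.718;
- # orthogonal vectors have cosine 0 and return e**0 = 1.
- # e.g. cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> ~2.718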
-
- # Task II: Average GloVe Embedding Calculation
- def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type="50d"):
-     """
-     Get averaged GloVe embeddings for a sentence
-     1. Split sentence into words
-     2. Get embeddings for each word
-     3. Add embeddings for each word
-     4. Divide by number of words
-     5. Return averaged embeddings
-     (30 pts)
-     """
-     words = sentence.split()
-     embedding = np.zeros(int(model_type.split("d")[0]))
-     for word in words:
-         embedding += get_glove_embeddings(word, word_index_dict, embeddings, model_type)
-     return embedding / len(words)
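-
- # Example: averaged_glove_embeddings_gdrive("roses are red", word_index_dict,
- # embeddings, "50d") returns the mean of the three 50-d word vectors (shape (50,)).
- # Callers pass non-empty text, so the division by len(words) is safe here.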
-
- # Task III: Sort the cosine similarity
- def get_sorted_cosine_similarity(embeddings_metadata):
-     """
-     Get sorted cosine similarity between input sentence and categories
-     Steps:
-     1. Get embeddings for input sentence
-     2. Get embeddings for categories (if not found, update category embeddings)
-     3. Compute cosine similarity between input sentence and categories
-     4. Sort cosine similarity
-     5. Return sorted cosine similarity
-     (50 pts)
-     """
-     categories = st.session_state.categories.split(" ")
-     cosine_sim = {}
-     if embeddings_metadata["embedding_model"] == "glove":
-         word_index_dict = embeddings_metadata["word_index_dict"]
-         embeddings = embeddings_metadata["embeddings"]
-         model_type = embeddings_metadata["model_type"]
-
-         input_embedding = averaged_glove_embeddings_gdrive(st.session_state.text_search,
-                                                            word_index_dict,
-                                                            embeddings, model_type)
-
-         for index, category in enumerate(categories):
-             category_embedding = averaged_glove_embeddings_gdrive(category, word_index_dict, embeddings, model_type)
-             cosine_sim[index] = cosine_similarity(input_embedding, category_embedding)
-
-     else:
-         model_name = embeddings_metadata["model_name"]
-         if "cat_embed_" + model_name not in st.session_state:
-             get_category_embeddings(embeddings_metadata)
-
-         category_embeddings = st.session_state["cat_embed_" + model_name]
-
-         input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
-         for index, category in enumerate(categories):
-             cosine_sim[index] = cosine_similarity(input_embedding, category_embeddings[category])
-
-     sorted_cosine_sim = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)
-     return sorted_cosine_sim
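-
- # Note: exp() is strictly increasing, so this ranking is identical to sorting the
- # raw cosine similarities; exponentiation also keeps every score positive, which
- # the pie charts require.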
-
- ### Below is the main function, creating the app demo for the text search engine using the text embeddings.
-
- if __name__ == "__main__":
-     # Initialize session state variables
-     if "categories" not in st.session_state:
-         st.session_state["categories"] = "Flowers Colors Cars Weather Food"
-
-     if "text_search" not in st.session_state:
-         st.session_state["text_search"] = "Roses are red, trucks are blue, and Seattle is grey right now"
-
-     st.sidebar.title("GloVe Twitter")
-     st.sidebar.markdown(
-         """
-         GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on
-         2 billion tweets with a vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).
-
-         Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
-         """
-     )
-
-     model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d", "100d"), index=1)
-
-     st.title("Search Based Retrieval Demo")
-     st.subheader(
-         "Pass in space-separated categories you want this search demo to be about."
-     )
-     st.text_input(
-         label="Categories", key="categories", value=st.session_state["categories"]
-     )
-
-     st.subheader("Pass in an input word or even a sentence")
-     st.text_input(
-         label="Input your sentence",
-         key="text_search",
-         value=st.session_state["text_search"],
-     )
-
-     embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
-     word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl"
-     if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path):
-         with st.spinner("Downloading GloVe embeddings..."):
-             download_glove_embeddings_gdrive(model_type)
-
-     word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type)
-
-     if st.session_state.text_search:
-         embeddings_metadata = {
-             "embedding_model": "glove",
-             "word_index_dict": word_index_dict,
-             "embeddings": embeddings,
-             "model_type": model_type,
-         }
-         with st.spinner("Obtaining cosine similarity for GloVe..."):
-             sorted_cosine_sim_glove = get_sorted_cosine_similarity(embeddings_metadata)
-
-         embeddings_metadata = {"embedding_model": "transformers", "model_name": "all-MiniLM-L6-v2"}
-         with st.spinner("Obtaining cosine similarity for the 384d sentence transformer..."):
-             sorted_cosine_sim_transformer = get_sorted_cosine_similarity(embeddings_metadata)
-
-         st.subheader(
-             "Closest match I have between: "
-             + st.session_state.categories
-             + " as per different embeddings"
-         )
-
-         plot_alatirchart(
-             {
-                 "glove_" + str(model_type): sorted_cosine_sim_glove,
-                 "sentence_transformer_384": sorted_cosine_sim_transformer,
-             }
-         )
-
-         st.write("")
-         st.write(
-             "Demo developed by [Your Name](https://www.linkedin.com/in/your_id/ - Optional)"
-         )
-
- ngrok.set_auth_token("2sEcAp5puu8NYKh4cjBKmlEPLkj_77HPkRNQNMx4dcTUGuLJS")
-
- # Create the app.py file
- # with open('app.py', 'w') as f:
- #     f.write("""YOUR_FULL_STREAMLIT_CODE_HERE""")
-
- # # Start ngrok
- # public_url = ngrok.connect(port=8501)
- # print(f"Streamlit App URL: {public_url}")
-
- # # Start Streamlit
- # !streamlit run app.py --server.port 8501
-
- # Open an ngrok tunnel with a custom configuration
- tunnel_config = {
-     "addr": 8501,     # local port
-     "proto": "http",  # use the HTTP protocol
- }
- public_url = ngrok.connect(**tunnel_config)
- print(f"Streamlit App URL: {public_url}")
-
- # Launch Streamlit on this script (note: the absolute path is machine-specific)
- subprocess.run(["streamlit", "run", "/Users/williamren/Downloads/Mini_Project_1_Part_1.py", "--server.port", "8501"])
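-
- # pyngrok's ngrok.connect() returns an NgrokTunnel object; printing it (or reading
- # its public_url attribute) gives the shareable address of the locally served app.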