Spaces:

William9999
/

llm_uw25winter

Sleeping

App Files Files Community

William9999 committed on Jan 30

Commit

583a375

verified ·

1 Parent(s): e6f51a4

Upload 2 files

Browse files

Files changed (2) hide show

app.py +366 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,366 @@

+### Import the necessary libraries. This demo uses the Streamlit library to run a text search app, so make sure it is installed.
+# !pip install streamlit sentence-transformers gdown matplotlib
+# !pip install pyngrok
+import subprocess
+import sys
+# Install the runtime dependencies with the pip that belongs to the running
+# interpreter: a bare "pip" found on PATH may target a different Python
+# environment than the one executing this script.
+subprocess.run(
+    [
+        sys.executable, "-m", "pip", "install",
+        "streamlit",
+        "sentence-transformers",
+        "gdown",
+        "matplotlib",
+        "tf-keras",  # tf-keras added to the dependency list
+    ],
+    check=True,  # abort start-up if the installation fails
+)
+import streamlit as st
+import numpy as np
+import numpy.linalg as la
+import pickle
+import os
+import gdown
+from sentence_transformers import SentenceTransformer
+import matplotlib.pyplot as plt
+import math
+import os
+import subprocess
+### Some predefined utility functions for you to load the text embeddings
+# Function to Load Glove Embeddings
+def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
+    """Read the pickled embeddings object stored at ``glove_path`` and return it.
+    The pickle is decoded with ``encoding="latin1"``. Only point this at
+    trusted files: unpickling can execute arbitrary code.
+    """
+    with open(glove_path, "rb") as pickle_file:
+        return pickle.load(pickle_file, encoding="latin1")
+def get_model_id_gdrive(model_type):
+    """Return the Google Drive file ids for one GloVe model size.
+    Args:
+        model_type: One of "25d", "50d" or "100d".
+    Returns:
+        A ``(word_index_id, embeddings_id)`` tuple of Drive file ids.
+    Raises:
+        ValueError: If ``model_type`` is not a supported size. An unknown
+            value used to fall through every branch and fail with an
+            UnboundLocalError at the return statement.
+    """
+    # model_type -> (word_index_id, embeddings_id)
+    drive_ids = {
+        "25d": (
+            "13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8",
+            "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2",
+        ),
+        "50d": (
+            "1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9",
+            "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ",
+        ),
+        "100d": (
+            "1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq",
+            "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp",
+        ),
+    }
+    try:
+        return drive_ids[model_type]
+    except KeyError:
+        raise ValueError(
+            "Unsupported GloVe model type %r; expected one of %s"
+            % (model_type, ", ".join(drive_ids))
+        ) from None
+def download_glove_embeddings_gdrive(model_type):
+    """Fetch the word-index pickle and the embeddings array for ``model_type`` from Google Drive.
+    The two files are saved as ``word_index_dict_<model_type>_temp.pkl`` and
+    ``embeddings_<model_type>_temp.npy`` (relative paths), using gdown.
+    """
+    word_index_id, embeddings_id = get_model_id_gdrive(model_type)
+    suffix = str(model_type) + "_temp"
+    embeddings_temp = "embeddings_" + suffix + ".npy"
+    word_index_temp = "word_index_dict_" + suffix + ".pkl"
+    # (drive file id, local filename, progress message), fetched in this order
+    downloads = (
+        (word_index_id, word_index_temp, "Downloading word index dictionary....\n"),
+        (embeddings_id, embeddings_temp, "Donwloading embedings...\n\n"),
+    )
+    for file_id, target, message in downloads:
+        print(message)
+        gdown.download(id=file_id, output=target, quiet=False)
+# @st.cache_data()
+def load_glove_embeddings_gdrive(model_type):
+    """Load the previously downloaded GloVe files for ``model_type``.
+    Args:
+        model_type: Model size label used in the file names, e.g. "50d".
+    Returns:
+        A ``(word_index_dict, embeddings)`` tuple: the unpickled contents of
+        ``word_index_dict_<model_type>_temp.pkl`` and the array loaded from
+        ``embeddings_<model_type>_temp.npy``.
+    Raises:
+        FileNotFoundError: If either file is missing.
+    """
+    word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
+    embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
+    # Use a context manager so the pickle file handle is closed deterministically
+    # instead of being left open until garbage collection.
+    # NOTE: pickle can execute arbitrary code; these files come from a remote
+    # download, so only use sources you trust.
+    with open(word_index_temp, "rb") as word_index_file:
+        word_index_dict = pickle.load(word_index_file, encoding="latin")
+    # Load embeddings numpy
+    embeddings = np.load(embeddings_temp)
+    return word_index_dict, embeddings
+@st.cache_resource()
+def load_sentence_transformer_model(model_name):
+    """Construct the SentenceTransformer for ``model_name`` (wrapped in ``st.cache_resource``)."""
+    return SentenceTransformer(model_name)
+def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
+    """
+    Get sentence transformer embeddings for a sentence
+    Args:
+        sentence: Text to encode.
+        model_name: Sentence-transformers model name. The default model
+            (https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
+            produces 384 dimensional embeddings.
+    Returns:
+        The result of the model's ``encode`` call. If encoding raises, a zero
+        vector is returned instead: length 384 for the default model and
+        length 512 for any other model name.
+    """
+    sentenceTransformer = load_sentence_transformer_model(model_name)
+    try:
+        return sentenceTransformer.encode(sentence)
+    except Exception:
+        # Best-effort fallback kept from the original behaviour. Catch
+        # Exception rather than using a bare except so KeyboardInterrupt and
+        # SystemExit still propagate.
+        fallback_dim = 384 if model_name == "all-MiniLM-L6-v2" else 512
+        return np.zeros(fallback_dim)
+def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
+    """
+    Look up the GloVe vector of one word (case-insensitive).
+    Words missing from ``word_index_dict`` map to a zero vector whose length
+    is the number in front of the "d" in ``model_type`` (e.g. "50d" -> 50).
+    """
+    key = word.lower()
+    if key not in word_index_dict:
+        dimension = int(model_type.split("d")[0])
+        return np.zeros(dimension)
+    row = word_index_dict[key]
+    return embeddings[row]
+def get_category_embeddings(embeddings_metadata):
+    """
+    Rebuild the per-category sentence-transformer embeddings for one model.
+    The categories are the space-separated words in
+    ``st.session_state.categories``. The result is stored in session state
+    under ``"cat_embed_" + model_name`` as a dict of category -> embedding,
+    replacing whatever was stored there before.
+    """
+    model_name = embeddings_metadata["model_name"]
+    cache_key = "cat_embed_" + model_name
+    st.session_state[cache_key] = {}
+    for category in st.session_state.categories.split(" "):
+        # A repeated category is encoded only once.
+        if category in st.session_state[cache_key]:
+            continue
+        if model_name:
+            vector = get_sentence_transformer_embeddings(category, model_name=model_name)
+        else:
+            # Empty model name: use the encoder's default model.
+            vector = get_sentence_transformer_embeddings(category)
+        st.session_state[cache_key][category] = vector
+def update_category_embeddings(embeddings_metadata):
+    """
+    Update embeddings for each category
+    Thin wrapper: delegates to ``get_category_embeddings`` with the same
+    ``embeddings_metadata`` and returns nothing.
+    """
+    get_category_embeddings(embeddings_metadata)
+### Plotting utility functions
+def plot_piechart(sorted_cosine_scores_items):
+    """Render a single pie chart of similarity scores in the Streamlit app.
+    Args:
+        sorted_cosine_scores_items: Sequence of ``(category_index, score)``
+            items. ``category_index`` indexes the space-separated words of
+            ``st.session_state.categories``; ``score`` sets the slice size.
+    """
+    sorted_cosine_scores = np.array(
+        [item[1] for item in sorted_cosine_scores_items]
+    )
+    categories = st.session_state.categories.split(" ")
+    categories_sorted = [categories[item[0]] for item in sorted_cosine_scores_items]
+    fig, ax = plt.subplots()
+    ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct="%1.1f%%")
+    st.pyplot(fig)  # Figure
+    # pyplot keeps every figure it creates alive until it is closed; release
+    # it once rendered so figures do not accumulate across calls.
+    plt.close(fig)
+def plot_piechart_helper(sorted_cosine_scores_items):
+    """Build, without rendering, a 3x3 inch exploded pie chart of similarity scores.
+    Each item is ``(category_index, score)``; labels come from the
+    space-separated ``st.session_state.categories``. Returns the Figure.
+    """
+    scores = np.array([item[1] for item in sorted_cosine_scores_items])
+    all_categories = st.session_state.categories.split(" ")
+    labels = [all_categories[item[0]] for item in sorted_cosine_scores_items]
+    fig, ax = plt.subplots(figsize=(3, 3))
+    slice_count = len(labels)
+    offsets = np.zeros(slice_count)
+    # The first slice is always pulled out by 0.2.
+    offsets[0] = 0.2
+    if slice_count == 3:
+        # With exactly three slices the second one is offset by 0.1.
+        offsets[1] = 0.1
+    elif slice_count > 3:
+        # With more than three slices the third one is offset by 0.05.
+        offsets[2] = 0.05
+    ax.pie(scores, labels=labels, autopct="%1.1f%%", explode=offsets)
+    return fig
+def plot_piecharts(sorted_cosine_scores_models):
+    """Render one pie chart per embedding model, stacked vertically in one figure.
+    Args:
+        sorted_cosine_scores_models: Mapping of model name to a sequence of
+            ``(category_index, score)`` items, where ``category_index``
+            indexes the space-separated ``st.session_state.categories``.
+    The original only built a figure when exactly two models were passed and
+    otherwise failed with a NameError on ``fig``; any number of models is now
+    supported, and an empty mapping renders nothing.
+    """
+    scores_list = list(sorted_cosine_scores_models.values())
+    if not scores_list:
+        return
+    categories = st.session_state.categories.split(" ")
+    # squeeze=False always yields a 2-D axes array, even for a single model.
+    fig, axes = plt.subplots(len(scores_list), squeeze=False)
+    for ax, items in zip(axes[:, 0], scores_list):
+        categories_sorted = [categories[item[0]] for item in items]
+        sorted_scores = np.array([item[1] for item in items])
+        ax.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
+    st.pyplot(fig)
+    # Release the figure once rendered; pyplot keeps it alive otherwise.
+    plt.close(fig)
+def plot_alatirchart(sorted_cosine_scores_models):
+    """Show one pie chart per embedding model, each in its own Streamlit tab.
+    Args:
+        sorted_cosine_scores_models: Mapping of model name (used as the tab
+            label) to the ``(category_index, score)`` items passed on to
+            ``plot_piechart_helper``.
+    """
+    models = list(sorted_cosine_scores_models.keys())
+    tabs = st.tabs(models)
+    # Build every figure first, then render each one inside its tab.
+    figs = {
+        model: plot_piechart_helper(sorted_cosine_scores_models[model])
+        for model in models
+    }
+    for tab, model in zip(tabs, models):
+        with tab:
+            st.pyplot(figs[model])
+    # pyplot keeps figures alive until they are closed; release them once
+    # rendered so they do not accumulate across calls.
+    for fig in figs.values():
+        plt.close(fig)
+### Your Part To Complete: Follow the instructions in each function below to complete the similarity calculation between text embeddings
+# Task I: Compute Cosine Similarity
+def cosine_similarity(x, y):
+    """
+    Exponentiated cosine similarity
+    1. Compute cosine similarity
+    2. Exponentiate cosine similarity
+    3. Return exponentiated cosine similarity
+    (20 pts)
+    Args:
+        x, y: 1-D vectors of equal length.
+    Returns:
+        ``exp(cos(x, y))``. If either vector has zero norm the cosine is
+        undefined; it is treated as 0, so the result is ``exp(0) == 1.0``
+        rather than NaN.
+    """
+    norm_product = la.norm(x) * la.norm(y)
+    if norm_product == 0:
+        # A zero vector (for example an out-of-vocabulary word) would cause a
+        # division by zero and a NaN score.
+        cosine_sim = 0.0
+    else:
+        cosine_sim = np.dot(x, y) / norm_product
+    return np.exp(cosine_sim)
+# Task II: Average Glove Embedding Calculation
+def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type=50):
+    """
+    Get averaged glove embeddings for a sentence
+    1. Split sentence into words
+    2. Get embeddings for each word
+    3. Add embeddings for each word
+    4. Divide by number of words
+    5. Return averaged embeddings
+    (30 pts)
+    Args:
+        sentence: Text to embed; split on whitespace.
+        word_index_dict: Mapping of lower-cased word -> row index in ``embeddings``.
+        embeddings: Array indexed by those row indices.
+        model_type: Embedding size, as a label such as "50d" or a plain
+            integer such as 50 (the default, which used to crash on ``.split``).
+    Returns:
+        The mean of the word vectors. A sentence with no words yields a zero
+        vector instead of dividing by zero and returning NaN.
+    """
+    # Accept both "50d" and 50 so the integer default is usable.
+    dimension = int(str(model_type).split("d")[0])
+    embedding = np.zeros(dimension)
+    words = sentence.split()
+    if not words:
+        return embedding
+    # The per-word lookup expects the "<n>d" label form.
+    model_label = str(dimension) + "d"
+    for word in words:
+        embedding += get_glove_embeddings(word, word_index_dict, embeddings, model_label)
+    return embedding / len(words)
+# Task III: Sort the cosine similarity
+def get_sorted_cosine_similarity(embeddings_metadata):
+    """
+    Get sorted cosine similarity between input sentence and categories
+    Steps:
+    1. Get embeddings for input sentence
+    2. Get embeddings for categories (if not found, update category embeddings)
+    3. Compute cosine similarity between input sentence and categories
+    4. Sort cosine similarity
+    5. Return sorted cosine similarity
+    (50 pts)
+    Args:
+        embeddings_metadata: Dict with key "embedding_model". For "glove" it
+            also carries "word_index_dict", "embeddings" and "model_type";
+            otherwise it carries "model_name" for the sentence transformer.
+    Returns:
+        A list of ``(category_index, score)`` tuples sorted by descending
+        score, where ``category_index`` indexes the space-separated
+        ``st.session_state.categories``. The input text is read from
+        ``st.session_state.text_search``.
+    """
+    categories = st.session_state.categories.split(" ")
+    cosine_sim = {}
+    if embeddings_metadata["embedding_model"] == "glove":
+        word_index_dict = embeddings_metadata["word_index_dict"]
+        embeddings = embeddings_metadata["embeddings"]
+        model_type = embeddings_metadata["model_type"]
+        input_embedding = averaged_glove_embeddings_gdrive(
+            st.session_state.text_search, word_index_dict, embeddings, model_type
+        )
+        for index, category in enumerate(categories):
+            category_embedding = averaged_glove_embeddings_gdrive(
+                category, word_index_dict, embeddings, model_type
+            )
+            cosine_sim[index] = cosine_similarity(input_embedding, category_embedding)
+    else:
+        model_name = embeddings_metadata["model_name"]
+        cache_key = "cat_embed_" + model_name
+        # The cache is keyed by model only, so it goes stale when the
+        # categories change. Rebuild it when it is missing or when any current
+        # category is absent; the lookup below used to raise KeyError then.
+        if cache_key not in st.session_state or any(
+            category not in st.session_state[cache_key] for category in categories
+        ):
+            get_category_embeddings(embeddings_metadata)
+        category_embeddings = st.session_state[cache_key]
+        input_embedding = get_sentence_transformer_embeddings(
+            st.session_state.text_search, model_name=model_name
+        )
+        for index, category in enumerate(categories):
+            cosine_sim[index] = cosine_similarity(input_embedding, category_embeddings[category])
+    sorted_cosine_sim = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)
+    return sorted_cosine_sim
+### Below is the main function, creating the app demo for text search engine using the text embeddings.
+if __name__ == "__main__":
+    # Script entry point: builds the page, loads the GloVe files and shows the
+    # category similarity charts for both embedding models.
+    # Initialize session state variables
+    # Defaults are written only when the keys are not already present.
+    if "categories" not in st.session_state:
+        st.session_state["categories"] = "Flowers Colors Cars Weather Food"
+    if "text_search" not in st.session_state:
+        st.session_state["text_search"] = "Roses are red, trucks are blue, and Seattle is grey right now"
+    # Sidebar: background note on GloVe plus the model-size selector.
+    st.sidebar.title("GloVe Twitter")
+    st.sidebar.markdown(
+        """
+        GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on
+        2 billion tweets with vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).
+        Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
+        """
+    )
+    # index=1 preselects the "50d" option.
+    model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d", "100d"), index=1)
+    st.title("Search Based Retrieval Demo")
+    st.subheader(
+        "Pass in space separated categories you want this search demo to be about."
+    )
+    # Both text inputs are bound to session-state keys ("categories" and
+    # "text_search"); the similarity and plotting functions read those keys.
+    st.text_input(
+        label="Categories", key="categories", value=st.session_state["categories"]
+    )
+    st.subheader("Pass in an input word or even a sentence")
+    st.text_input(
+        label="Input your sentence",
+        key="text_search",
+        value=st.session_state["text_search"],
+    )
+    # Download the GloVe files only when either one is missing locally.
+    embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
+    word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl"
+    if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path):
+        with st.spinner("Downloading glove embeddings..."):
+            download_glove_embeddings_gdrive(model_type)
+    word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type)
+    # Scores are computed only when the input text is non-empty.
+    if st.session_state.text_search:
+        # First ranking: averaged GloVe word vectors.
+        embeddings_metadata = {
+            "embedding_model": "glove",
+            "word_index_dict": word_index_dict,
+            "embeddings": embeddings,
+            "model_type": model_type,
+        }
+        with st.spinner("Obtaining Cosine similarity for Glove..."):
+            sorted_cosine_sim_glove = get_sorted_cosine_similarity(embeddings_metadata)
+        # Second ranking: sentence-transformer embeddings (all-MiniLM-L6-v2).
+        embeddings_metadata = {
+            "embedding_model": "transformers",
+            "model_name": "all-MiniLM-L6-v2"
+        }
+        with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
+            sorted_cosine_sim_transformer = get_sorted_cosine_similarity(embeddings_metadata)
+        st.subheader(
+            "Closest word I have between: "
+            + st.session_state.categories
+            + " as per different Embeddings"
+        )
+        # One tab per model; the dict keys become the tab labels.
+        plot_alatirchart(
+            {
+                "glove_" + str(model_type): sorted_cosine_sim_glove,
+                "sentence_transformer_384": sorted_cosine_sim_transformer,
+            }
+        )
+        st.write("")
+        st.write(
+            "Demo developed by [Your Name](https://www.linkedin.com/in/your_id/ - Optional)"
+        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gdown==5.2.0
+matplotlib==3.9.2
+sentence-transformers==3.4.0
+streamlit==1.30.0
+tf-keras