Spaces:

ml6team
/

secret-agent-guardrail-challenge

Running

File size: 13,517 Bytes

cc8705b

"""Demo UI to show different levels of LLM security."""

import streamlit as st
from streamlit_extras.stylable_container import stylable_container

import pandas as pd
import llm

import config


st.set_page_config(
    page_title="LLM security demo",
    page_icon="images/LEG.png",
    layout="wide",
    initial_sidebar_state="expanded",
)

if "key" not in st.session_state:
    st.session_state["key"] = "value"


st.logo("images/ML6_logo.png")

st.title("🕵️ LLM security demo")
st.info(
    "You are a secret agent meeting your informant in a bar. Convince him to give you his secret! But be prepared, with every new level the informant will be more cautious."
)

num_levels = [i for i in range(7)]
# n random secrets
secrets = [
    "Perspicacity",
    "Obfuscation",
    "Sesquipedalian",
    "Synecdoche",
    "Defenestration",
    "Pulchritudinous",
    "Sesquipedalian",
    "Ineffable",
]
level_names = [f"Level {i}" for i in num_levels]
level_emojis = ["😴", "🙂", "😏", "😥", "🥵", "💀", "💀"]
level_tabs = st.tabs(level_names)

hint_css = """
{
    background-color: rgba(110, 110, 110, 0.1);
    padding: 16px;
    border-radius: 0.5rem;


    }
code {
        white-space: pre-wrap !important;
    }
"""


for level_num, level_tab, name, emoji, secret in zip(
    num_levels, level_tabs, level_names, level_emojis, secrets
):
    # init "solved" state
    if f"solved_{name}" not in st.session_state:
        st.session_state[f"solved_{name}"] = False

    # init prompt try count
    if f"prompt_try_count_{name}" not in st.session_state:
        st.session_state[f"prompt_try_count_{name}"] = 0

    # init secret guess count
    if f"secret_guess_count_{name}" not in st.session_state:
        st.session_state[f"secret_guess_count_{name}"] = 0

    # init hint expander status
    for i in range(3):
        if f"opend_hint_{name}_{i}" not in st.session_state:
            st.session_state[f"opend_hint_{name}_{i}"] = False

    deployment_name = name.replace(" ", "-").lower()
    with level_tab:
        header_col1, header_col2 = st.columns(2, gap="medium")
        header_col1.subheader(f"{emoji} {name}")
        header_col2.subheader(f"Need help ...")

        col1, col2 = st.columns(2, gap="medium")

        with col1:
            with st.container(height=600, border=False):
                with st.container(border=True):
                    txt = st.text_area(
                        "Provide your prompt here:",
                        key=f"txt_{name}",
                        label_visibility="visible",
                        height=200,
                        placeholder="Your prompt",
                    )
                    btn_submit_prompt = st.button(
                        "Send prompt", key=f"submit_prompt_{name}"
                    )

                    # Show response
                    if len(txt) > 0 and btn_submit_prompt:
                        st.session_state[f"prompt_try_count_{name}"] += 1
                        with st.container(border=True):
                            st.write("Response:")
                            # special checks for certain levels
                            if level_num == 3 and llm.run_judge(level_num, {"user_input": txt}):
                                st.error("Malicious user input detected. Try again.")
                            elif level_num == 4:
                                output = "".join(
                                    llm.stream_request(deployment_name, secret, txt)
                                )
                                if secret.lower() in output.lower():
                                    st.error(
                                        "Secret detected in user input. Try again."
                                    )
                                else:
                                    st.write(output)
                            elif level_num == 5:
                                output = "".join(
                                    llm.stream_request(deployment_name, secret, txt)
                                )
                                if llm.is_subsequence(output, secret):
                                    st.error(
                                        "Secret detected in user input. Try again."
                                    )
                                else:
                                    st.write(output)
                                    
                            elif level_num == 6:
                                output = "".join(
                                    llm.stream_request(deployment_name, secret, txt)
                                )
                                if llm.run_judge(level_num, {"secret": secret, "message": output}):
                                    st.error(
                                        "Secret detected in user input. Try again."
                                    )
                                else:
                                    st.write(output)
                            else:
                                st.write_stream(
                                    llm.stream_request(deployment_name, secret, txt)
                                )

                with st.container(border=True):
                    secret_guess = st.text_input(
                        "What is the secret?",
                        key=f"guess_{name}",
                        placeholder="Your guess",
                    )
                    btn_submit_guess = st.button(
                        "Submit guess", key=f"submit_guess_{name}"
                    )

                    if btn_submit_guess:
                        st.session_state[f"secret_guess_count_{name}"] += 1
                        if secret_guess.lower() == secret.lower():
                            st.success("You found the secret!")
                            st.session_state[f"solved_{name}"] = True
                        else:
                            st.error("Wrong guess. Try again.")

        with col2:
            with st.container(border=True, height=600):
                st.info(
                    "There are three levels of hints available to you. But be careful, if you open a hint before solving the secret, it will show up in your record.",
                    icon="ℹ️",
                )

                hint_1_cont = stylable_container("hint_1_container", hint_css)

                hint1 = hint_1_cont.checkbox(
                    "Hint 1 - **Description of security strategy**",
                    key=f"hint1_checkbox_{name}",
                )
                if hint1:
                    # if hint gets revealed, it is marked as opened. Unless the secret was already found
                    st.session_state[f"opend_hint_{name}_0"] = (
                        True
                        if st.session_state[f"opend_hint_{name}_0"]
                        else not st.session_state[f"solved_{name}"]
                    )

                    hint_1_cont.write(config.LEVEL_DESCRIPTIONS[level_num]["info"])

                    hint_2_cont = stylable_container("hint_2_container", hint_css)
                    hint2 = hint_2_cont.checkbox(
                        "Hint 2 - **Code execution**", key=f"hint2_checkbox_{name}"
                    )
                    if hint2:
                        st.session_state[f"opend_hint_{name}_1"] = (
                            True
                            if st.session_state[f"opend_hint_{name}_1"]
                            else not st.session_state[f"solved_{name}"]
                        )

                        def show_base_prompt():
                            # show prompt
                            for key, val in prompts.items():
                                descr = key.replace("_", " ").capitalize()
                                hint_2_cont.write(f"*{descr}:*")
                                # custom_code_container(val)
                                # val = val.replace("{{secret}}", '<span style="color: #ff0000">{{secret}}</span>')
                                hint_2_cont.code(val, language=None)

                        user_input_holder = (
                            txt if len(txt) > 0 and btn_submit_prompt else None
                        )

                        prompts = llm.get_full_prompt(
                            deployment_name, user_input=user_input_holder
                        )

                        if level_num == 3:
                            special_prompt = llm.get_full_prompt(
                                llm.special_checks[3], user_input=txt
                            )

                            hint_2_cont.write(
                                "Step 1: A **LLM judge** reviews the user input and determines if it is malicious or not."
                            )
                            hint_2_cont.write("**LLM judge prompt:**")
                            for key, val in special_prompt.items():
                                hint_2_cont.code(val, language=None)
                            hint_2_cont.write(
                                "Step 2: If the user input is not classified as malicious, the prompt containing the actual secret is executed and the response is shown."
                            )
                            hint_2_cont.write("**Actual prompt:**")
                            show_base_prompt()
                        elif level_num == 4:
                            hint_2_cont.write(
                                "Step 1: The following prompt is executed:"
                            )
                            show_base_prompt()
                            hint_2_cont.write(
                                "Step 2: In the response text of the LLM call, the secret is seach for with a simple python expression `secret.lower() in output.lower()`. If it contains the secret, the output will not be shown."
                            )
                        elif level_num == 5:
                            hint_2_cont.write(
                                "Step 1: The following prompt is executed:"
                            )
                            show_base_prompt()
                            hint_2_cont.write(
                                "Step 2: In the response text of the LLM call, the secret is seach for with a python function 'is_subsequence' which looks for substrings. If it contains the secret, the output will not be shown."
                            )
                            llm.is_subsequence
                        else:
                            hint_2_cont.write(
                                "Step 1: The following prompt is executed and the response is shown:"
                            )
                            show_base_prompt()

                        # st.divider()
                        hint_3_cont = stylable_container("hint_3_container", hint_css)

                        hint3 = hint_3_cont.checkbox(
                            "Hint 3 - **Example solution**",
                            key=f"hint3_checkbox_{name}",
                        )
                        if hint3:
                            st.session_state[f"opend_hint_{name}_2"] = (
                                True
                                if st.session_state[f"opend_hint_{name}_2"]
                                else not st.session_state[f"solved_{name}"]
                            )
                            # custom_code_container(
                            #     config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                            # )

                            hint_3_cont.code(
                                config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                                language=None,
                            )
                            hint_3_cont.info("*May not allways work")


with st.expander("🏆 Record", expanded=True):
    # build table
    table_data = []
    for idx, name in enumerate(level_names):
        table_data.append(
            [
                idx,
                st.session_state[f"prompt_try_count_{name}"],
                st.session_state[f"secret_guess_count_{name}"],
                "❌" if st.session_state[f"opend_hint_{name}_0"] else "-",
                "❌" if st.session_state[f"opend_hint_{name}_1"] else "-",
                "❌" if st.session_state[f"opend_hint_{name}_2"] else "-",
                "✅" if st.session_state[f"solved_{name}"] else "❌",
                secrets[idx] if st.session_state[f"solved_{name}"] else "...",
            ]
        )

    # show as pandas dataframe
    st.table(
        pd.DataFrame(
            table_data,
            columns=[
                "Level",
                "Prompt tries",
                "Secret guesses",
                "Used hint 1",
                "Used hint 2",
                "Used hint 3",
                "Solved",
                "Secret",
            ],
            index=level_emojis,
        )
    )

# TODOS:
# - add more levels
# - use Gemini-Pro-Flash for supervisor LLM
# - show the actual workflow of the safeguard (what gets executed)
# - story telling --> new field hard to be 100 percentage save
# - use LLM judge to look for secret in model output
# - show which safe guards were used in 'Record' table
# - funny: always return "I am sorry I cannot do that."
# switch to azure deployment