from llama_cpp import Llama
import streamlit as st

model_path = "vicuna-13b-v1.5.ggmlv3.q2_K.bin"
llama = Llama(model_path=model_path)


def generate_response(messages: list) -> str:
    """Run a full (non-streaming) chat completion and return the assistant's reply."""
    response = llama.create_chat_completion(messages, max_tokens=-1, stream=False)
    print(f"response: {response}")
    return response['choices'][0]['message']['content']


def main():
    st.title("Chat with Vicuna!")

    # Session state for retaining messages
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Input for the user message
    user_message = st.chat_input("Your Message")

    # React to user input
    if user_message:
        # Display the user message in a chat message container
        with st.chat_message("user"):
            st.markdown(user_message)
        # Add the user message to the chat history
        st.session_state.messages.append({"role": "user", "content": user_message})

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            # The model call returns the complete reply; iterating over its
            # characters simulates a streaming "typewriter" effect in the UI.
            for char in generate_response(
                [{"role": m["role"], "content": m["content"]} for m in st.session_state.messages]
            ):
                full_response += char
                message_placeholder.markdown(full_response + "❙")
            message_placeholder.markdown(full_response)
        st.session_state.messages.append({"role": "assistant", "content": full_response})


if __name__ == "__main__":
    main()
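

# --- Optional: true token streaming (sketch, not part of the app above) ---
# The loop in main() only simulates streaming by iterating over the characters
# of the already-finished reply. llama-cpp-python can also yield tokens as they
# are generated by passing stream=True to create_chat_completion, which then
# returns OpenAI-style chunks. This is a hedged sketch assuming that chunk
# layout; generate_response_stream is a hypothetical helper, and the exact
# chunk fields may vary between library versions.
def generate_response_stream(messages: list):
    """Yield the assistant's reply piece by piece as the model produces it."""
    for chunk in llama.create_chat_completion(messages, max_tokens=-1, stream=True):
        delta = chunk["choices"][0]["delta"]
        # The first chunk typically carries only the role; content chunks follow.
        if "content" in delta:
            yield delta["content"]
# In main(), the character loop could then iterate over
# generate_response_stream(...) instead of generate_response(...), appending
# each yielded piece to full_response.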