from llama_cpp import Llama
import streamlit as st

model_path = "vicuna-13b-v1.5.ggmlv3.q2_K.bin"
llama = Llama(model_path=model_path)


def generate_response(messages: list) -> str:
    """Run a full (non-streaming) chat completion and return the assistant's reply."""
    response = llama.create_chat_completion(messages, max_tokens=-1, stream=False)
    print(f"response: {response}")
    return response['choices'][0]['message']['content']


def main():
    st.title("Chat with Vicuna!")

    # Session state for retaining messages
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Input for the user message
    user_message = st.chat_input("Your Message")

    # React to user input
    if user_message:
        # Display the user message in a chat message container
        with st.chat_message("user"):
            st.markdown(user_message)
        # Add the user message to the chat history
        st.session_state.messages.append({"role": "user", "content": user_message})

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            # The model call returns the complete reply; iterating over its
            # characters simulates a streaming "typewriter" effect in the UI.
            for char in generate_response(
                [{"role": m["role"], "content": m["content"]} for m in st.session_state.messages]
            ):
                full_response += char
                message_placeholder.markdown(full_response + "❙")
            message_placeholder.markdown(full_response)
        st.session_state.messages.append({"role": "assistant", "content": full_response})


if __name__ == "__main__":
    main()
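

# --- Optional: true token streaming (sketch, not part of the app above) ---
# The loop in main() only simulates streaming by iterating over the characters
# of the already-finished reply. llama-cpp-python can also yield tokens as they
# are generated by passing stream=True to create_chat_completion, which then
# returns OpenAI-style chunks. This is a hedged sketch assuming that chunk
# layout; generate_response_stream is a hypothetical helper, and the exact
# chunk fields may vary between library versions.
def generate_response_stream(messages: list):
    """Yield the assistant's reply piece by piece as the model produces it."""
    for chunk in llama.create_chat_completion(messages, max_tokens=-1, stream=True):
        delta = chunk["choices"][0]["delta"]
        # The first chunk typically carries only the role; content chunks follow.
        if "content" in delta:
            yield delta["content"]
# In main(), the character loop could then iterate over
# generate_response_stream(...) instead of generate_response(...), appending
# each yielded piece to full_response.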