import streamlit as st
import os
import torch
from datasets import DatasetDict, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, logging

logging.set_verbosity_error()

model_name = 'THUDM/chatglm3-6b'
#############################################
# bitsandbytes parameters
#############################################
# Activate 4-bit precision for base model loading
use_4bit = True
# Compute dtype for the 4-bit base model
bnb_4bit_compute_dtype = 'float16'
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = 'nf4'
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False
# Device mapping: this Space runs on CPU, so map the whole model to "cpu"
# (note: bitsandbytes 4-bit loading normally expects a CUDA GPU)
device = torch.device("cpu")
device_map = {"": "cpu"}

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
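# (Double quantization, toggled by bnb_4bit_use_double_quant above, additionally quantizes
#  the quantization constants themselves, saving roughly 0.4 bits per parameter.)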
# Check GPU compatibility with bfloat16 (only meaningful when a CUDA device is present)
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print('=' * 80)
        print('Your GPU supports bfloat16: you can accelerate training with the argument --bf16')
        print('=' * 80)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1
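# Note: pretraining_tp is a Llama-specific setting often copied over from Llama fine-tuning
# recipes; ChatGLM3 does not read it, so the assignment above is effectively a no-op.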
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'
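# Optional sanity check (an illustrative sketch, not part of the app): ChatGLM3 ships a
# custom chat() helper via trust_remote_code that returns the reply and the updated history:
#   response, history = model.chat(tokenizer, "Hello", history=[])
#   print(response)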
# Set the title of the Streamlit app
st.title("Chatbot with a Hugging Face ChatGLM3 Model")

# Keep the conversation across Streamlit reruns
if "history" not in st.session_state:
    st.session_state.history = []               # model-side history passed to model.chat()
    st.session_state.conversation_history = []  # display-side list of messages

# Placeholder for the rendered conversation history
conversation_text = st.empty()

# Get the user input
user_input = st.text_input("You: ")

# If the user has submitted input
if st.button("Send") and user_input:
    # Generate the chatbot's response
    response, st.session_state.history = model.chat(
        tokenizer, user_input, history=st.session_state.history
    )
    # Add both sides of the exchange to the conversation history
    st.session_state.conversation_history.append(f"You: {user_input}")
    st.session_state.conversation_history.append(f"Bot: {response}")
    # Update the conversation text as a single markdown block
    messages = "\n".join(f"- {message}" for message in st.session_state.conversation_history)
    conversation_text.markdown(f"**Conversation:**\n{messages}")
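# To try the app locally, launch it with Streamlit's CLI (assuming this file is saved as app.py):
#   streamlit run app.py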