import gc

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline


def clear_memory():
    gc.collect()
    # Only clear the CUDA cache when a GPU is actually available.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


model_name = "GIGAParviz/Firooze_test"

# load_in_8bit=True is required for llm_int8_threshold to have any effect;
# on its own the threshold does not enable quantization.
bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# 8-bit bitsandbytes models are placed on device automatically and cannot be
# moved with .to(), so the original model.to("cpu") call is dropped. Gradient
# checkpointing only takes effect during training, so that call is dropped too.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

# max_new_tokens bounds only the generated continuation; max_length would also
# count the prompt tokens and could truncate long instructions.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)


def generate_response(prompt):
    clear_memory()
    instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
    result = pipe(instruction)
    # Strip the echoed prompt so only the model's answer is returned.
    return result[0]["generated_text"][len(instruction):]


with gr.Blocks() as demo:
    gr.Markdown("# Firooze_test demo")  # assumed title; the original string is truncated here
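    # The original source is cut off at the gr.Markdown call above. Everything
    # below is an assumed minimal completion, not the original interface: the
    # component names (prompt_box, output_box) and labels are hypothetical,
    # chosen only to wire generate_response into the UI so the script runs
    # end to end.
    prompt_box = gr.Textbox(label="Prompt", lines=2)
    output_box = gr.Textbox(label="Response")
    gr.Button("Generate").click(fn=generate_response, inputs=prompt_box, outputs=output_box)

demo.launch()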