# -*- coding: utf-8 -*-
"""Hugging Face Space App with CPU Quantization"""

import os

import gradio as gr
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Log in to Hugging Face, authenticating with an access token.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "Hugging Face access token not found! Set the 'HF_TOKEN' environment "
        "variable or provide a valid access token directly."
    )
login(token=HF_TOKEN)

# Configure 4-bit quantization. Note: bitsandbytes 4-bit loading generally
# requires a CUDA GPU; on CPU-only hardware this step may fail (see the
# CPU-only alternative sketched at the bottom of this file).
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the quantized model. device_map="auto" requires the accelerate
# package and places layers on the available devices automatically.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)


# Define the inference function.
def generate_text(prompt):
    # Move the encoded inputs to the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,  # pass input_ids and attention_mask together
        max_length=200,  # cap on total tokens (prompt + generation)
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Build the interface with Gradio.
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Llama 2 Text Generator (CPU Quantized)",
    description=(
        "Generate text using the Llama-2-13b-chat-hf model with CPU "
        "quantization hosted on Hugging Face Spaces."
    ),
)

# Launch the app.
if __name__ == "__main__":
    interface.launch()
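
# ---------------------------------------------------------------------------
# CPU-only alternative (a minimal sketch, kept as comments so it does not
# execute with the app). The bitsandbytes 4-bit path above generally expects
# a CUDA GPU. For a genuinely CPU-only Space, one option is PyTorch dynamic
# quantization, which converts Linear-layer weights to int8 at load time.
# This assumes the model is first loaded in full precision on the CPU (no
# quantization_config, no device_map); a 13B-parameter checkpoint is likely
# too large for a free CPU Space even at int8, so a smaller model may be
# needed:
#
#   import torch
#   model_fp32 = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN)
#   model_int8 = torch.quantization.quantize_dynamic(
#       model_fp32,          # full-precision model on the CPU
#       {torch.nn.Linear},   # layer types to quantize
#       dtype=torch.qint8,   # 8-bit integer weights
#   )
#
# Once the Space is running, it can also be queried programmatically with the
# gradio_client package ("user/space-name" below is a placeholder):
#
#   from gradio_client import Client
#   client = Client("user/space-name")
#   print(client.predict("Hello, Llama!", api_name="/predict"))
# ---------------------------------------------------------------------------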