# NOTE(review): removed pasted web-UI residue ("Spaces:" / "Runtime error"
# status lines) that preceded the actual source and broke Python syntax.
# -*- coding: utf-8 -*-
"""Hugging Face Space app serving a 4-bit-quantized Llama-2 chat model via Gradio."""
import os

import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Authenticate with Hugging Face using an access token from the environment.
# Fail fast at startup if the token is missing, since the gated Llama-2
# weights cannot be downloaded without it.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "未找到 Hugging Face 訪問令牌!請設置環境變數 'HF_TOKEN',或者直接提供有效的訪問令牌。"
    )
login(HF_TOKEN)
# Configure 4-bit quantization to shrink the 13B model's memory footprint.
# NOTE(review): the app title claims "CPU Quantized", but bitsandbytes 4-bit
# kernels generally require a CUDA GPU — confirm the Space hardware supports this.
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the quantized model; device_map="auto" lets accelerate place weights
# on whatever devices are available.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
# Inference function used as the Gradio callback.
def generate_text(prompt):
    """Generate a completion for *prompt* with beam search.

    Args:
        prompt: The user-supplied input text.

    Returns:
        The decoded model output (special tokens stripped).
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        # Pass the attention mask explicitly so padding is handled correctly
        # (omitting it triggers a transformers warning and can skew results).
        attention_mask=inputs.attention_mask,
        max_length=200,
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Build the web UI with Gradio.
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Llama 2 Text Generator (CPU Quantized)",
    description="Generate text using the Llama-2-13b-chat-hf model with CPU quantization hosted on Hugging Face Spaces.",
)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()