my-awesome-model / create_model.py
Hank
Add complete Chinese sentiment analysis model
740d610
#!/usr/bin/env python3
"""
中文情感分析模型訓練腳本
這個腳本展示如何創建一個可推理的 Hugging Face 模型
"""
import torch
import pandas as pd
from transformers import (
BertTokenizer, BertForSequenceClassification,
TrainingArguments, Trainer, pipeline
)
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sized, indexable collection of input texts. Each item is
            converted with ``str()`` before it is tokenized.
        labels: Sized, indexable collection of integer class ids, aligned
            with ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``;
            its result must provide ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: ``max_length`` value forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail fast: a length mismatch would otherwise surface as an
        # IndexError in the middle of training, or silently drop labels.
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its model inputs.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'labels'`` (a ``torch.long``
            scalar tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # The tokenizer is asked for 'pt' tensors for a single text;
            # flatten() removes the extra leading dimension.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }
def create_demo_model():
    """Fine-tune ``bert-base-chinese`` on a tiny in-memory demo dataset.

    Loads the pretrained tokenizer and a two-label sequence-classification
    head (0 = NEGATIVE, 1 = POSITIVE), trains for one epoch on six
    hard-coded example sentences, and returns the result. This is for
    demonstration only; real training needs far more data.

    Returns:
        tuple: ``(model, tokenizer)`` after ``trainer.train()`` has run.
    """
    # Start from the pretrained Chinese BERT checkpoint.
    model_name = "bert-base-chinese"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label={0: "NEGATIVE", 1: "POSITIVE"},
        label2id={"NEGATIVE": 0, "POSITIVE": 1},
    )

    # A handful of example sentences.
    texts = [
        "這個產品真的很棒!我非常滿意。",
        "質量很差,完全不推薦。",
        "服務態度很好,值得信賴。",
        "價格太貴了,性價比不高。",
        "非常棒的體驗,會再次購買。",
        "完全浪費錢,後悔購買。",
    ]
    labels = [1, 0, 1, 0, 1, 0]  # 1: POSITIVE, 0: NEGATIVE

    # Wrap the examples in a dataset.
    dataset = SentimentDataset(texts, labels, tokenizer)

    # Training configuration (demo only).
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        # 6 samples / batch size 2 / 1 epoch is only 3 optimizer steps on a
        # single device. The previous 10-step warmup could never finish, so
        # the learning rate stayed at a fraction of its target. No warmup.
        warmup_steps=0,
        weight_decay=0.01,
        logging_dir='./logs',
        save_strategy="no",  # do not write intermediate checkpoints
    )

    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` here; confirm against the
    # installed version before changing it.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    # Short demo training run.
    trainer.train()
    return model, tokenizer
def save_model_for_huggingface(model, tokenizer, save_directory):
    """Write the model and tokenizer to ``save_directory`` and list its files.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Target directory passed to both ``save_pretrained``
            calls and then listed on stdout.
    """
    import os

    # Persist the model first, then the tokenizer, into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_directory)

    print(f"模型已保存到: {save_directory}")
    print("包含的檔案:")
    # Show what ended up on disk, in the order os.listdir reports it.
    for entry in os.listdir(save_directory):
        print(f" - {entry}")
def test_inference(model_directory):
    """Load a saved model into a pipeline and print predictions for samples.

    Args:
        model_directory: Path used as both the ``model`` and ``tokenizer``
            argument of the ``text-classification`` pipeline.
    """
    # Build the inference pipeline from the saved directory.
    classifier = pipeline(
        "text-classification",
        model=model_directory,
        tokenizer=model_directory,
        # `return_all_scores=True` is deprecated; `top_k=None` is the
        # documented replacement for getting a score for every label.
        top_k=None,
    )

    # Sample sentences to classify.
    test_texts = [
        "這個手機真的很好用!",
        "服務態度太差了。",
        "質量不錯,值得推薦。",
    ]

    # "\n" (not "\\n") so a blank line is printed rather than a literal
    # backslash-n.
    print("\n=== 模型推理測試 ===")
    for text in test_texts:
        result = classifier(text)
        print(f"文本: {text}")
        print(f"結果: {result}")
        print("-" * 50)
def main():
    """Run the demo end to end: train, save, then smoke-test inference."""
    print("開始創建中文情感分析模型...")

    # Build and train the demo model.
    model, tokenizer = create_demo_model()

    # Save it in Hugging Face format.
    save_directory = "./my-sentiment-model"
    save_model_for_huggingface(model, tokenizer, save_directory)

    # Reload from disk and run a few predictions.
    test_inference(save_directory)

    # "\n" (not "\\n") so a blank line is printed rather than a literal
    # backslash-n.
    print("\n✅ 模型創建完成!")
    print("現在你可以:")
    print("1. 將模型檔案推送到 Hugging Face")
    print("2. 讓其他人使用 transformers 載入你的模型")
    print("3. 使用 Inference API 進行線上推理")


if __name__ == "__main__":
    main()