|
|
|
""" |
|
中文情感分析模型訓練腳本 |
|
這個腳本展示如何創建一個可推理的 Hugging Face 模型 |
|
""" |
|
|
|
import torch |
|
import pandas as pd |
|
from transformers import ( |
|
BertTokenizer, BertForSequenceClassification, |
|
TrainingArguments, Trainer, pipeline |
|
) |
|
from torch.utils.data import Dataset |
|
import numpy as np |
|
from sklearn.metrics import accuracy_score |
|
|
|
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text/label pair each time it is indexed.

    Args:
        texts: Indexable collection of inputs; every item is passed through
            ``str()`` before it is handed to the tokenizer.
        labels: Indexable collection of labels, matched to ``texts`` by
            position.
        tokenizer: Callable invoked with the text plus the ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result must provide ``input_ids`` and
            ``attention_mask`` entries.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        The dict holds ``input_ids`` and ``attention_mask`` flattened to one
        dimension, followed by ``labels`` as a ``torch.long`` tensor.
        """
        sentence = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.max_length,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(sentence, **tokenizer_options)

        # The tokenizer output carries a leading batch axis; collapse it so
        # each field is a flat vector.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample
|
|
|
def create_demo_model():
    """Fine-tune ``bert-base-chinese`` on six toy sentences and return it.

    Returns:
        A ``(model, tokenizer)`` tuple: the ``BertForSequenceClassification``
        instance after ``Trainer.train()`` has run, and the ``BertTokenizer``
        loaded from the same checkpoint name.
    """
    checkpoint = "bert-base-chinese"

    # Binary label scheme stored in the model config.
    id_to_label = {0: "NEGATIVE", 1: "POSITIVE"}
    label_to_id = {name: index for index, name in id_to_label.items()}

    bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    classifier = BertForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=2,
        id2label=id_to_label,
        label2id=label_to_id,
    )

    # Tiny hand-written corpus: (sentence, label) with 1 = positive and
    # 0 = negative.
    samples = [
        ("這個產品真的很棒!我非常滿意。", 1),
        ("質量很差,完全不推薦。", 0),
        ("服務態度很好,值得信賴。", 1),
        ("價格太貴了,性價比不高。", 0),
        ("非常棒的體驗,會再次購買。", 1),
        ("完全浪費錢,後悔購買。", 0),
    ]
    sentences = [sentence for sentence, _ in samples]
    targets = [target for _, target in samples]
    train_data = SentimentDataset(sentences, targets, bert_tokenizer)

    hyperparameters = {
        "output_dir": './results',
        "num_train_epochs": 1,
        "per_device_train_batch_size": 2,
        "per_device_eval_batch_size": 2,
        "warmup_steps": 10,
        "weight_decay": 0.01,
        "logging_dir": './logs',
        "save_strategy": "no",
    }
    run_config = TrainingArguments(**hyperparameters)

    # Only a training set is supplied; no evaluation dataset is configured.
    fine_tuner = Trainer(
        model=classifier,
        args=run_config,
        train_dataset=train_data,
        tokenizer=bert_tokenizer,
    )
    fine_tuner.train()

    return classifier, bert_tokenizer
|
|
|
def save_model_for_huggingface(model, tokenizer, save_directory):
    """Save the model and tokenizer, then print the files that were written.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_directory: Target directory passed to both ``save_pretrained``
            calls and listed afterwards.

    Returns:
        None. The directory path and its entries are printed to stdout.
    """
    import os

    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)

    print(f"模型已保存到: {save_directory}")
    print("包含的檔案:")
    # os.listdir() returns entries in arbitrary order; sort them so the
    # report is stable between runs and platforms.
    for entry in sorted(os.listdir(save_directory)):
        print(f" - {entry}")
|
|
|
def test_inference(model_directory):
    """Load the saved model into a pipeline and print sample predictions.

    Args:
        model_directory: Path passed to ``pipeline`` as both the model and
            the tokenizer location.

    Returns:
        None. Each test sentence and its classification result are printed.
    """
    classifier = pipeline(
        "text-classification",
        model=model_directory,
        tokenizer=model_directory,
        # ``return_all_scores=True`` is deprecated; ``top_k=None`` is the
        # documented replacement and still reports a score for every label.
        top_k=None,
    )

    test_texts = [
        "這個手機真的很好用!",
        "服務態度太差了。",
        "質量不錯,值得推薦。",
    ]

    # A real newline escape: the previous "\\n" printed a literal
    # backslash-n in front of the banner.
    print("\n=== 模型推理測試 ===")
    for text in test_texts:
        result = classifier(text)
        print(f"文本: {text}")
        print(f"結果: {result}")
        print("-" * 50)
|
|
|
def main():
    """Train the demo model, save it to disk, and run a sample inference."""
    print("開始創建中文情感分析模型...")

    model, tokenizer = create_demo_model()

    save_directory = "./my-sentiment-model"
    save_model_for_huggingface(model, tokenizer, save_directory)

    # Reload from disk so the saved files themselves are exercised.
    test_inference(save_directory)

    # A real newline escape: the previous "\\n" printed a literal
    # backslash-n in front of the message.
    print("\n✅ 模型創建完成!")
    print("現在你可以:")
    print("1. 將模型檔案推送到 Hugging Face")
    print("2. 讓其他人使用 transformers 載入你的模型")
    print("3. 使用 Inference API 進行線上推理")


if __name__ == "__main__":
    main()
|
|