# Inference Code Example
```python
import os

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1) Path settings
base_dir = '/content/drive/MyDrive/PEFT(Qwen3)'
test_excel = os.path.join(base_dir, 'TEST_SET_CLEANED.xlsx')
output_excel = os.path.join(base_dir, 'TEST_SET_CLEANED_FIN_2.xlsx')

# 2) Hugging Face Hub repo ID (changed to the classification model)
model_id = "junghan/Qwen-3-8B-news-classification"

# 3) Load the model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
model.config.use_cache = True
```
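Since the Framework versions section below lists PEFT 0.16.0, this repo may ship a LoRA adapter rather than fully merged weights. If the load above fails to find full model weights, a minimal alternative sketch uses PEFT's `AutoPeftModelForCausalLM`, which resolves the base checkpoint from the adapter config (this layout is an assumption, not confirmed by the card):

```python
# Alternative sketch: load the repo as a PEFT (LoRA) adapter on top of its
# base model. Assumes adapter_config.json points at the correct base checkpoint.
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
model.config.use_cache = True
```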
inference_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.
Instruction:
์๋ ๋ด์ค๋ฅผ ์ฝ๊ณ ๋ฐ๋์ '๊ฒฝ์ ', '๊ธ๋ฆฌ', '์ธํ', ํน์ 'None' ์ค ํ๋๋ก๋ง ๋ถ๋ฅํ์ธ์. ๋ง์ฝ ์ธ ๊ฐ์ง ์นดํ ๊ณ ๋ฆฌ(๊ฒฝ์ , ๊ธ๋ฆฌ, ์ธํ)์ ํด๋นํ์ง ์๋๋ค๋ฉด ๋ฐ๋์ 'None'์ผ๋ก ๋ต๋ณํ์ธ์.
Question:
{question}
Response:
{cot} {category}"""df = pd.read_excel(test_excel, engine='openpyxl') print(f"Loaded {len(df)} examples from {test_excel}")
```python
def predict_label(text: str) -> str:
    # Put the news body (or title + body, etc.) into the question slot
    question = text.strip()
    prompt = inference_prompt_style.format(question=question)

    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=2048,
    ).to('cuda')

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=10,  # short classification answer; raise this if the model emits a <think> block first
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )

    # Decode only the newly generated tokens, not the prompt
    generated = outputs[0][inputs.input_ids.shape[1]:]
    decoded = tokenizer.decode(generated, skip_special_tokens=True)
    # Keep only the text after a </think> tag, if one was emitted
    after_think = decoded.split("</think>")[-1].strip()
    # Use only the first line as the answer
    lines = after_think.splitlines()
    label = lines[0].strip() if lines else ""
    # Return only the allowed values
    if label in ["๊ฒฝ์ ", "๊ธ๋ฆฌ", "์ธํ", "None"]:
        return label
    # If an unexpected value comes out, return an empty string
    return ""
```
```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm.auto import tqdm

# Adjust THEME_HIST to the actual column name (e.g. 'TITLE' + 'C_TEXT', etc.)
text_col = "THEME_HIST"

results = []
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {
        executor.submit(predict_label, row[text_col]): idx
        for idx, row in df.iterrows()
    }
    for future in tqdm(as_completed(futures), total=len(futures), desc="Classifying"):
        idx = futures[future]
        try:
            results.append((idx, future.result()))
        except Exception as e:
            results.append((idx, ""))
            print(f"Error at idx {idx}: {e}")
```
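Note that a single CUDA model serializes `generate` calls, so the thread pool mostly overlaps tokenization and Python overhead rather than GPU work. A batched variant is usually faster on GPU; below is a minimal sketch that assumes left padding, with the EOS token reused as the pad token, is acceptable for this checkpoint:

```python
# Batched-inference sketch. Assumptions: left padding and reusing the EOS
# token as pad token are acceptable for this checkpoint.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def predict_labels_batched(texts, batch_size=8):
    labels = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        prompts = [inference_prompt_style.format(question=t.strip()) for t in batch]
        inputs = tokenizer(
            prompts, return_tensors='pt', padding=True,
            truncation=True, max_length=2048,
        ).to('cuda')
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )
        # All rows share the same padded input length, so slice it off.
        in_len = inputs.input_ids.shape[1]
        for row in outputs:
            text = tokenizer.decode(row[in_len:], skip_special_tokens=True)
            lines = text.split("</think>")[-1].strip().splitlines()
            label = lines[0].strip() if lines else ""
            labels.append(label if label in ["๊ฒฝ์ ", "๊ธ๋ฆฌ", "์ธํ", "None"] else "")
    return labels
```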
```python
# Sort results back into the original row order
results.sort()
labels = [label for idx, label in results]
df['category'] = labels

df.to_excel(output_excel, index=False)
print(f"Saved classification results to {output_excel}")
```
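If the sheet also carries gold labels, accuracy can be checked in a few lines; the column name `LABEL` below is hypothetical and should be adjusted to the actual schema:

```python
# Hypothetical gold-label column name; adjust to the actual sheet schema.
if 'LABEL' in df.columns:
    acc = (df['category'] == df['LABEL']).mean()
    print(f"Accuracy: {acc:.3f}")
```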
### Framework versions

- PEFT 0.16.0