#인퍼런스 코드 예시

import os import pandas as pd import torch from transformers import AutoTokenizer, AutoModelForCausalLM

1) 경로 설정

base_dir = '/content/drive/MyDrive/PEFT(Qwen3)' test_excel = os.path.join(base_dir, 'TEST_SET_CLEANED.xlsx') output_excel = os.path.join(base_dir, 'TEST_SET_CLEANED_FIN_2.xlsx')

2) 허깅페이스 허브 레포 ID (분류용 모델로 변경)

model_id = "junghan/Qwen-3-8B-news-classification"

3) 모델 & 토크나이저 로드

tokenizer = AutoTokenizer.from_pretrained( model_id, use_fast=True, trust_remote_code=True ) model = AutoModelForCausalLM.from_pretrained( model_id, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map={"": "cuda"}, ) model.config.use_cache = True

inference_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

Instruction:

아래 뉴스를 읽고 반드시 '경제', '금리', '외환', 혹은 'None' 중 하나로만 분류하세요. 만약 세 가지 카테고리(경제, 금리, 외환)에 해당하지 않는다면 반드시 'None'으로 답변하세요.

Question:

{question}

Response:

{cot} {category}"""

df = pd.read_excel(test_excel, engine='openpyxl') print(f"Loaded {len(df)} examples from {test_excel}")

def predict_label(text: str) -> str: # 뉴스 본문(혹은 제목+본문 등)을 question에 넣음 question = text.strip() prompt = inference_prompt_style.format(question) + tokenizer.eos_token

inputs = tokenizer(
    prompt,
    return_tensors='pt',
    truncation=True,
    max_length=2048
).to('cuda')
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=10,  # 분류 태스크이므로 10~20이면 충분
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# <think> 태그 이후의 첫 줄(정답)만 추출
after_think = decoded.split("</think>")[-1].strip()
# 첫 줄만 정답으로 사용
label = after_think.splitlines()[0].strip()
# 허용된 값만 반환
if label in ["경제", "금리", "외환", ""]:
    return label
# 예외적으로 다른 값이 나오면 빈 문자열 반환
return ""

from concurrent.futures import ThreadPoolExecutor, as_completed from tqdm.auto import tqdm

THEME_HIST 컬럼명에 맞게 수정 (예: 'TITLE'+'C_TEXT' 등)

text_col = "THEME_HIST" # 실제 컬럼명에 맞게 수정

results = [] with ThreadPoolExecutor(max_workers=4) as executor: futures = {executor.submit(predict_label, row[text_col]): idx for idx, row in df.iterrows()} for future in tqdm(as_completed(futures), total=len(futures), desc="분류 진행중"): idx = futures[future] try: results.append((idx, future.result())) except Exception as e: results.append((idx, "")) print(f"Error at idx {idx}: {e}")

결과를 원래 순서대로 정렬

results.sort() labels = [label for idx, label in results] df['category'] = labels

df.to_excel(output_excel, index=False) print(f"분류 결과가 {output_excel}에 저장되었습니다.")

[More Information Needed]

Framework versions

PEFT 0.16.0

junghan
/

Qwen-3-8B-news-classification