"""
Test Fine-tuned Model vs Original

Compare the fine-tuned model with the original FLAN-T5 on target words,
both unseen ones (PANESAR, RAJOURI, XANTHIC) and words from the training data.
"""

import torch
from pathlib import Path

try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False


class ModelComparison:
    """Compare original vs fine-tuned models"""

    def __init__(self):
        self.cache_dir = Path(__file__).parent.parent / "cache-dir"
        self.fine_tuned_dir = Path(__file__).parent / "fine_tuned_model"

        self.original_model = None
        self.original_tokenizer = None
        self.fine_tuned_model = None
        self.fine_tuned_tokenizer = None

    def load_models(self):
        """Load both original and fine-tuned models"""
        print("Loading original FLAN-T5-small...")

        self.original_tokenizer = AutoTokenizer.from_pretrained(
            "google/flan-t5-small",
            cache_dir=str(self.cache_dir)
        )
        self.original_model = AutoModelForSeq2SeqLM.from_pretrained(
            "google/flan-t5-small",
            cache_dir=str(self.cache_dir)
        )

        print("Original model loaded")

        if self.fine_tuned_dir.exists():
            print("Loading fine-tuned model...")

            self.fine_tuned_tokenizer = AutoTokenizer.from_pretrained(
                str(self.fine_tuned_dir)
            )
            self.fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(
                str(self.fine_tuned_dir)
            )

            print("Fine-tuned model loaded")
        else:
            print("Fine-tuned model not found - run training first")
            return False

        return True

    def generate_clue(self, model, tokenizer, word: str) -> str:
        """Generate a clue using the specified model"""
        prompt = f"Generate a crossword clue for: {word}"

        inputs = tokenizer(prompt, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,     # clues are short phrases
                num_beams=3,
                temperature=0.7,
                do_sample=True,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id
            )

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Guard against the model echoing the prompt back in its output.
        if prompt in result:
            result = result.replace(prompt, "").strip()

        return result
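
    # Illustrative use only (hypothetical output; real clues depend on the
    # checkpoint and on the sampling settings above):
    #   comparison = ModelComparison()
    #   comparison.load_models()
    #   comparison.generate_clue(comparison.original_model,
    #                            comparison.original_tokenizer, "PIANO")
    #   -> e.g. "Keyboard instrument"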

    def compare_models(self):
        """Compare models on target words"""
        target_words = [
            "PANESAR",
            "TENDULKAR",
            "RAJOURI",
            "XANTHIC",
            "SERENDIPITY",
            "BEETHOVEN",
            "PIANO",
        ]

        print("\nCOMPARING ORIGINAL vs FINE-TUNED")
        print("=" * 70)

        results = []

        for word in target_words:
            print(f"\n{word}:")

            original_clue = self.generate_clue(
                self.original_model,
                self.original_tokenizer,
                word
            )

            fine_tuned_clue = self.generate_clue(
                self.fine_tuned_model,
                self.fine_tuned_tokenizer,
                word
            )

            print(f"  Original:   \"{original_clue}\"")
            print(f"  Fine-tuned: \"{fine_tuned_clue}\"")

            # Keep this list in sync with the words used in the training set.
            in_training = word.upper() in ["TENDULKAR", "BEETHOVEN", "PIANO"]

            if in_training:
                print("  Note: This word WAS in training data")
            else:
                print("  Note: This word was NOT in training data")

            results.append({
                "word": word,
                "original": original_clue,
                "fine_tuned": fine_tuned_clue,
                "in_training": in_training
            })

        print("\n" + "=" * 70)
        print("ANALYSIS")
        print("=" * 70)

        print("\nWords in Training Data:")
        for result in results:
            if result["in_training"]:
                print(f"  {result['word']:12} -> \"{result['fine_tuned']}\"")

        print("\nWords NOT in Training Data (Transfer Learning Test):")
        for result in results:
            if not result["in_training"]:
                print(f"  {result['word']:12} -> \"{result['fine_tuned']}\"")

        print("\nCONCLUSIONS:")
        print("1. If the fine-tuned model is worse on training-data words,")
        print("   then fine-tuning failed completely.")
        print("2. If it's better on training data but bad on new words,")
        print("   then it overfitted and didn't generalize.")
        print("3. If it's better on both, then transfer learning succeeded!")

    def test_training_examples(self):
        """Test on exact training examples to check whether the model learned them"""
        print("\nTesting on EXACT Training Examples:")
        print("=" * 50)

        training_examples = [
            ("PIANO", "88-key instrument"),
            ("BEETHOVEN", "Austrian composer"),
            ("OXYGEN", "Life-sustaining gas"),
            ("EINSTEIN", "Relativity physicist"),
        ]

        for word, expected in training_examples:
            generated = self.generate_clue(
                self.fine_tuned_model,
                self.fine_tuned_tokenizer,
                word
            )

            print(f"{word:12}: Expected: \"{expected}\"")
            print(f"{'':12} Generated: \"{generated}\"")

            # Crude check: does any word of the expected clue appear in the
            # generated clue?
            if any(exp_word in generated.lower() for exp_word in expected.lower().split()):
                print(f"{'':12} Status: Some similarity")
            else:
                print(f"{'':12} Status: No similarity")
            print()


def main():
    """Main function"""
    print("FINE-TUNED MODEL EVALUATION")
    print("=" * 50)

    if not TRANSFORMERS_AVAILABLE:
        print("The transformers library is required - install it first")
        return

    comparison = ModelComparison()

    if not comparison.load_models():
        return

    # Check memorization of the exact training pairs first...
    comparison.test_training_examples()

    # ...then compare both models on seen and unseen words.
    comparison.compare_models()


if __name__ == "__main__":
    main()