#!/usr/bin/env python3
"""
Test Fine-tuned Model vs Original
Compare the fine-tuned model with the original FLAN-T5
on our target words: PANESAR, RAJOURI, XANTHIC
"""
import torch
from pathlib import Path
from typing import List, Dict

try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
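
# NOTE: main() bails out early when the transformers import above fails.
# Running this script assumes `torch` and `transformers` are installed
# (T5 tokenizers may also need `sentencepiece`, depending on your setup).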


class ModelComparison:
    """Compare original vs fine-tuned models"""

    def __init__(self):
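        # Assumed layout: the shared Hugging Face download cache lives in
        # ../cache-dir, and the fine-tuned checkpoint in ./fine_tuned_model
        # next to this script.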
        self.cache_dir = Path(__file__).parent.parent / "cache-dir"
        self.fine_tuned_dir = Path(__file__).parent / "fine_tuned_model"
        self.original_model = None
        self.original_tokenizer = None
        self.fine_tuned_model = None
        self.fine_tuned_tokenizer = None

    def load_models(self):
        """Load both original and fine-tuned models"""
        print("📥 Loading original FLAN-T5-small...")
        # Load original model
        self.original_tokenizer = AutoTokenizer.from_pretrained(
            "google/flan-t5-small",
            cache_dir=str(self.cache_dir)
        )
        self.original_model = AutoModelForSeq2SeqLM.from_pretrained(
            "google/flan-t5-small",
            cache_dir=str(self.cache_dir)
        )
        print("✅ Original model loaded")

        # Load fine-tuned model
        if self.fine_tuned_dir.exists():
            print("📥 Loading fine-tuned model...")
            self.fine_tuned_tokenizer = AutoTokenizer.from_pretrained(
                str(self.fine_tuned_dir)
            )
            self.fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(
                str(self.fine_tuned_dir)
            )
            print("✅ Fine-tuned model loaded")
        else:
            print("❌ Fine-tuned model not found - run training first")
            return False
        return True

    def generate_clue(self, model, tokenizer, word: str) -> str:
        """Generate a clue using the specified model"""
        prompt = f"Generate a crossword clue for: {word}"
        inputs = tokenizer(prompt, return_tensors="pt")
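        # Decoding settings: num_beams=3 combined with do_sample=True gives
        # beam-sample decoding, temperature=0.7 softens the token
        # distribution, and max_new_tokens=20 keeps clues short.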
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,
                num_beams=3,
                temperature=0.7,
                do_sample=True,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id
            )
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Clean up (remove original prompt if echoed)
        if prompt in result:
            result = result.replace(prompt, "").strip()
        return result

    def compare_models(self):
        """Compare models on target words"""
        target_words = [
            "PANESAR",      # Should be: cricketer
            "TENDULKAR",    # Should be: cricketer (in training data)
            "RAJOURI",      # Should be: Kashmir district
            "XANTHIC",      # Should be: yellowish color
            "SERENDIPITY",  # Should be: happy accident
            "BEETHOVEN",    # Should be: composer (in training data)
            "PIANO",        # Should be: instrument (in training data)
        ]
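        # TENDULKAR, BEETHOVEN, and PIANO appear in the training data
        # (memorization check); the rest probe transfer to unseen words.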
print("\n㪠COMPARING ORIGINAL vs FINE-TUNED")
print("=" * 70)
results = []
for word in target_words:
print(f"\nπ {word}:")
# Original model
original_clue = self.generate_clue(
self.original_model,
self.original_tokenizer,
word
)
# Fine-tuned model
fine_tuned_clue = self.generate_clue(
self.fine_tuned_model,
self.fine_tuned_tokenizer,
word
)
print(f" Original: \"{original_clue}\"")
print(f" Fine-tuned: \"{fine_tuned_clue}\"")
# Simple quality check
in_training = word.upper() in ["TENDULKAR", "BEETHOVEN", "PIANO"]
if in_training:
print(f" Note: This word WAS in training data")
else:
print(f" Note: This word was NOT in training data")
results.append({
"word": word,
"original": original_clue,
"fine_tuned": fine_tuned_clue,
"in_training": in_training
})
# Summary
print("\n" + "=" * 70)
print("π ANALYSIS")
print("=" * 70)
print("\nπ― Words in Training Data:")
for result in results:
if result["in_training"]:
print(f" {result['word']:12} β \"{result['fine_tuned']}\"")
print("\nπ Words NOT in Training Data (Transfer Learning Test):")
for result in results:
if not result["in_training"]:
print(f" {result['word']:12} β \"{result['fine_tuned']}\"")
print(f"\nπ‘ CONCLUSIONS:")
print(f"1. If fine-tuned model is worse on training data words,")
print(f" then fine-tuning failed completely")
print(f"2. If it's better on training data but bad on new words,")
print(f" then it overfitted and didn't generalize")
print(f"3. If it's better on both, then transfer learning succeeded!")

    def test_training_examples(self):
        """Test on exact training examples to check if the model learned"""
        print("\n🔍 Testing on EXACT Training Examples:")
        print("=" * 50)
        training_examples = [
            ("PIANO", "88-key instrument"),
            ("BEETHOVEN", "Austrian composer"),  # Not exact but close
            ("OXYGEN", "Life-sustaining gas"),
            ("EINSTEIN", "Relativity physicist"),
        ]
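        # Loose similarity heuristic below: count it a hit if any word of the
        # expected clue appears in the generated text (case-insensitive).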
        for word, expected in training_examples:
            generated = self.generate_clue(
                self.fine_tuned_model,
                self.fine_tuned_tokenizer,
                word
            )
            print(f"{word:12}: Expected: \"{expected}\"")
            print(f"{'':12} Generated: \"{generated}\"")
            # Check if similar
            if any(exp_word in generated.lower() for exp_word in expected.lower().split()):
                print(f"{'':12} Status: ✅ Some similarity")
            else:
                print(f"{'':12} Status: ❌ No similarity")
            print()


def main():
    """Main function"""
    print("🧪 FINE-TUNED MODEL EVALUATION")
    print("=" * 50)
    if not TRANSFORMERS_AVAILABLE:
        print("❌ Need transformers library")
        return

    comparison = ModelComparison()
    if not comparison.load_models():
        return

    # Test on training examples first
    comparison.test_training_examples()
    # Compare on target words
    comparison.compare_models()


if __name__ == "__main__":
    main()
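
# Usage sketch (the filename is hypothetical -- save as e.g. compare_models.py):
#   python compare_models.py
# Expects the trained checkpoint in ./fine_tuned_model next to this script.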