|
|
|
""" |
Create Training Dataset for Transfer Learning

This script creates a proper training dataset of (word, clue) pairs
for fine-tuning FLAN-T5 on crossword clue generation.

This prepares data for real transfer learning (fine-tuning), not just prompting.
""" |
|
|
|
import csv
import json
import random
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
|
|
|
|
|
@dataclass()
class CrosswordExample:
    """One (word, clue) pair used as a single training example.

    ``category`` and ``difficulty`` are optional labels that default to
    ``"general"`` and ``"medium"`` respectively.
    """

    word: str  # answer word the clue refers to
    clue: str  # clue text for that word
    category: str = "general"
    difficulty: str = "medium"
|
|
|
|
|
class CrosswordDatasetCreator:
    """Collect (word, clue) examples and export them for FLAN-T5 fine-tuning.

    Examples accumulate in ``self.examples`` through the ``add_*`` methods.
    ``save_dataset`` writes them as JSON and CSV files into
    ``self.output_dir``.
    """

    # Single source of truth for the model prompt, so the exported records and
    # the console preview cannot drift apart.
    PROMPT_TEMPLATE = "Generate a crossword clue for: {word}"

    def __init__(self, output_dir: Optional[Union[str, Path]] = None):
        """Start with no examples and make sure the output directory exists.

        Args:
            output_dir: Directory that receives the exported files. When
                omitted, ``training_data`` next to this script is used.
        """
        self.examples: List["CrosswordExample"] = []
        if output_dir is None:
            output_dir = Path(__file__).parent / "training_data"
        self.output_dir = Path(output_dir)
        # parents=True lets callers point at a directory tree that does not
        # exist yet; exist_ok=True keeps repeated runs from failing.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _register(self, new_examples: List["CrosswordExample"], label: str) -> None:
        """Append *new_examples* to the collection and report how many were added."""
        self.examples.extend(new_examples)
        print(f"✅ Added {len(new_examples)} {label} examples")

    def add_manual_examples(self):
        """Add manually curated high-quality examples."""
        manual_examples = [
            # People
            CrosswordExample("EINSTEIN", "Relativity physicist", "people"),
            CrosswordExample("MOZART", "Austrian composer", "people"),
            CrosswordExample("SHAKESPEARE", "Hamlet playwright", "people"),
            CrosswordExample("PICASSO", "Cubist painter", "people"),
            CrosswordExample("NAPOLEON", "French emperor", "people"),
            CrosswordExample("CHURCHILL", "British wartime PM", "people"),
            # Geography
            CrosswordExample("PARIS", "French capital", "geography"),
            CrosswordExample("LONDON", "British capital", "geography"),
            CrosswordExample("TOKYO", "Japanese capital", "geography"),
            CrosswordExample("AMAZON", "South American river", "geography"),
            CrosswordExample("SAHARA", "African desert", "geography"),
            CrosswordExample("ALPS", "European mountain range", "geography"),
            # Animals
            CrosswordExample("ELEPHANT", "Large tusked mammal", "animals"),
            CrosswordExample("PENGUIN", "Antarctic bird", "animals"),
            CrosswordExample("DOLPHIN", "Intelligent marine mammal", "animals"),
            CrosswordExample("TIGER", "Striped big cat", "animals"),
            CrosswordExample("EAGLE", "Powerful bird of prey", "animals"),
            # Objects
            CrosswordExample("PIANO", "88-key instrument", "objects"),
            CrosswordExample("GUITAR", "Six-string instrument", "objects"),
            CrosswordExample("TELESCOPE", "Star-viewing device", "objects"),
            CrosswordExample("MICROSCOPE", "Cell-viewing device", "objects"),
            CrosswordExample("BICYCLE", "Two-wheeled vehicle", "objects"),
            # Science
            CrosswordExample("OXYGEN", "Life-sustaining gas", "science"),
            CrosswordExample("GRAVITY", "Force pulling objects down", "science"),
            CrosswordExample("PHOTOSYNTHESIS", "Plant energy process", "science"),
            CrosswordExample("DNA", "Genetic code molecule", "science"),
            CrosswordExample("LASER", "Focused light beam", "science"),
            # Concepts
            CrosswordExample("DEMOCRACY", "Government by the people", "concepts"),
            CrosswordExample("FREEDOM", "State of being free", "concepts"),
            CrosswordExample("JUSTICE", "Fairness under law", "concepts"),
            CrosswordExample("WISDOM", "Deep understanding", "concepts"),
            # Sports
            CrosswordExample("CRICKET", "Bat and ball sport", "sports"),
            CrosswordExample("TENNIS", "Racket sport", "sports"),
            CrosswordExample("FOOTBALL", "Team sport with goals", "sports"),
            CrosswordExample("BASKETBALL", "Hoop-shooting game", "sports"),
            # Food
            CrosswordExample("PIZZA", "Italian bread dish", "food"),
            CrosswordExample("SUSHI", "Japanese raw fish dish", "food"),
            CrosswordExample("CHOCOLATE", "Sweet cocoa treat", "food"),
            CrosswordExample("COFFEE", "Caffeinated morning drink", "food"),
        ]

        self._register(manual_examples, "manual")

    def add_thematic_examples(self):
        """Add examples for different themes/categories."""
        color_examples = [
            CrosswordExample("RED", "Primary color", "colors"),
            CrosswordExample("BLUE", "Sky color", "colors"),
            CrosswordExample("GREEN", "Grass color", "colors"),
            CrosswordExample("YELLOW", "Sun color", "colors"),
            CrosswordExample("PURPLE", "Royal color", "colors"),
            CrosswordExample("ORANGE", "Citrus color", "colors"),
        ]

        # This group mixes two category labels: "numbers" and "math".
        math_examples = [
            CrosswordExample("SEVEN", "Lucky number", "numbers"),
            CrosswordExample("DOZEN", "Twelve items", "numbers"),
            CrosswordExample("CENTURY", "Hundred years", "numbers"),
            CrosswordExample("TRIANGLE", "Three-sided shape", "math"),
            CrosswordExample("CIRCLE", "Round geometric shape", "math"),
        ]

        body_examples = [
            CrosswordExample("HEART", "Pumping organ", "body"),
            CrosswordExample("BRAIN", "Thinking organ", "body"),
            CrosswordExample("EYES", "Seeing organs", "body"),
            CrosswordExample("HANDS", "Grasping appendages", "body"),
        ]

        time_examples = [
            CrosswordExample("MONDAY", "Week starter", "time"),
            CrosswordExample("JANUARY", "Year starter", "time"),
            CrosswordExample("SUMMER", "Hot season", "time"),
            CrosswordExample("MORNING", "Day starter", "time"),
        ]

        all_thematic = color_examples + math_examples + body_examples + time_examples
        self._register(all_thematic, "thematic")

    def add_cricket_examples(self):
        """Add cricket-specific examples for our use case."""
        cricket_examples = [
            CrosswordExample("TENDULKAR", "Indian batting legend", "cricket"),
            CrosswordExample("BRADMAN", "Australian batting great", "cricket"),
            CrosswordExample("KOHLI", "Indian cricket captain", "cricket"),
            CrosswordExample("DHONI", "Indian wicket-keeper captain", "cricket"),
            CrosswordExample("WICKET", "Three stumps and bails", "cricket"),
            CrosswordExample("BOUNDARY", "Four or six runs", "cricket"),
            CrosswordExample("BOWLER", "Ball deliverer", "cricket"),
            CrosswordExample("BATSMAN", "Run scorer", "cricket"),
            CrosswordExample("ASHES", "England-Australia series", "cricket"),
        ]

        self._register(cricket_examples, "cricket")

    def add_scientific_terms(self):
        """Add scientific/technical terms."""
        science_examples = [
            CrosswordExample("OSMOSIS", "Liquid movement through membrane", "science"),
            CrosswordExample("MITOSIS", "Cell division process", "science"),
            CrosswordExample("ENZYME", "Biological catalyst", "science"),
            CrosswordExample("PROTON", "Positive atomic particle", "science"),
            CrosswordExample("NEUTRON", "Neutral atomic particle", "science"),
            CrosswordExample("ELECTRON", "Negative atomic particle", "science"),
            CrosswordExample("CATALYST", "Reaction accelerator", "science"),
            CrosswordExample("MOLECULE", "Chemical compound unit", "science"),
            CrosswordExample("CHROMOSOME", "DNA carrier", "science"),
        ]

        self._register(science_examples, "scientific")

    def format_for_training(self) -> List[Dict[str, str]]:
        """Format examples for FLAN-T5 training.

        Returns:
            One dict per collected example with the keys ``input_text``
            (the prompt), ``target_text`` (the clue), ``word`` and
            ``category``, in collection order.
        """
        return [
            {
                "input_text": self.PROMPT_TEMPLATE.format(word=example.word),
                "target_text": example.clue,
                "word": example.word,
                "category": example.category,
            }
            for example in self.examples
        ]

    def save_dataset(self) -> List[Dict[str, str]]:
        """Save the dataset as JSON and CSV inside ``self.output_dir``.

        Returns:
            The formatted records that were written (see
            ``format_for_training``).
        """
        formatted_data = self.format_for_training()

        # Explicit UTF-8 keeps the files identical regardless of the
        # platform's default text encoding.
        json_file = self.output_dir / "crossword_training_data.json"
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(formatted_data, f, indent=2)

        csv_file = self.output_dir / "crossword_training_data.csv"
        fieldnames = ["word", "clue", "category", "input_text", "target_text"]
        with open(csv_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            # The CSV carries the clue twice: as "clue" and as "target_text".
            writer.writerows(
                {**item, "clue": item["target_text"]} for item in formatted_data
            )

        print("✅ Dataset saved:")
        print(f" JSON: {json_file}")
        print(f" CSV: {csv_file}")
        print(f" Total examples: {len(formatted_data)}")

        return formatted_data

    def show_sample(self, n: int = 5) -> None:
        """Print up to *n* randomly chosen training examples.

        Args:
            n: Maximum number of examples to print. Values above the number
                of collected examples, or below zero, are clamped.
        """
        print("\n📝 Sample Training Examples:")
        print("-" * 50)

        # random.sample rejects negative sizes and sizes above the population.
        sample_size = max(0, min(n, len(self.examples)))
        for example in random.sample(self.examples, sample_size):
            prompt = self.PROMPT_TEMPLATE.format(word=example.word)
            print(f"Input: '{prompt}'")
            print(f"Output: '{example.clue}'")
            print(f"Category: {example.category}")
            print()
|
|
|
|
|
def create_training_dataset() -> List[Dict]:
    """Build the complete training dataset, save it and print a summary.

    Fills a ``CrosswordDatasetCreator`` from every built-in example source,
    prints a small random sample, writes the dataset to disk and reports the
    total and per-category counts.

    Returns:
        The formatted training records returned by
        ``CrosswordDatasetCreator.save_dataset``.
    """
    print("🔨 Creating Crossword Training Dataset for Transfer Learning")
    print("=" * 60)

    creator = CrosswordDatasetCreator()

    # Populate the collection from every built-in example source.
    creator.add_manual_examples()
    creator.add_thematic_examples()
    creator.add_cricket_examples()
    creator.add_scientific_terms()

    # Preview a few random examples before anything is written to disk.
    creator.show_sample(3)

    dataset = creator.save_dataset()

    print("\n📊 Dataset Statistics:")
    print(f"Total examples: {len(dataset)}")

    # Counter replaces the hand-rolled dict.get(...) + 1 tally.
    categories = Counter(example.category for example in creator.examples)

    print("\nBy category:")
    for category, count in sorted(categories.items()):
        print(f" {category}: {count}")

    print("\n🎯 Next Steps:")
    print("1. Run the fine-tuning script with this data")
    print("2. Test on held-out words (PANESAR, RAJOURI, XANTHIC)")
    print("3. Compare with zero-shot prompting results")

    return dataset
|
|
|
|
|
if __name__ == "__main__":
    # Only build and write the dataset when run as a script, not on import.
    create_training_dataset()