#!/usr/bin/env python3 """ Statistical Analysis of Norvig Word Count Files Analyzes a single Norvig word count file (count_1w.txt or count_1w100k.txt) from norvig.com/ngrams/ to understand vocabulary characteristics for crossword generation. Usage: python analyze_norvig_vocabulary.py python analyze_norvig_vocabulary.py --help Examples: python analyze_norvig_vocabulary.py norvig/count_1w100k.txt python analyze_norvig_vocabulary.py norvig/count_1w.txt """ import os import sys import argparse import numpy as np import matplotlib.pyplot as plt import pandas as pd from collections import Counter, defaultdict import seaborn as sns from pathlib import Path # Set style for better plots plt.style.use('seaborn-v0_8') sns.set_palette("husl") def parse_arguments(): """Parse command line arguments""" parser = argparse.ArgumentParser( description='Analyze Norvig word count files for crossword generation', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python analyze_norvig_vocabulary.py norvig/count_1w100k.txt python analyze_norvig_vocabulary.py norvig/count_1w.txt python analyze_norvig_vocabulary.py --help File formats supported: - count_1w100k.txt: Top 100,000 most frequent words - count_1w.txt: Full word count dataset (1M+ words) Output: - Comprehensive statistical analysis - 6-panel visualization saved as norvig_comprehensive_analysis.png - Summary statistics printed to console """ ) parser.add_argument( 'filename', help='Path to Norvig word count file (e.g., norvig/count_1w100k.txt)' ) return parser.parse_args() def load_word_counts(filepath): """Load word count file and return dict of {word: count}""" word_counts = {} total_lines = 0 print(f"Loading {filepath}...") try: with open(filepath, 'r', encoding='utf-8') as f: for line in f: total_lines += 1 parts = line.strip().split('\t') if len(parts) == 2: word, count = parts word_counts[word.upper()] = int(count) elif len(parts) == 1 and line.strip(): # Handle case where count might be missing word = parts[0] word_counts[word.upper()] = 1 print(f"✅ Loaded {len(word_counts):,} words from {filepath}") return word_counts except FileNotFoundError: print(f"❌ File not found: {filepath}") return {} except Exception as e: print(f"❌ Error loading {filepath}: {e}") return {} def analyze_word_lengths(words): """Analyze distribution of word lengths""" lengths = [len(word) for word in words] length_dist = Counter(lengths) return lengths, length_dist def classify_difficulty(rank, total_words): """Classify word difficulty based on frequency rank""" if rank <= total_words * 0.05: # Top 5% return "Very Easy" elif rank <= total_words * 0.20: # Top 20% return "Easy" elif rank <= total_words * 0.60: # Top 60% return "Medium" elif rank <= total_words * 0.85: # Top 85% return "Hard" else: return "Very Hard" def create_comprehensive_analysis(word_counts, filename, base_dir): """Create comprehensive statistical analysis with readable plots""" # Create figure with subplots - 2x3 layout with good spacing fig = plt.figure(figsize=(18, 12)) fig.suptitle(f'Norvig Word Count Analysis - {filename}', fontsize=16, fontweight='bold', y=0.95) # Convert to sorted lists for analysis words = list(word_counts.keys()) counts = list(word_counts.values()) ranks = list(range(1, len(counts) + 1)) # 1. Zipf's Law Analysis (log-log plot) ax1 = plt.subplot(2, 3, 1) plt.loglog(ranks, counts, 'b-', alpha=0.7, linewidth=2) plt.xlabel('Rank (log scale)') plt.ylabel('Frequency (log scale)') plt.title('Zipf\'s Law Validation', fontweight='bold') plt.grid(True, alpha=0.3) # Add theoretical Zipf line for comparison theoretical_zipf = [counts[0] / r for r in ranks] plt.loglog(ranks, theoretical_zipf, 'r--', alpha=0.5, label='Theoretical') plt.legend() # 2. Word Length Distribution ax2 = plt.subplot(2, 3, 2) lengths, length_dist = analyze_word_lengths(words) lengths_list = sorted(length_dist.keys()) counts_list = [length_dist[l] for l in lengths_list] bars = plt.bar(lengths_list, counts_list, alpha=0.7, color='skyblue', edgecolor='navy') plt.xlabel('Word Length (characters)') plt.ylabel('Number of Words') plt.title('Word Length Distribution', fontweight='bold') # Highlight crossword-suitable range (3-12 letters) for i, bar in enumerate(bars): if 3 <= lengths_list[i] <= 12: bar.set_color('lightgreen') elif lengths_list[i] < 3 or lengths_list[i] > 15: bar.set_color('lightcoral') plt.axvspan(3, 12, alpha=0.2, color='green', label='Crossword Range') plt.legend() # 3. Difficulty Distribution ax3 = plt.subplot(2, 3, 3) difficulty_dist = defaultdict(int) for rank in ranks: difficulty = classify_difficulty(rank, len(ranks)) difficulty_dist[difficulty] += 1 diff_labels = list(difficulty_dist.keys()) diff_counts = list(difficulty_dist.values()) colors = ['darkgreen', 'green', 'orange', 'red', 'darkred'] wedges, texts, autotexts = plt.pie(diff_counts, labels=diff_labels, autopct='%1.1f%%', colors=colors[:len(diff_labels)], startangle=90) plt.title('Difficulty Distribution', fontweight='bold') # 4. Cumulative Frequency Coverage ax4 = plt.subplot(2, 3, 4) cumulative_freq = np.cumsum(counts) total_freq = cumulative_freq[-1] coverage_pct = (cumulative_freq / total_freq) * 100 plt.plot(ranks, coverage_pct, 'g-', linewidth=2) plt.xlabel('Vocabulary Size') plt.ylabel('Coverage (%)') plt.title('Cumulative Coverage', fontweight='bold') plt.grid(True, alpha=0.3) # Add key milestone markers milestones = [1000, 5000, 10000, 25000, 50000] for milestone in milestones: if milestone < len(coverage_pct): plt.axvline(x=milestone, color='red', linestyle='--', alpha=0.5) # 5. Crossword Suitability ax5 = plt.subplot(2, 3, 5) crossword_suitable = {word: count for word, count in word_counts.items() if 3 <= len(word) <= 12 and word.isalpha()} total_words = len(word_counts) suitable_words = len(crossword_suitable) unsuitable_words = total_words - suitable_words labels = [f'Suitable\n{suitable_words:,}', f'Not Suitable\n{unsuitable_words:,}'] sizes = [suitable_words, unsuitable_words] colors = ['lightgreen', 'lightcoral'] wedges, texts, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90) plt.title('Crossword Suitability', fontweight='bold') # 6. Difficulty Categories for Crosswords ax6 = plt.subplot(2, 3, 6) # Define crossword difficulty thresholds easy_threshold = 5000 medium_threshold = 25000 easy_words = sum(1 for i, word in enumerate(words[:easy_threshold]) if 3 <= len(word) <= 12 and i < len(words)) medium_words = sum(1 for i, word in enumerate(words[easy_threshold:medium_threshold]) if 3 <= len(word) <= 12 and (i + easy_threshold) < len(words)) hard_words = sum(1 for i, word in enumerate(words[medium_threshold:]) if 3 <= len(word) <= 12 and (i + medium_threshold) < len(words)) categories = ['Easy', 'Medium', 'Hard'] word_counts_cat = [easy_words, medium_words, hard_words] colors_cat = ['lightgreen', 'gold', 'lightcoral'] bars = plt.bar(categories, word_counts_cat, color=colors_cat, alpha=0.8) plt.ylabel('Crossword Words') plt.title('Difficulty Categories\n(Based on Frequency Rank)', fontweight='bold') # Add value labels on bars for bar, count in zip(bars, word_counts_cat): height = bar.get_height() if height > 0: plt.text(bar.get_x() + bar.get_width()/2, height + max(word_counts_cat)*0.02, f'{count:,}', ha='center', va='bottom', fontweight='bold') # Add explanation text box with examples # Get some example words for each category easy_examples = [w for i, w in enumerate(words[:100]) if 3 <= len(w) <= 12][:3] medium_examples = [w for i, w in enumerate(words[7000:12000]) if 3 <= len(w) <= 12][:3] hard_examples = [w for i, w in enumerate(words[30000:35000]) if 3 <= len(w) <= 12][:3] explanation = (f'Easy: Ranks 1-5,000 (most frequent)\n' f' e.g., {", ".join(easy_examples[:3])}\n' f'Medium: Ranks 5,001-25,000\n' f' e.g., {", ".join(medium_examples[:3])}\n' f'Hard: Ranks 25,001+ (least frequent)\n' f' e.g., {", ".join(hard_examples[:3])}\n\n' 'Lower rank = higher frequency = easier') plt.text(0.98, 0.98, explanation, transform=ax6.transAxes, fontsize=8, verticalalignment='top', horizontalalignment='right', bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.9)) # Adjust layout with proper spacing plt.subplots_adjust(left=0.08, bottom=0.08, right=0.95, top=0.88, wspace=0.35, hspace=0.45) # Save the comprehensive analysis with filename in the output name # Extract base name and create clean output filename if 'count_1w100k' in filename: output_name = 'norvig_analysis_100k.png' elif 'count_1w.txt' in filename: output_name = 'norvig_analysis_full.png' else: # Fallback for any other filename - make it filesystem safe safe_name = filename.replace('.txt', '').replace('/', '_').replace('count_', '') output_name = f'norvig_analysis_{safe_name}.png' output_path = base_dir / output_name plt.savefig(output_path, dpi=300, bbox_inches='tight') print(f"📊 Comprehensive analysis saved to: {output_path}") return fig, crossword_suitable def print_summary_statistics(word_counts, filename, crossword_suitable): """Print comprehensive summary statistics""" print("\n" + "="*80) print("📊 NORVIG VOCABULARY STATISTICAL ANALYSIS") print(f"📁 File: {filename}") print("="*80) # Basic statistics total_words = len(word_counts) total_frequency = sum(word_counts.values()) print(f"\n📚 BASIC STATISTICS:") print(f" • Total words: {total_words:,}") print(f" • Total frequency: {total_frequency:,}") print(f" • Average frequency: {total_frequency/total_words:.2f}") # Word length analysis lengths, length_dist = analyze_word_lengths(word_counts.keys()) avg_length = np.mean(lengths) crossword_length_words = sum(count for length, count in length_dist.items() if 3 <= length <= 12) crossword_length_pct = (crossword_length_words / total_words) * 100 print(f"\n📏 WORD LENGTH ANALYSIS:") print(f" • Average word length: {avg_length:.1f} characters") print(f" • Words 3-12 characters: {crossword_length_words:,} ({crossword_length_pct:.1f}%)") print(f" • Most common lengths: {sorted(length_dist.items(), key=lambda x: x[1], reverse=True)[:5]}") # Crossword suitability suitable_count = len(crossword_suitable) suitable_pct = (suitable_count / total_words) * 100 suitable_freq = sum(crossword_suitable.values()) suitable_freq_pct = (suitable_freq / total_frequency) * 100 print(f"\n🧩 CROSSWORD SUITABILITY:") print(f" • Suitable words (3-12 letters, alphabetic): {suitable_count:,} ({suitable_pct:.1f}%)") print(f" • Suitable word frequency coverage: {suitable_freq_pct:.1f}%") # Difficulty distribution for crosswords easy_words = len([w for w, c in list(crossword_suitable.items())[:5000]]) medium_words = len([w for w, c in list(crossword_suitable.items())[5000:25000]]) hard_words = len([w for w, c in list(crossword_suitable.items())[25000:]]) print(f"\n🎯 CROSSWORD DIFFICULTY DISTRIBUTION:") print(f" • Easy (rank 1-5K): {easy_words:,} words") print(f" • Medium (rank 5K-25K): {medium_words:,} words") print(f" • Hard (rank 25K+): {hard_words:,} words") # Top and bottom words examples words_list = list(word_counts.keys()) print(f"\n🔝 TOP 10 MOST FREQUENT WORDS:") for i, word in enumerate(words_list[:10], 1): print(f" {i:2d}. {word:<12} ({word_counts[word]:,})") print(f"\n🔚 BOTTOM 10 LEAST FREQUENT WORDS:") for i, word in enumerate(words_list[-10:], 1): print(f" {i:2d}. {word:<12} ({word_counts[word]:,})") # Zipf's law validation words_list = list(word_counts.keys()) counts_list = list(word_counts.values()) # Calculate correlation coefficient for log-log relationship log_ranks = np.log(range(1, len(counts_list) + 1)) log_freqs = np.log(counts_list) correlation = np.corrcoef(log_ranks, log_freqs)[0, 1] print(f"\n📈 ZIPF'S LAW VALIDATION:") print(f" • Log-log correlation: {correlation:.4f}") print(f" • Zipf compliance: {'✅ Excellent' if abs(correlation) > 0.95 else '⚠️ Moderate' if abs(correlation) > 0.8 else '❌ Poor'}") # Recommendations print(f"\n💡 RECOMMENDATIONS FOR CROSSWORD GENERATION:") print(f" • Dataset size: {total_words:,} words with excellent coverage") print(f" • Filter to 3-12 letters: Reduces to {suitable_count:,} words ({suitable_pct:.1f}%)") print(f" • Difficulty thresholds (for crossword-suitable words):") print(f" - Easy: ranks 1-5,000 ({easy_words:,} suitable words)") print(f" - Medium: ranks 5,001-25,000 ({medium_words:,} suitable words)") print(f" - Hard: ranks 25,001+ ({hard_words:,} suitable words)") print(f" • Quality: ✅ No garbage entries (unlike crossword-specific lists)") print(f" • Source credibility: ✅ Peter Norvig (Google) + Google Books corpus") print("="*80) def main(): """Main analysis function""" # Parse command line arguments args = parse_arguments() # File paths base_dir = Path(__file__).parent input_file = Path(args.filename) # Make path relative to script directory if not absolute if not input_file.is_absolute(): input_file = base_dir / input_file print("🔍 Norvig Vocabulary Statistical Analysis") print("=" * 50) print(f"📁 Analyzing: {input_file}") # Load data word_counts = load_word_counts(input_file) if not word_counts: print(f"❌ Could not load word list from {input_file}. Please check file path.") return # Create comprehensive analysis fig, crossword_suitable = create_comprehensive_analysis(word_counts, input_file.name, base_dir) # Print summary statistics print_summary_statistics(word_counts, input_file.name, crossword_suitable) # Don't show plot interactively in CLI, just save it # plt.show() # Comment out for CLI usage # Generate the same output filename logic for final message if 'count_1w100k' in input_file.name: output_name = 'norvig_analysis_100k.png' elif 'count_1w.txt' in input_file.name: output_name = 'norvig_analysis_full.png' else: safe_name = input_file.name.replace('.txt', '').replace('/', '_').replace('count_', '') output_name = f'norvig_analysis_{safe_name}.png' print(f"\n✅ Analysis complete! Check {base_dir}/{output_name} for detailed plots.") if __name__ == "__main__": main()