from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np from typing import List, Optional import pandas as pd def cosine_sim_wer(references: List[str], predictions: List[str]) -> float: """ Calculate a WER-like metric based on cosine similarity between reference and prediction texts. This function computes character-level n-gram similarities between each reference-prediction pair and returns an error rate (100% - average similarity). Handles empty inputs and provides detailed similarity statistics. Args: references: List of reference transcript strings predictions: List of model prediction strings Returns: float: Error rate based on cosine similarity (100% - average similarity) Example: >>> references = ["hello world", "good morning"] >>> predictions = ["hello world", "good evening"] >>> error_rate = cosine_sim_wer(references, predictions) """ # Validate and clean inputs valid_refs, valid_preds = [], [] for ref, pred in zip(references, predictions): if not ref.strip() or not pred.strip(): continue # Skip empty strings valid_refs.append(ref.strip()) valid_preds.append(pred.strip()) # Handle case with no valid pairs if not valid_refs: print("Warning: No valid reference-prediction pairs found") return 100.0 # Maximum error if no valid data # Calculate pairwise similarities similarities = [] for ref, pred in zip(valid_refs, valid_preds): try: # Use character-level n-grams (2-3 chars) for robust comparison vectorizer = CountVectorizer( analyzer='char_wb', # Word-boundary aware character n-grams ngram_range=(2, 3) # Bigrams and trigrams ) # Create document-term matrices vectors = vectorizer.fit_transform([ref, pred]) # Compute cosine similarity similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] similarities.append(similarity * 100) # Convert to percentage except Exception as e: print(f"Error calculating similarity: {e}") similarities.append(0.0) # Default to 0% similarity on error # Compute statistics avg_similarity = np.mean(similarities) min_similarity = np.min(similarities) max_similarity = np.max(similarities) error_rate = 100.0 - avg_similarity # Print diagnostics print(f"Similarity Statistics:") print(f" - Average: {avg_similarity:.2f}%") print(f" - Range: {min_similarity:.2f}% to {max_similarity:.2f}%") print(f" - Valid samples: {len(similarities)}/{len(references)}") return error_rate def create_wer_analysis_dataframe( references: List[str], predictions: List[str], normalized_references: Optional[List[str]] = None, normalized_predictions: Optional[List[str]] = None, output_csv: str = "wer_analysis.csv" ) -> pd.DataFrame: """ Create a comprehensive DataFrame comparing reference and prediction texts with multiple metrics. For each sample, calculates: - Word Error Rate (WER) for original and normalized texts - Cosine similarity for original and normalized texts - Length statistics and differences Args: references: List of original reference texts predictions: List of original prediction texts normalized_references: Optional list of normalized reference texts normalized_predictions: Optional list of normalized prediction texts output_csv: Path to save results (None to skip saving) Returns: pd.DataFrame: Analysis results with one row per sample Example: >>> df = create_wer_analysis_dataframe( ... references=["hello world"], ... predictions=["hello there"], ... output_csv="analysis.csv" ... ) """ from jiwer import wer # Import here to avoid dependency if not using WER records = [] for i, (ref, pred) in enumerate(zip(references, predictions)): # Skip empty samples if not ref.strip() or not pred.strip(): continue # Get normalized versions if provided norm_ref = normalized_references[i] if normalized_references else ref norm_pred = normalized_predictions[i] if normalized_predictions else pred # Calculate metrics with error handling metrics = { 'index': i, 'reference': ref, 'prediction': pred, 'normalized_reference': norm_ref, 'normalized_prediction': norm_pred, 'ref_length': len(ref.split()), 'pred_length': len(pred.split()), 'length_difference': len(pred.split()) - len(ref.split()) } # Calculate WER metrics try: metrics['wer'] = wer(ref, pred) * 100 metrics['normalized_wer'] = wer(norm_ref, norm_pred) * 100 except Exception as e: print(f"WER calculation failed for sample {i}: {e}") metrics.update({'wer': np.nan, 'normalized_wer': np.nan}) # Calculate cosine similarities for prefix, text1, text2 in [ ('', ref, pred), ('normalized_', norm_ref, norm_pred) ]: try: vectorizer = CountVectorizer( analyzer='char_wb', ngram_range=(2, 3) vectors = vectorizer.fit_transform([text1, text2]) similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100 metrics[f'{prefix}similarity'] = similarity except Exception as e: print(f"Similarity calculation failed for sample {i}: {e}") metrics[f'{prefix}similarity'] = np.nan records.append(metrics) # Create DataFrame df = pd.DataFrame(records) # Save to CSV if requested if output_csv: try: df.to_csv(output_csv, index=False) print(f"Analysis saved to {output_csv}") except Exception as e: print(f"Failed to save CSV: {e}") return df