from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def cosine_sim_wer(references: List[str], predictions: List[str]) -> float:
    """
    Calculate a WER-like metric based on cosine similarity between reference and prediction texts.

    This function computes character-level n-gram cosine similarities for each
    reference-prediction pair and returns an error rate (100% minus the average similarity).
    Empty or whitespace-only pairs are skipped, and summary similarity statistics are printed.

    Args:
        references: List of reference transcript strings
        predictions: List of model prediction strings

    Returns:
        float: Error rate based on cosine similarity (100% - average similarity)

    Example:
        >>> references = ["hello world", "good morning"]
        >>> predictions = ["hello world", "good evening"]
        >>> error_rate = cosine_sim_wer(references, predictions)
    """
    # Validate and clean inputs
    valid_refs, valid_preds = [], []
    for ref, pred in zip(references, predictions):
        if not ref.strip() or not pred.strip():
            continue  # Skip empty strings
        valid_refs.append(ref.strip())
        valid_preds.append(pred.strip())

    # Handle case with no valid pairs
    if not valid_refs:
        print("Warning: No valid reference-prediction pairs found")
        return 100.0  # Maximum error if no valid data

    # Calculate pairwise similarities
    similarities = []
    for ref, pred in zip(valid_refs, valid_preds):
        try:
            # Use character-level n-grams (2-3 chars) for robust comparison
            vectorizer = CountVectorizer(
                analyzer='char_wb',  # Word-boundary-aware character n-grams
                ngram_range=(2, 3)   # Bigrams and trigrams
            )
            # Create document-term matrices
            vectors = vectorizer.fit_transform([ref, pred])
            # Compute cosine similarity between the two rows
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
            similarities.append(similarity * 100)  # Convert to percentage
        except Exception as e:
            print(f"Error calculating similarity: {e}")
            similarities.append(0.0)  # Default to 0% similarity on error

    # Compute statistics
    avg_similarity = np.mean(similarities)
    min_similarity = np.min(similarities)
    max_similarity = np.max(similarities)
    error_rate = 100.0 - avg_similarity

    # Print diagnostics
    print("Similarity Statistics:")
    print(f" - Average: {avg_similarity:.2f}%")
    print(f" - Range: {min_similarity:.2f}% to {max_similarity:.2f}%")
    print(f" - Valid samples: {len(similarities)}/{len(references)}")

    return error_rate


def create_wer_analysis_dataframe(
    references: List[str],
    predictions: List[str],
    normalized_references: Optional[List[str]] = None,
    normalized_predictions: Optional[List[str]] = None,
    output_csv: Optional[str] = "wer_analysis.csv"
) -> pd.DataFrame:
"""
Create a comprehensive DataFrame comparing reference and prediction texts with multiple metrics.
For each sample, calculates:
- Word Error Rate (WER) for original and normalized texts
- Cosine similarity for original and normalized texts
- Length statistics and differences
Args:
references: List of original reference texts
predictions: List of original prediction texts
normalized_references: Optional list of normalized reference texts
normalized_predictions: Optional list of normalized prediction texts
output_csv: Path to save results (None to skip saving)
Returns:
pd.DataFrame: Analysis results with one row per sample
Example:
>>> df = create_wer_analysis_dataframe(
... references=["hello world"],
... predictions=["hello there"],
... output_csv="analysis.csv"
... )
"""
from jiwer import wer # Import here to avoid dependency if not using WER
records = []
for i, (ref, pred) in enumerate(zip(references, predictions)):
# Skip empty samples
if not ref.strip() or not pred.strip():
continue
# Get normalized versions if provided
norm_ref = normalized_references[i] if normalized_references else ref
norm_pred = normalized_predictions[i] if normalized_predictions else pred
# Calculate metrics with error handling
metrics = {
'index': i,
'reference': ref,
'prediction': pred,
'normalized_reference': norm_ref,
'normalized_prediction': norm_pred,
'ref_length': len(ref.split()),
'pred_length': len(pred.split()),
'length_difference': len(pred.split()) - len(ref.split())
}
# Calculate WER metrics
try:
metrics['wer'] = wer(ref, pred) * 100
metrics['normalized_wer'] = wer(norm_ref, norm_pred) * 100
except Exception as e:
print(f"WER calculation failed for sample {i}: {e}")
metrics.update({'wer': np.nan, 'normalized_wer': np.nan})
# Calculate cosine similarities
for prefix, text1, text2 in [
('', ref, pred),
('normalized_', norm_ref, norm_pred)
]:
try:
                vectorizer = CountVectorizer(
                    analyzer='char_wb',
                    ngram_range=(2, 3)
                )
                vectors = vectorizer.fit_transform([text1, text2])
                similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
                metrics[f'{prefix}similarity'] = similarity
            except Exception as e:
                print(f"Similarity calculation failed for sample {i}: {e}")
                metrics[f'{prefix}similarity'] = np.nan
        records.append(metrics)

    # Create DataFrame
    df = pd.DataFrame(records)

    # Save to CSV if requested
    if output_csv:
        try:
            df.to_csv(output_csv, index=False)
            print(f"Analysis saved to {output_csv}")
        except Exception as e:
            print(f"Failed to save CSV: {e}")

    return df
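

# Illustrative usage sketch: the sample transcripts below and the "demo_wer_analysis.csv"
# output path are placeholders chosen for demonstration, not values from the original module,
# and the DataFrame step assumes the jiwer package is installed.
if __name__ == "__main__":
    refs = ["the quick brown fox", "speech recognition is hard"]
    preds = ["the quick brown fox", "speech recognition is heard"]

    # Corpus-level error rate from character n-gram cosine similarity
    cosine_error = cosine_sim_wer(refs, preds)
    print(f"Cosine-similarity error rate: {cosine_error:.2f}%")

    # Per-sample WER and similarity breakdown, written to CSV
    analysis_df = create_wer_analysis_dataframe(
        references=refs,
        predictions=preds,
        output_csv="demo_wer_analysis.csv"
    )
    print(analysis_df[['wer', 'similarity', 'length_difference']])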