# MohamedRashad's picture
# chore: Add requirements for shakkala and kaldialign
# bcc0c7f
"""
@author
______ _ _
| ____| (_) /\ | |
| |__ __ _ _ __ _ ___ / \ | | __ _ ___ _ __ ___ __ _ _ __ _ _
| __/ _` | '__| / __| / /\ \ | |/ _` / __| '_ ` _ \ / _` | '__| | | |
| | | (_| | | | \__ \ / ____ \| | (_| \__ \ | | | | | (_| | | | |_| |
|_| \__,_|_| |_|___/ /_/ \_\_|\__,_|___/_| |_| |_|\__,_|_| \__, |
__/ |
|___/
Email: [email protected]
Date: Mar 15, 2022
"""
# pip install git+https://github.com/pzelasko/kaldialign.git
from kaldialign import edit_distance
def cer(ref, hyp):
    """
    Computes the Character Error Rate (CER) of a hypothesis against a
    reference, based on the character-level edit distance.

    Every space character is removed from both strings (and any other
    leading/trailing whitespace is stripped) before the comparison, so only
    the non-space characters are aligned.

    Arguments:
        ref (string): a space-separated ground truth string
        hyp (string): a space-separated hypothesis

    Returns:
        dict with the keys:
            'insertions', 'deletions', 'substitutions': error counts
                reported by ``edit_distance``
            'distance': total edit distance reported by ``edit_distance``
            'ref_length': number of reference characters, as a float
            'Error Rate': distance / ref_length as a percentage. When the
                reference is empty this is 0.0 if the hypothesis is empty
                too, and ``float('inf')`` otherwise.
    """
    ref = ref.replace(' ', '').strip()
    hyp = hyp.replace(' ', '').strip()

    info = edit_distance(ref, hyp)
    distance = info['total']
    ref_length = float(len(ref))

    if ref_length:
        error_rate = (distance / ref_length) * 100
    else:
        # An empty reference would otherwise raise ZeroDivisionError; the
        # rate is undefined, so report 0 for a perfect (empty) match and
        # infinity when the hypothesis contains spurious characters.
        error_rate = 0.0 if distance == 0 else float('inf')

    return {
        'insertions': info['ins'],
        'deletions': info['del'],
        'substitutions': info['sub'],
        'distance': distance,
        'ref_length': ref_length,
        'Error Rate': error_rate,
    }
def wer(ref, hyp):
    """
    Computes the Word Error Rate (WER), defined as the edit distance between
    the two provided sentences after tokenizing to words.

    Both sentences are tokenized with ``str.split()`` (any run of whitespace
    separates words) and the resulting word sequences are aligned.

    Arguments:
        ref (string): a space-separated ground truth string
        hyp (string): a space-separated hypothesis

    Returns:
        dict with the keys:
            'insertions', 'deletions', 'substitutions': error counts
                reported by ``edit_distance``
            'distance': total edit distance reported by ``edit_distance``
            'ref_length': number of reference words, as a float
            'Error Rate': distance / ref_length as a percentage. When the
                reference has no words this is 0.0 if the hypothesis has
                none either, and ``float('inf')`` otherwise.
    """
    ref_words = ref.split()
    hyp_words = hyp.split()

    # kaldialign's edit_distance works on sequences of arbitrary hashable
    # symbols, so the word lists are passed as-is. Encoding each word as a
    # single character (needed only by string-only Levenshtein packages) is
    # unnecessary and breaks once the vocabulary exceeds chr()'s range.
    info = edit_distance(ref_words, hyp_words)
    distance = info['total']
    ref_length = float(len(ref_words))

    if ref_length:
        error_rate = (distance / ref_length) * 100
    else:
        # An empty reference would otherwise raise ZeroDivisionError; the
        # rate is undefined, so report 0 for a perfect (empty) match and
        # infinity when the hypothesis contains spurious words.
        error_rate = 0.0 if distance == 0 else float('inf')

    return {
        'insertions': info['ins'],
        'deletions': info['del'],
        'substitutions': info['sub'],
        'distance': distance,
        'ref_length': ref_length,
        'Error Rate': error_rate,
    }