|
""" |
|
@author |
|
______ _ _ |
|
| ____| (_) /\ | | |
|
| |__ __ _ _ __ _ ___ / \ | | __ _ ___ _ __ ___ __ _ _ __ _ _ |
|
| __/ _` | '__| / __| / /\ \ | |/ _` / __| '_ ` _ \ / _` | '__| | | | |
|
| | | (_| | | | \__ \ / ____ \| | (_| \__ \ | | | | | (_| | | | |_| | |
|
|_| \__,_|_| |_|___/ /_/ \_\_|\__,_|___/_| |_| |_|\__,_|_| \__, | |
|
__/ | |
|
|___/ |
|
Email: [email protected] |
|
Date: Mar 15, 2022 |
|
""" |
|
|
|
|
|
|
|
from kaldialign import edit_distance |
|
|
|
|
|
def cer(ref, hyp):
    """
    Computes the Character Error Rate (CER) of a hypothesis against a
    reference: the character-level edit distance divided by the number of
    reference characters, expressed as a percentage.

    Every space character is removed from both strings (and any remaining
    leading/trailing whitespace is stripped) before the comparison, so
    spaces never count as errors.

    Arguments:
        ref (string): a space-separated ground truth string
        hyp (string): a space-separated hypothesis

    Returns:
        dict: with the keys
            'insertions', 'deletions', 'substitutions', 'distance':
                the 'ins', 'del', 'sub' and 'total' counts reported by
                ``edit_distance``.
            'ref_length': number of reference characters (float).
            'Error Rate': ``distance / ref_length * 100``. When the
                reference is empty this is 0.0 if the hypothesis matches
                it (distance 0) and ``float('inf')`` otherwise, instead
                of raising ``ZeroDivisionError``.
    """
    ref = ref.replace(' ', '').strip()
    hyp = hyp.replace(' ', '').strip()

    info = edit_distance(ref, hyp)
    distance = info['total']
    ref_length = float(len(ref))

    if ref_length:
        error_rate = (distance / ref_length) * 100
    else:
        # No reference characters to normalise by: a perfect (empty) match
        # has zero error, anything else is unboundedly wrong.
        error_rate = float('inf') if distance else 0.0

    return {
        'insertions': info['ins'],
        'deletions': info['del'],
        'substitutions': info['sub'],
        'distance': distance,
        'ref_length': ref_length,
        'Error Rate': error_rate,
    }
|
|
|
|
|
def wer(ref, hyp):
    """
    Computes the Word Error Rate (WER) of a hypothesis against a reference:
    the word-level edit distance divided by the number of reference words,
    expressed as a percentage.

    Both strings are tokenized with ``str.split()`` (any run of whitespace
    separates words) and the resulting word lists are compared directly.

    Arguments:
        ref (string): a space-separated ground truth string
        hyp (string): a space-separated hypothesis

    Returns:
        dict: with the keys
            'insertions', 'deletions', 'substitutions', 'distance':
                the 'ins', 'del', 'sub' and 'total' counts reported by
                ``edit_distance``.
            'ref_length': number of reference words (float).
            'Error Rate': ``distance / ref_length * 100``. When the
                reference has no words this is 0.0 if the hypothesis has
                none either (distance 0) and ``float('inf')`` otherwise,
                instead of raising ``ZeroDivisionError``.
    """
    ref_words = ref.split()
    hyp_words = hyp.split()

    # edit_distance works on sequences of tokens, so the word lists are
    # passed as-is. Encoding each word as chr(index) is unnecessary and
    # fails with ValueError once the vocabulary exceeds the Unicode range.
    info = edit_distance(ref_words, hyp_words)
    distance = info['total']
    ref_length = float(len(ref_words))

    if ref_length:
        error_rate = (distance / ref_length) * 100
    else:
        # No reference words to normalise by: a perfect (empty) match has
        # zero error, anything else is unboundedly wrong.
        error_rate = float('inf') if distance else 0.0

    return {
        'insertions': info['ins'],
        'deletions': info['del'],
        'substitutions': info['sub'],
        'distance': distance,
        'ref_length': ref_length,
        'Error Rate': error_rate,
    }
|
|
|
|