Spaces:

Billpai
/

test2

Build error

test2 / modules /whisper_extractor /normalizers /basic.py

Billpai

test

f196feb 9 months ago

2.76 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	# This module is modified from [Whisper](https://github.com/openai/whisper.git).

	# ## Citations

	# ```bibtex
	# @inproceedings{openai-whisper,
	# author = {Alec Radford and
	# Jong Wook Kim and
	# Tao Xu and
	# Greg Brockman and
	# Christine McLeavey and
	# Ilya Sutskever},
	# title = {Robust Speech Recognition via Large-Scale Weak Supervision},
	# booktitle = {{ICML}},
	# series = {Proceedings of Machine Learning Research},
	# volume = {202},
	# pages = {28492--28518},
	# publisher = {{PMLR}},
	# year = {2023}
	# }
	# ```
	#

	import re
	import unicodedata

	import regex

	# non-ASCII letters that are not separated by "NFKD" normalization
	ADDITIONAL_DIACRITICS = {
	    # One row per letter: lowercase form first, then its capital.
	    "œ": "oe", "Œ": "OE",  # oe ligature
	    "ø": "o", "Ø": "O",  # o with stroke
	    "æ": "ae", "Æ": "AE",  # ae ligature
	    "ß": "ss", "ẞ": "SS",  # sharp s
	    "đ": "d", "Đ": "D",  # d with stroke
	    "ð": "d", "Ð": "D",  # eth
	    # NOTE: unlike the other capitals, capital thorn maps to lowercase "th".
	    "þ": "th", "Þ": "th",  # thorn
	    "ł": "l", "Ł": "L",  # l with stroke
	}


	def remove_symbols_and_diacritics(s: str, keep=""):
	    """Strip diacritics and blank out marks, symbols and punctuation.

	    The input is NFKD-normalized and then handled one character at a time.
	    The first rule that matches decides the output for that character:

	    1. a character contained in ``keep`` passes through unchanged;
	    2. a key of ``ADDITIONAL_DIACRITICS`` is replaced by its mapped text;
	    3. a character of Unicode category ``Mn`` is dropped;
	    4. any other character whose category starts with ``M``, ``S`` or ``P``
	       becomes a single space;
	    5. everything else is kept as is.
	    """

	    def _translate(ch):
	        # Rule order matters: ``keep`` wins over every other rule.
	        if ch in keep:
	            return ch
	        if ch in ADDITIONAL_DIACRITICS:
	            return ADDITIONAL_DIACRITICS[ch]
	        category = unicodedata.category(ch)
	        if category == "Mn":
	            return ""
	        if category[0] in "MSP":
	            return " "
	        return ch

	    return "".join(map(_translate, unicodedata.normalize("NFKD", s)))


	def remove_symbols(s: str):
	    """Blank out marks, symbols and punctuation while keeping diacritics.

	    The input is NFKC-normalized; afterwards every character whose Unicode
	    category starts with ``M``, ``S`` or ``P`` is replaced by a single space
	    and all other characters are copied through unchanged.
	    """
	    pieces = []
	    for ch in unicodedata.normalize("NFKC", s):
	        is_symbol = unicodedata.category(ch).startswith(("M", "S", "P"))
	        pieces.append(" " if is_symbol else ch)
	    return "".join(pieces)


	class BasicTextNormalizer:
	    """Callable that lowercases text and strips bracketed spans and symbols.

	    ``remove_diacritics`` selects the cleaning function stored in
	    ``self.clean``: ``remove_symbols_and_diacritics`` when true, otherwise
	    ``remove_symbols``. ``split_letters`` controls whether the cleaned text
	    is split into ``\\X`` matches joined by single spaces.
	    """

	    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
	        if remove_diacritics:
	            self.clean = remove_symbols_and_diacritics
	        else:
	            self.clean = remove_symbols
	        self.split_letters = split_letters

	    def __call__(self, s: str):
	        """Return the normalized form of ``s``."""
	        text = s.lower()

	        # Drop spans enclosed in <...> / [...] first, then non-empty (...) spans.
	        for enclosed in (r"[<\[][^>\]]*[>\]]", r"\(([^)]+?)\)"):
	            text = re.sub(enclosed, "", text)

	        text = self.clean(text).lower()

	        if self.split_letters:
	            graphemes = regex.findall(r"\X", text, regex.U)
	            text = " ".join(graphemes)

	        # Collapse every run of whitespace into one space (no stripping).
	        return re.sub(r"\s+", " ", text)