Spaces:
Running
Running
| import time | |
| import librosa | |
| import torch | |
| import torch.nn.functional as F | |
| import soundfile as sf | |
| import logging | |
| logging.getLogger("numba").setLevel(logging.WARNING) | |
| from transformers import ( | |
| Wav2Vec2FeatureExtractor, | |
| HubertModel, | |
| ) | |
| import utils | |
| import torch.nn as nn | |
| cnhubert_base_path = None | |
| class CNHubert(nn.Module): | |
| def __init__(self): | |
| super().__init__() | |
| self.model = HubertModel.from_pretrained(cnhubert_base_path) | |
| self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( | |
| cnhubert_base_path | |
| ) | |
| def forward(self, x): | |
| input_values = self.feature_extractor( | |
| x, return_tensors="pt", sampling_rate=16000 | |
| ).input_values.to(x.device) | |
| feats = self.model(input_values)["last_hidden_state"] | |
| return feats | |
| # class CNHubertLarge(nn.Module): | |
| # def __init__(self): | |
| # super().__init__() | |
| # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") | |
| # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") | |
| # def forward(self, x): | |
| # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) | |
| # feats = self.model(input_values)["last_hidden_state"] | |
| # return feats | |
| # | |
| # class CVec(nn.Module): | |
| # def __init__(self): | |
| # super().__init__() | |
| # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") | |
| # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") | |
| # def forward(self, x): | |
| # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) | |
| # feats = self.model(input_values)["last_hidden_state"] | |
| # return feats | |
| # | |
| # class cnw2v2base(nn.Module): | |
| # def __init__(self): | |
| # super().__init__() | |
| # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") | |
| # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") | |
| # def forward(self, x): | |
| # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) | |
| # feats = self.model(input_values)["last_hidden_state"] | |
| # return feats | |
| def get_model(): | |
| model = CNHubert() | |
| model.eval() | |
| return model | |
| # def get_large_model(): | |
| # model = CNHubertLarge() | |
| # model.eval() | |
| # return model | |
| # | |
| # def get_model_cvec(): | |
| # model = CVec() | |
| # model.eval() | |
| # return model | |
| # | |
| # def get_model_cnw2v2base(): | |
| # model = cnw2v2base() | |
| # model.eval() | |
| # return model | |
| def get_content(hmodel, wav_16k_tensor): | |
| with torch.no_grad(): | |
| feats = hmodel(wav_16k_tensor) | |
| return feats.transpose(1, 2) | |
| if __name__ == "__main__": | |
| model = get_model() | |
| src_path = "/Users/Shared/ει³ι’2.wav" | |
| wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) | |
| model = model | |
| wav_16k_tensor = wav_16k_tensor | |
| feats = get_content(model, wav_16k_tensor) | |
| print(feats.shape) | |