Update text/japanese_bert.py
text/japanese_bert.py +42 -57
text/japanese_bert.py
CHANGED
@@ -10,7 +10,6 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 models = dict()
 
-
 def get_bert_feature(
     text,
     word2ph,
@@ -19,75 +18,61 @@ def get_bert_feature(
     style_weight=0.7,
 ):
     sep_text, _ = text2sep_kata(text)
-    sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
-    sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
-    sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
+    text = "".join(sep_text)
 
-    style_ids = None
     if style_text:
-        sep_style_text, _ = text2sep_kata(style_text)
-        style_tokens = [tokenizer.tokenize(t) for t in sep_style_text]
-        style_ids = [tokenizer.convert_tokens_to_ids(t) for t in style_tokens]
-        style_ids = [2] + [item for sublist in style_ids for item in sublist] + [3]
-
-    return get_bert_feature_with_token(
-        sep_ids, word2ph, device, style_ids, style_weight
-    )
-
+        style_text = "".join(text2sep_kata(style_text)[0])
 
-
-    if (
-        sys.platform == "darwin"
-        and torch.backends.mps.is_available()
-        and device == "cpu"
-    ):
+    if sys.platform == "darwin" and torch.backends.mps.is_available() and device == "cpu":
         device = "mps"
     if not device:
         device = "cuda"
-    if device not in models.keys():
-        models[device] = AutoModelForMaskedLM.from_pretrained(MODEL_ID).to(device)
-
-    def encode(tokens_):
-        inputs = torch.tensor(tokens_).to(device).unsqueeze(0)
-        token_type_ids = torch.zeros_like(inputs).to(device)
-        attention_mask = torch.ones_like(inputs).to(device)
-        inputs = {
-            "input_ids": inputs,
-            "token_type_ids": token_type_ids,
-            "attention_mask": attention_mask,
-        }
-        with torch.no_grad():
-            res = models[device](**inputs, output_hidden_states=True)
-            res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
-        return res, inputs["input_ids"].shape[-1]
-
-    res, main_len = encode(tokens)
 
-    if
+    if device not in models:
+        if config.webui_config.fp16_run:
+            models[device] = AutoModelForMaskedLM.from_pretrained(
+                MODEL_ID, torch_dtype=torch.float16
+            ).to(device)
+        else:
+            models[device] = AutoModelForMaskedLM.from_pretrained(MODEL_ID).to(device)
+
+    with torch.no_grad():
+        # Tokenize text into subwords for correct alignment
+        tokens = [tokenizer.tokenize(t) for t in sep_text]
+        flat_tokens = [item for sublist in tokens for item in sublist]
+        word2ph_token = [len(t) for t in tokens]
+        word2ph_token = [1] + word2ph_token + [1]  # Account for [CLS] and [SEP]
+
+        inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
+        for k in inputs:
+            inputs[k] = inputs[k].to(device)
+
+        res = models[device](**inputs, output_hidden_states=True)
+        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
+
+        if style_text:
+            style_inputs = tokenizer(style_text, return_tensors="pt")
+            for k in style_inputs:
+                style_inputs[k] = style_inputs[k].to(device)
+            style_res = models[device](**style_inputs, output_hidden_states=True)
+            style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
+            style_res_mean = style_res.mean(0)
+
+    if len(word2ph_token) != res.shape[0]:
+        print(f"[ERROR] len(word2ph_token) = {len(word2ph_token)}, but BERT output = {res.shape[0]}")
+        print(f"[DEBUG] input text: {text}")
+        raise ValueError("Mismatch between tokenized word2ph and BERT output length.")
 
     phone_level_feature = []
-    for i in range(len(
-        if
+    for i in range(len(word2ph_token)):
+        if style_text:
             blended = (
-                res[i].repeat(
-                + style_res_mean.repeat(
+                res[i].repeat(word2ph_token[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2ph_token[i], 1) * style_weight
             )
         else:
-            blended = res[i].repeat(
+            blended = res[i].repeat(word2ph_token[i], 1)
         phone_level_feature.append(blended)
 
     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-
     return phone_level_feature.T
-
-
-if __name__ == "__main__":
-    print(get_bert_feature("観覧車", [4, 2]))
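For reference, a minimal, self-contained sketch of the phone-level expansion and style blending performed by the loop added above. The values of hidden_size and counts and the random tensors are illustrative placeholders standing in for the BERT hidden states and word2ph_token in the real file; only torch is needed to run it.

import torch

# Illustrative stand-ins (not values from the real model):
hidden_size = 4                               # the real BERT hidden size is 768 or 1024
counts = [1, 2, 3, 1]                         # plays the role of word2ph_token: repeats per row
res = torch.randn(len(counts), hidden_size)   # one feature vector per position
style_res_mean = torch.randn(hidden_size)     # mean feature of the style text
style_weight = 0.7

phone_level_feature = []
for i, n in enumerate(counts):
    # Copy the i-th vector n times and mix it with the style mean:
    # (1 - style_weight) * token_feature + style_weight * style_mean
    blended = (
        res[i].repeat(n, 1) * (1 - style_weight)
        + style_res_mean.repeat(n, 1) * style_weight
    )
    phone_level_feature.append(blended)

phone_level_feature = torch.cat(phone_level_feature, dim=0)
print(phone_level_feature.T.shape)  # (hidden_size, sum(counts))

Each row of res is repeated counts[i] times and, when a style text is given, linearly interpolated toward the style mean by style_weight; the final transpose yields the (hidden_size, total_repeats) layout that get_bert_feature returns. Without a style text the loop reduces to a plain res[i].repeat(n, 1).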