Spaces:

JotunnBurton
/

wuwa-bert-vits2

Sleeping

App Files Files Community

JotunnBurton committed on Apr 16

Commit

cccafbc

verified ·

1 Parent(s): d5b3961

Update text/japanese_bert.py

Browse files

Files changed (1) hide show

text/japanese_bert.py +15 -24

text/japanese_bert.py CHANGED Viewed

@@ -1,15 +1,15 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-import sys
-import os
-from text.japanese import text2sep_kata
 from config import config
 MODEL_ID = "ku-nlp/deberta-v2-large-japanese-char-wwm"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 models = dict()
 def get_bert_feature(
     text,
     word2ph,
@@ -17,9 +17,7 @@ def get_bert_feature(
     style_text=None,
     style_weight=0.7,
 ):
-    sep_text, _ = text2sep_kata(text)
-    text = "".join(sep_text)
     if style_text:
         style_text = "".join(text2sep_kata(style_text)[0])
@@ -37,42 +35,35 @@ def get_bert_feature(
             models[device] = AutoModelForMaskedLM.from_pretrained(MODEL_ID).to(device)
     with torch.no_grad():
-        # Tokenize text into subwords for correct alignment
-        tokens = [tokenizer.tokenize(t) for t in sep_text]
-        flat_tokens = [item for sublist in tokens for item in sublist]
-        word2ph_token = [len(t) for t in tokens]
-        word2ph_token = [1] + word2ph_token + [1]  # Account for [CLS] and [SEP]
         inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
         for k in inputs:
             inputs[k] = inputs[k].to(device)
         res = models[device](**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
         if style_text:
-            style_inputs = tokenizer(style_text, return_tensors="pt")
             for k in style_inputs:
                 style_inputs[k] = style_inputs[k].to(device)
             style_res = models[device](**style_inputs, output_hidden_states=True)
             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
             style_res_mean = style_res.mean(0)
-    if len(word2ph_token) != res.shape[0]:
-        print(f"[ERROR] len(word2ph_token) = {len(word2ph_token)}, but BERT output = {res.shape[0]}")
-        print(f"[DEBUG] input text: {text}")
-        raise ValueError("Mismatch between tokenized word2ph and BERT output length.")
     phone_level_feature = []
-    for i in range(len(word2ph_token)):
         if style_text:
-            blended = (
-                res[i].repeat(word2ph_token[i], 1) * (1 - style_weight)
-                + style_res_mean.repeat(word2ph_token[i], 1) * style_weight
             )
         else:
-            blended = res[i].repeat(word2ph_token[i], 1)
-        phone_level_feature.append(blended)
     phone_level_feature = torch.cat(phone_level_feature, dim=0)
     return phone_level_feature.T

+import sys
 import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from config import config
+from text.japanese import text2sep_kata
 MODEL_ID = "ku-nlp/deberta-v2-large-japanese-char-wwm"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 models = dict()
 def get_bert_feature(
     text,
     word2ph,
     style_text=None,
     style_weight=0.7,
 ):
+    text = "".join(text2sep_kata(text)[0])
     if style_text:
         style_text = "".join(text2sep_kata(style_text)[0])
             models[device] = AutoModelForMaskedLM.from_pretrained(MODEL_ID).to(device)
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
         for k in inputs:
             inputs[k] = inputs[k].to(device)
         res = models[device](**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
         if style_text:
+            style_inputs = tokenizer(style_text, return_tensors="pt", add_special_tokens=True)
             for k in style_inputs:
                 style_inputs[k] = style_inputs[k].to(device)
             style_res = models[device](**style_inputs, output_hidden_states=True)
             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
             style_res_mean = style_res.mean(0)
+    # Force-truncate word2ph and the BERT output to their common length so they line up
+    min_len = min(len(word2ph), res.shape[0])
+    word2phone = word2ph[:min_len]
+    res = res[:min_len]
     phone_level_feature = []
+    for i in range(len(word2phone)):
         if style_text:
+            repeat_feature = (
+                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2phone[i], 1) * style_weight
             )
         else:
+            repeat_feature = res[i].repeat(word2phone[i], 1)
+        phone_level_feature.append(repeat_feature)
     phone_level_feature = torch.cat(phone_level_feature, dim=0)
     return phone_level_feature.T