File size: 1,480 Bytes
0c074b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re

from omegaconf import OmegaConf

# Lookup table used by text_to_ipa: keys are spelling fragments matched against
# the input text, values are the IPA strings emitted for them. It is loaded
# once, at import time, from the "g2p" -> "阿美_秀姑巒" entry of
# configs/g2p.yaml and converted to plain Python containers.
# NOTE(review): the path is relative, so it presumably resolves against the
# process working directory — confirm how this module is launched.
XIUGULUAN_G2P = OmegaConf.to_object(OmegaConf.load("configs/g2p.yaml"))["g2p"][
    "阿美_秀姑巒"
]


def can_form_string(x, symbol_dict):
    """Try to split *x* into a sequence of keys taken from *symbol_dict*.

    The search is a depth-first backtracking search that tries keys in the
    iteration order of *symbol_dict*, so the segmentation returned is the
    first complete one found in that order (not necessarily the one with the
    longest or fewest pieces).

    Args:
        x: The string to segment. A falsy value (e.g. ``""``) is treated as
            trivially segmentable.
        symbol_dict: Mapping whose keys are the allowed pieces. Only the keys
            are used here; the values are ignored.

    Returns:
        A ``(matched, parts)`` tuple. ``matched`` is ``True`` and ``parts`` is
        the list of keys, in order, whose concatenation equals *x* when a
        segmentation exists; otherwise ``(False, [])``.
    """
    if not x:
        return True, []

    # An empty key matches everywhere without consuming input, which would
    # recurse forever; it can never contribute to a segmentation, so skip it.
    keys = [key for key in symbol_dict if key != ""]
    end = len(x)
    # Start offsets already proven impossible to complete. Remembering them
    # keeps the search from re-exploring the same suffix again and again
    # (exponential blow-up on inputs that almost match).
    dead_ends = set()
    parts = []

    def helper(start):
        if start == end:
            return True
        if start in dead_ends:
            return False

        for key in keys:
            # Match in place at ``start`` instead of slicing a new suffix.
            if x.startswith(key, start):
                parts.append(key)
                if helper(start + len(key)):
                    return True
                parts.pop()  # backtrack and try the next key

        dead_ends.add(start)
        return False

    if helper(0):
        return True, parts
    return False, []


def text_to_ipa(text, ignore_comma=True):
    """Convert *text* to a space-separated IPA string using XIUGULUAN_G2P.

    The text is lower-cased, the characters ``.``, ``?`` and ``!`` are
    removed, straight apostrophes are replaced by ``’``, and the result is
    split on whitespace. Each word is segmented into keys of the G2P table
    with ``can_form_string`` and the corresponding values are concatenated.
    Finally a few symbols are rewritten to their preferred IPA spelling.

    Args:
        text: The input text.
        ignore_comma: When true (the default) a comma inside a word produces
            no output; when false the comma is kept as ``,``.

    Returns:
        The IPA transcription with words joined by single spaces, or ``""``
        as soon as any word cannot be segmented with the table.
    """
    text = text.lower()
    text = re.sub(r"[.?!]", "", text)
    text = text.replace("'", "’")
    words = text.split()  # change in future

    print(f"ipa: {words}")

    # The table does not depend on the current word, so build it once rather
    # than on every loop iteration.
    extended_g2p = {**XIUGULUAN_G2P, ",": "" if ignore_comma else ","}

    ipa = []
    for word in words:
        result, matched_parts = can_form_string(word, extended_g2p)

        if not result:
            # One unmatched word invalidates the whole transcription.
            print(f"no match g2p : {word}")
            return ""

        ipa.append("".join(extended_g2p[part] for part in matched_parts))

    # Normalise symbols coming out of the table to the spellings used
    # downstream (e.g. ligature affricates become tie-bar sequences).
    return (
        " ".join(ipa)
        .replace("g", "ɡ")
        .replace("ʦ", "t͡s")
        .replace("ʨ", "t͡ɕ")
        .replace("R", "ʀ")
        .replace("ʤ", "dʒ")
    )