|
|
""" |
|
|
Copyright 2023 Balacoon |
|
|
|
|
|
contains implementation |
|
|
for voice conversion request |
|
|
""" |
|
|
|
|
|
import os |
|
|
import asyncio |
|
|
import base64 |
|
|
import hashlib |
|
|
import json |
|
|
import ssl |
|
|
import time |
|
|
from typing import Tuple |
|
|
|
|
|
import numpy as np |
|
|
import resampy |
|
|
import websockets |
|
|
|
|
|
|
|
|
def prepare_audio(audio: Tuple[int, np.ndarray]) -> np.ndarray:
    """
    ensures that audio is in int16 format, 16khz mono

    :param audio: pair of (sampling rate, samples). Samples are either 1-D
        or 2-D with exactly two channels along one of the axes.
    :return: 1-D array of samples at 16khz, or None if the samples
        can't be reduced to a single channel.
    """
    sr, wav = audio

    if wav.dtype == np.int32:
        # measure the peak on a float copy, so abs() can't overflow on the
        # most negative int32; empty input is treated as having peak 0
        as_float = wav.astype(np.float32)
        max_val = np.max(np.abs(as_float)) if wav.size else 0.0
        # full-range int32 is scaled down to the int16 range, int32 that
        # already holds int16-sized values is just cast
        mult = (32767.0 / 2**31) if max_val > 32768 else 1.0
        wav = (as_float * mult).astype(np.int16)
    elif wav.dtype == np.float32 or wav.dtype == np.float64:
        max_val = np.max(np.abs(wav)) if wav.size else 0.0
        # floats within [-1, 1] are scaled up to the int16 range,
        # anything louder is taken to be in int16 scale already
        mult = 32767.0 if max_val <= 1.0 else 1.0
        wav = (wav * mult).astype(np.int16)

    if wav.ndim == 2:
        # stereo comes either as (2, n_samples) or as (n_samples, 2).
        # the branches are exclusive: after the first averaging the array is
        # 1-D and has no second axis to inspect.
        # np.mean returns float64, so cast back to keep the sample format.
        if wav.shape[0] == 2:
            wav = np.mean(wav, axis=0, keepdims=False).astype(wav.dtype)
        elif wav.shape[1] == 2:
            wav = np.mean(wav, axis=1, keepdims=False).astype(wav.dtype)

    if wav.ndim != 1:
        return None

    if sr != 16000:
        # resample in floating point, then go back to int16
        wav = (wav / 32768.0).astype(np.float64)
        wav = resampy.resample(wav, sr, 16000)
        # clip first: resampling may overshoot, which would wrap around
        # when cast to int16
        wav = np.clip(wav * 32768.0, -32768, 32767).astype(np.int16)
    return wav
|
|
|
|
|
|
|
|
def create_signature() -> str:
    """
    builds the signature that authenticates a request:
    hex sha256 digest of the api secret (taken from the `api_secret`
    environment variable) followed by a coarse timestamp
    """
    # time.time() is in seconds, so this value changes every 1000 seconds.
    # NOTE(review): presumably the service derives the same value on its
    # side - confirm before touching the divisor
    time_bucket = int(time.time() / 1000)
    payload = f'{os.environ["api_secret"]}{time_bucket}'
    return hashlib.sha256(payload.encode()).hexdigest()
|
|
|
|
|
|
|
|
async def async_service_request(source: np.ndarray, target: np.ndarray) -> np.ndarray:
    """
    sends source and target audio to the websocket endpoint
    (taken from the `endpoint` environment variable) and collects the reply

    :param source: samples to send under the "source" key
    :param target: samples to send under the "target" key
    :return: all received messages decoded as int16 and concatenated,
        or None if nothing was received
    """

    def _to_base64(samples: np.ndarray) -> str:
        # raw bytes of the array, base64-encoded so they fit into json
        return base64.b64encode(samples.tobytes()).decode("utf-8")

    tls_context = ssl.create_default_context()

    async with websockets.connect(
        os.environ["endpoint"], close_timeout=1024, ssl=tls_context
    ) as websocket:
        payload = json.dumps(
            {
                "source": _to_base64(source),
                "target": _to_base64(target),
                "api_key": os.environ["api_key"],
                "signature": create_signature(),
            }
        )
        await websocket.send(payload)

        # keep reading messages until the connection gets closed
        chunks = []
        stream_open = True
        while stream_open:
            try:
                message = await websocket.recv()
                chunks.append(np.frombuffer(message, dtype="int16"))
            except websockets.exceptions.ConnectionClosed:
                stream_open = False
            else:
                stream_open = message is not None

        if not chunks:
            return None
        return np.concatenate(chunks)
|
|
|
|
|
|
|
|
def vc_service_request(
    source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray]
) -> Tuple[int, np.ndarray]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service

    :param source_audio: pair of (sampling rate, samples) for the source
    :param target_audio: pair of (sampling rate, samples) for the target
    :return: pair of (16000, converted samples). None is returned when
        either audio can't be prepared, when either audio is too long,
        or when the service sends back no audio.
    """
    sample_rate = 16000
    # lengths are checked after preparation, i.e. in samples at 16khz;
    # audio reaching these durations is rejected
    max_source_sec = 60
    max_target_sec = 30

    src = prepare_audio(source_audio)
    tgt = prepare_audio(target_audio)
    if src is None or tgt is None:
        return None
    if (
        len(src) >= max_source_sec * sample_rate
        or len(tgt) >= max_target_sec * sample_rate
    ):
        return None

    res = asyncio.run(async_service_request(src, tgt))
    if res is None:
        # nothing came back from the service: report it like the other
        # failures instead of handing out a (rate, None) pair
        return None
    return sample_rate, res
|
|
|