cherrvak committed on
Commit 2c04fa5 · 1 Parent(s): 753e340

initial commit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.vrm filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__
+venv
app.py ADDED
@@ -0,0 +1,119 @@
+import streamlit as st
+from model_demo import model_demo
+from model_demo.inference.infer import init_pipeline
+import os
+import json
+import pydub
+import numpy as np
+from model_demo.inference.constants import BLENDSHAPE_NAMES
+
+
+def make_downloadable_json(blendshapes, headangles):
+    blendshape_dict = {}
+    for i, name in enumerate(BLENDSHAPE_NAMES):
+        blendshape_dict[name] = blendshapes[:, i].tolist()
+    headangle_dict = {}
+    for i, name in enumerate(["pitch", "yaw", "roll"]):
+        headangle_dict[name] = headangles[:, i].tolist()
+    return json.dumps({"blendshapes": blendshape_dict, "headangles": headangle_dict})
+
+
+if "pred_dict" not in st.session_state:
+    st.session_state.pred_dict = {}
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+onnx_path = "assets/onnx_models"
+hubert_path = f"{onnx_path}/hubert.onnx"
+encoder_path = f"{onnx_path}/encoder.onnx"
+decoder_path = f"{onnx_path}/decoder.onnx"
+pipeline = init_pipeline(hubert_path, encoder_path, decoder_path)
+
+(col1, col2) = st.columns([2, 3])
+
+with col1:
+    with st.container(border=True):
+        audio_tab, control_tab, vrm_tab = st.tabs(["Audio", "Controls", "Upload VRM"])
+
+        with audio_tab:
+            recorded_value = st.audio_input("Record audio")
+            st.write("Or")
+            uploaded_value = st.file_uploader("Upload audio", type=["wav"])
+            audio_value = (
+                recorded_value if recorded_value is not None else uploaded_value
+            )
+
+        with control_tab:
+            mouth_exaggeration = st.number_input("Lower face exaggeration", value=5.0)
+            brow_exaggeration = st.number_input("Upper face exaggeration", value=4.0)
+            head_wiggle_exaggeration = st.number_input(
+                "Head wiggle exaggeration", value=2.0
+            )
+            unsquinch_fix = st.number_input(
+                "Unsquinch fix",
+                value=0.75,
+            )
+            eye_contact_fix = st.number_input(
+                "Eye contact fix",
+                value=1.5,
+            )
+            exaggerate_above = st.number_input(
+                "Exaggerate above",
+                value=0.01,
+                min_value=0.0,
+                max_value=1.0,
+                step=0.001,
+                format="%.3f",
+            )
+            symmetrize_eyes = st.checkbox("Symmetrize eyes", value=True)
+
+        with vrm_tab:
+            vrm_file = st.file_uploader("Upload VRM file", type=["vrm"])
+            if vrm_file:
+                # Read the raw bytes from the uploaded file
+                vrm_bytes = vrm_file.read()
+                # Store the raw bytes in the session state
+                st.session_state.pred_dict["vrm_file"] = vrm_bytes
+
+    submit_button = st.button("Run Inference", disabled=not audio_value)
+
+if submit_button and audio_value:
+    audio_segment = (
+        pydub.AudioSegment.from_file(audio_value).set_frame_rate(16000).set_channels(1)
+    )
+    audio_array = np.array(audio_segment.get_array_of_samples())
+    blendshapes, head_angles, mean_step_time, mean_rtf, time_to_first_sound = (
+        pipeline.infer_audio_array(
+            np.array(audio_array),
+            32000,
+            48000,
+            mouth_exaggeration,
+            brow_exaggeration,
+            head_wiggle_exaggeration,
+            unsquinch_fix,
+            eye_contact_fix,
+            exaggerate_above,
+            symmetrize_eyes,
+        )
+    )
+    st.session_state.pred_dict["blendshapes"] = blendshapes
+    st.session_state.pred_dict["head_angles"] = head_angles
+    st.session_state.pred_dict["audio_data"] = audio_value.getvalue()
+    processing_string = f"Inference complete at {mean_rtf:.2f}x real-time."
+
+    with col1:
+        st.write(processing_string)
+        st.download_button(
+            label="Download results as JSON",
+            data=make_downloadable_json(blendshapes, head_angles),
+            file_name="inference_results.json",
+            mime="text/json",
+        )
+
+with col2:
+    model_demo(
+        blendshapes=st.session_state.pred_dict.get("blendshapes", None),
+        headangles=st.session_state.pred_dict.get("head_angles", None),
+        audio_data=st.session_state.pred_dict.get("audio_data", None),
+        vrm_data=st.session_state.pred_dict.get("vrm_file", None),
+        key="model_viewport",
+    )
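
For reference, a minimal sketch of reading the downloaded file back (illustrative only, not part of the commit; it assumes the download was saved as inference_results.json next to the script):

import json

with open("inference_results.json") as f:
    result = json.load(f)

# One list of per-frame values (30 fps) per blendshape / head-angle name.
jaw_open = result["blendshapes"]["jawOpen"]
pitch = result["headangles"]["pitch"]
print(len(jaw_open), len(pitch))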
assets/onnx_models/decoder.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ae52135ac99c7e48ec8ca77e96a7ff057b36bcd1379e3763ed25a23339781de
+size 22408905
assets/onnx_models/encoder.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32b26754ff956a46742232d0fb17adb444115ffb2d9cf155051d1e9aca27cf18
+size 5909306
assets/onnx_models/hubert.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6fcb81a8315972f672b9433e85886fda467b9c6d76f1799e5bf01f8c68f915b
+size 377746620
model_demo/__init__.py ADDED
@@ -0,0 +1,31 @@
+import os
+import streamlit.components.v1 as components
+
+_RELEASE = True
+
+if not _RELEASE:
+    _component_func = components.declare_component(
+        "model_demo",
+        url="http://localhost:3001",
+    )
+else:
+    # When we're distributing a production version of the component, we'll
+    # replace the `url` param with `path`, and point it to the component's
+    # build directory:
+    parent_dir = os.path.dirname(os.path.abspath(__file__))
+    build_dir = os.path.join(parent_dir, "frontend/build")
+    _component_func = components.declare_component("model_demo", path=build_dir)
+
+
+def model_demo(
+    blendshapes=None, headangles=None, audio_data=None, vrm_data=None, key=None
+):
+    component_value = _component_func(
+        blendshapes=blendshapes.tolist() if blendshapes is not None else None,
+        headangles=headangles.tolist() if headangles is not None else None,
+        audio_data=audio_data.decode("latin1") if audio_data is not None else None,
+        vrm_data=[int(b) for b in vrm_data] if vrm_data is not None else None,
+        key=key,
+        default=0,
+    )
+    return component_value
model_demo/frontend/build/asset-manifest.json ADDED
@@ -0,0 +1,10 @@
+{
+  "files": {
+    "main.js": "./static/js/main.88082681.js",
+    "index.html": "./index.html",
+    "main.88082681.js.map": "./static/js/main.88082681.js.map"
+  },
+  "entrypoints": [
+    "static/js/main.88082681.js"
+  ]
+}
model_demo/frontend/build/bootstrap.min.css ADDED
The diff for this file is too large to render. See raw diff
 
model_demo/frontend/build/index.html ADDED
@@ -0,0 +1 @@
+<!doctype html><html lang="en"><head><title>Streamlit Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Component"/><link rel="stylesheet" href="bootstrap.min.css"/><script defer="defer" src="./static/js/main.88082681.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
model_demo/frontend/build/static/js/main.88082681.js ADDED
The diff for this file is too large to render. See raw diff
 
model_demo/frontend/build/static/js/main.88082681.js.LICENSE.txt ADDED
@@ -0,0 +1,98 @@
+/*
+object-assign
+(c) Sindre Sorhus
+@license MIT
+*/
+
+/*!
+ * @pixiv/three-vrm v3.3.4
+ * VRM file loader for three.js.
+ *
+ * Copyright (c) 2019-2025 pixiv Inc.
+ * @pixiv/three-vrm is distributed under MIT License
+ * https://github.com/pixiv/three-vrm/blob/release/LICENSE
+ */
+
+/**
+ * @license
+ * Copyright 2010-2024 Three.js Authors
+ * SPDX-License-Identifier: MIT
+ */
+
+/**
+ * @license React
+ * react-dom.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react-jsx-runtime.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react-reconciler-constants.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react-reconciler.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * scheduler.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/** @license React v16.13.1
+ * react-is.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/** @license React v16.14.0
+ * react.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
model_demo/frontend/build/static/js/main.88082681.js.map ADDED
The diff for this file is too large to render. See raw diff
 
model_demo/frontend/build/vrm_model/demo.vrm ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfb31b3ab6759ff5f4676130827d8b3aa570d0a44eb4f7431cdb56e8790501a6
+size 16932772
model_demo/inference/audio.py ADDED
@@ -0,0 +1,33 @@
+import numpy as np
+
+
+class AudioStream:
+    """
+    Class to mimic streaming audio input.
+    """
+
+    def __init__(
+        self, audio: np.ndarray, min_samples_per_step: int, max_samples_per_step: int
+    ):
+        self.audio = audio
+        self.min_samples_per_step = min_samples_per_step
+        self.max_samples_per_step = max_samples_per_step
+        self.current_idx = 0
+        self.can_step = True
+
+    def step(self) -> np.ndarray:
+        if not self.can_step:
+            raise StopIteration("End of audio stream")
+        start_idx = self.current_idx
+        if self.min_samples_per_step == self.max_samples_per_step:
+            samples_per_step = self.min_samples_per_step
+        else:
+            samples_per_step = np.random.randint(
+                self.min_samples_per_step, self.max_samples_per_step, (1,)
+            ).item()
+        end_idx = min(start_idx + samples_per_step, len(self.audio))
+        audio_chunk = self.audio[start_idx:end_idx]
+        self.current_idx = end_idx
+        if end_idx >= len(self.audio):
+            self.can_step = False
+        return audio_chunk
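
A minimal sketch of how AudioStream is meant to be driven (illustrative only, not part of the commit; the one-second silent buffer and the 0.25-0.5 s step sizes are arbitrary):

import numpy as np
from model_demo.inference.audio import AudioStream

stream = AudioStream(np.zeros(16000, dtype=np.float32), 4000, 8000)
chunks = []
while stream.can_step:
    chunks.append(stream.step())

# Every sample is emitted exactly once, in order, in random-sized chunks.
assert sum(len(c) for c in chunks) == 16000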
model_demo/inference/constants.py ADDED
@@ -0,0 +1,99 @@
+import os
+
+
+def exact_div(x, y):
+    assert x % y == 0
+    return x // y
+
+
+SAMPLE_RATE = 16000
+N_FFT = 400
+HOP_LENGTH = 160
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
+# 3000 frames in a mel spectrogram input
+N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)
+
+N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions have stride 2
+FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
+TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
+TIMESTEP_S = 30 / 1500
+
+VIDEO_FPS = 30
+N_AUDIO_SAMPLES_PER_VIDEO_FRAME = SAMPLE_RATE // VIDEO_FPS
+N_VIDEO_FRAMES = CHUNK_LENGTH * VIDEO_FPS  # 900 frames in a 30-second video chunk
+
+
+def mel_frames_from_video_frames(n_video_frames):
+    return int(n_video_frames * N_SAMPLES_PER_TOKEN / VIDEO_FPS)
+
+
+MEL_FILTER_PATH = os.path.join(
+    os.path.dirname(__file__), "../../assets", "mel_filters.npz"
+)
+LANDMARKER_PATH = "pretrained_models/mediapipe/face_landmarker_v2_with_blendshapes.task"
+
+BLENDSHAPE_NAMES = [
+    "_neutral",
+    "browDownLeft",
+    "browDownRight",
+    "browInnerUp",
+    "browOuterUpLeft",
+    "browOuterUpRight",
+    "cheekPuff",
+    "cheekSquintLeft",
+    "cheekSquintRight",
+    "eyeBlinkLeft",
+    "eyeBlinkRight",
+    "eyeLookDownLeft",
+    "eyeLookDownRight",
+    "eyeLookInLeft",
+    "eyeLookInRight",
+    "eyeLookOutLeft",
+    "eyeLookOutRight",
+    "eyeLookUpLeft",
+    "eyeLookUpRight",
+    "eyeSquintLeft",
+    "eyeSquintRight",
+    "eyeWideLeft",
+    "eyeWideRight",
+    "jawForward",
+    "jawLeft",
+    "jawOpen",
+    "jawRight",
+    "mouthClose",
+    "mouthDimpleLeft",
+    "mouthDimpleRight",
+    "mouthFrownLeft",
+    "mouthFrownRight",
+    "mouthFunnel",
+    "mouthLeft",
+    "mouthLowerDownLeft",
+    "mouthLowerDownRight",
+    "mouthPressLeft",
+    "mouthPressRight",
+    "mouthPucker",
+    "mouthRight",
+    "mouthRollLower",
+    "mouthRollUpper",
+    "mouthShrugLower",
+    "mouthShrugUpper",
+    "mouthSmileLeft",
+    "mouthSmileRight",
+    "mouthStretchLeft",
+    "mouthStretchRight",
+    "mouthUpperUpLeft",
+    "mouthUpperUpRight",
+    "noseSneerLeft",
+    "noseSneerRight",
+]
+
+HEAD_ANGLE_NAMES = ["pitch", "yaw", "roll"]
+
+HEAD_LANDMARK_DIM = len(BLENDSHAPE_NAMES) + len(HEAD_ANGLE_NAMES)
+
+
+def get_n_mels(whisper_model_name: str):
+    if "v3" in whisper_model_name:
+        return 128
+    return 80
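
A quick illustration of how these constants line up with the step sizes used in app.py (illustrative only, not part of the commit):

from model_demo.inference.constants import (
    SAMPLE_RATE,
    VIDEO_FPS,
    N_AUDIO_SAMPLES_PER_VIDEO_FRAME,
)

# 16000 // 30 = 533 audio samples correspond to one 30 fps video frame.
print(N_AUDIO_SAMPLES_PER_VIDEO_FRAME)
# app.py streams 32000-48000 samples per step, i.e. 2-3 seconds of audio,
# which is roughly 60-90 generated video frames per step.
print(32000 / SAMPLE_RATE, 48000 / SAMPLE_RATE)
print(32000 // N_AUDIO_SAMPLES_PER_VIDEO_FRAME, 48000 // N_AUDIO_SAMPLES_PER_VIDEO_FRAME)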
model_demo/inference/infer.py ADDED
@@ -0,0 +1,386 @@
+import numpy as np
+from typing import Optional
+from pathlib import Path
+from typing import Tuple
+
+import time
+
+from model_demo.inference.audio import AudioStream
+from model_demo.inference.landmarks import (
+    unscale_and_uncenter_head_angles,
+    clean_up_blendshapes,
+    exaggerate_head_wiggle,
+)
+from model_demo.inference.constants import (
+    N_AUDIO_SAMPLES_PER_VIDEO_FRAME,
+    SAMPLE_RATE,
+    HEAD_LANDMARK_DIM,
+)
+
+import onnxruntime as ort
+from dataclasses import dataclass
+from typing import Optional, Union
+
+
+class InferencePipeline:
+    """
+    Pipeline for running WhisperLike model inference on a stream of audio chunks.
+
+    Added crossfade functionality to smooth transitions between chunks.
+    """
+
+    def __init__(
+        self,
+        max_chunk_size: int,
+        crossfade_size: int,
+        batch_size: int,
+    ) -> None:
+        """
+        Initialize streaming inference pipeline.
+
+        Args:
+            max_chunk_size: Maximum number of frames to process in a single chunk
+            crossfade_size: Number of frames to use for crossfading between chunks
+            batch_size: Batch size for inference
+            device: Device to run on
+        """
+        self.max_chunk_size = max_chunk_size
+        self.max_audio_input_size = (
+            self.max_chunk_size * N_AUDIO_SAMPLES_PER_VIDEO_FRAME
+        )
+        self.crossfade_size = crossfade_size
+        self.audio_crossfade_size = crossfade_size * N_AUDIO_SAMPLES_PER_VIDEO_FRAME
+        self.n_feats = HEAD_LANDMARK_DIM
+
+        # Maintain state between chunks
+        self.prev_output = np.zeros((batch_size, 0, self.n_feats))
+        self.audio_buffer = np.zeros((batch_size, 0))
+
+        # Crossfade buffer stores the overlapping region from the previous chunk
+        self.crossfade_buffer = None
+
+        # Pre-compute crossfade weights
+        self.crossfade_weights = np.linspace(0, 1, crossfade_size)
+        self.crossfade_weights = self.crossfade_weights.reshape(1, -1)
+
+    def apply_crossfade(
+        self, current_chunk: np.ndarray, update_crossfade_buffer: bool
+    ) -> np.ndarray:
+        """Apply crossfade between previous and current chunk predictions."""
+        if self.crossfade_buffer is not None:
+            # Extract the crossfade region from the current chunk
+            current_fade_region = current_chunk[:, : self.crossfade_size]
+
+            # Blend the overlapping regions using the pre-computed weights
+            blended_region = np.multiply(
+                self.crossfade_buffer, np.expand_dims((1 - self.crossfade_weights), -1)
+            ) + np.multiply(
+                current_fade_region, np.expand_dims(self.crossfade_weights, -1)
+            )
+
+            # Replace the beginning of the current chunk with the blended region
+            output = current_chunk.copy()
+            output[:, : self.crossfade_size] = blended_region
+        else:
+            output = current_chunk
+        if update_crossfade_buffer:
+            self.crossfade_buffer = current_chunk[:, -self.crossfade_size :].copy()
+            output = output[:, : -self.crossfade_size]
+        return output
+
+    def model_generate(self, src, max_len, initial_context=None):
+        """
+        Generate output sequence with optional initial context.
+
+        Args:
+            src: Source audio features of shape [B, T_a, D], where T_a is the number of
+                audio frames corresponding to max_len video frames
+            max_len: Number of frames to generate
+            initial_context: Optional previous output context (B, J, D), where J is
+                in [1, max_len + 1]
+
+        Returns:
+            Predicted landmarks [B, max_len - J, D]
+        """
+        pass
+
+    def infer_chunk(self, audio: np.ndarray, new_audio_len: int) -> np.ndarray:
+        """Process a single chunk of audio, using previous context if available."""
+        n_new_frames = (
+            new_audio_len // N_AUDIO_SAMPLES_PER_VIDEO_FRAME + self.crossfade_size
+        )
+        n_generation_frames = audio.shape[1] // N_AUDIO_SAMPLES_PER_VIDEO_FRAME
+        n_context_frames = (n_generation_frames - n_new_frames) + 1
+        if n_context_frames > 0:
+            initial_context = self.prev_output[:, -n_context_frames:]
+        else:
+            initial_context = None
+        # Generate predictions
+        predictions = self.model_generate(audio, n_generation_frames, initial_context)
+
+        self.prev_output = np.concatenate([self.prev_output, predictions], axis=1)[
+            :, -self.max_chunk_size :
+        ]
+        return predictions
+
+    def prepare_input_chunk(self, audio: np.ndarray) -> Tuple[np.ndarray, int]:
+        new_audio_len = audio.shape[1]
+        self.audio_buffer = np.concatenate([self.audio_buffer, audio], axis=1)[
+            :, -self.max_audio_input_size :
+        ]
+        return self.audio_buffer, new_audio_len
+
+    def process_output_chunk(
+        self,
+        chunk: np.ndarray,
+        update_crossfade_buffer: bool,
+        mouth_exaggeration: float,
+        brow_exaggeration: float,
+        head_wiggle_exaggeration: float,
+        unsquinch_fix: float,
+        eye_contact_fix: float,
+        exaggerate_above: float,
+        symmetrize_eyes: bool,
+    ) -> np.ndarray:
+        chunk[..., :52] = clean_up_blendshapes(
+            chunk[..., :52],
+            mouth_exaggeration,
+            brow_exaggeration,
+            clear_neutral=True,
+            unsquinch_fix=unsquinch_fix,
+            eye_contact_fix=eye_contact_fix,
+            exaggerate_above=exaggerate_above,
+            symmetrize_eyes=symmetrize_eyes,
+        )
+        if head_wiggle_exaggeration != 1.0:
+            chunk[..., 52:] = exaggerate_head_wiggle(
+                chunk[..., 52:], head_wiggle_exaggeration
+            )
+        if self.crossfade_size > 0 and chunk.shape[1] > self.crossfade_size:
+            chunk = self.apply_crossfade(chunk, update_crossfade_buffer)
+        return chunk
+
+    def __call__(
+        self,
+        audio: np.ndarray,
+        audio_stream_can_step: bool,
+        mouth_exaggeration: float,
+        brow_exaggeration: float,
+        head_wiggle_exaggeration: float,
+        unsquinch_fix: float,
+        eye_contact_fix: float,
+        exaggerate_above: float,
+        symmetrize_eyes: bool,
+    ) -> np.ndarray:
+        """
+        Run the model on an audio tensor.
+
+        Args:
+            audio: Audio tensor of shape (batch_size, n_audio_samples)
+
+        Returns:
+            np.ndarray: Model predictions
+        """
+        input_chunk, new_audio_len = self.prepare_input_chunk(audio)
+        output_chunk = self.infer_chunk(input_chunk, new_audio_len)
+        return self.process_output_chunk(
+            output_chunk,
+            update_crossfade_buffer=audio_stream_can_step,
+            mouth_exaggeration=mouth_exaggeration,
+            brow_exaggeration=brow_exaggeration,
+            head_wiggle_exaggeration=head_wiggle_exaggeration,
+            unsquinch_fix=unsquinch_fix,
+            eye_contact_fix=eye_contact_fix,
+            exaggerate_above=exaggerate_above,
+            symmetrize_eyes=symmetrize_eyes,
+        )
+
+    def reset(self):
+        """Reset internal state"""
+        self.prev_output = np.zeros_like(self.prev_output)
+        self.audio_buffer = np.zeros_like(self.audio_buffer)
+        self.crossfade_buffer = None
+
+    def infer_audio_array(
+        self,
+        audio: np.ndarray,
+        min_audio_samples_per_step: int,
+        max_audio_samples_per_step: int,
+        mouth_exaggeration: float = 1.0,
+        brow_exaggeration: float = 1.0,
+        head_wiggle_exaggeration: float = 1.0,
+        unsquinch_fix: float = 0.0,
+        eye_contact_fix: float = 0.0,
+        exaggerate_above: float = 0.0,
+        symmetrize_eyes: bool = False,
+        max_audio_duration: Optional[float] = None,
+    ) -> Tuple[np.ndarray, np.ndarray, float, float, float]:
+        """
+        Run the model on an input audio or video file under simulated streaming conditions.
+
+        Args:
+            audio: Numpy array of audio samples
+            min_audio_samples_per_step: Minimum number of audio samples per step
+            max_audio_samples_per_step: Maximum number of audio samples per step
+            max_audio_duration: Maximum duration of audio to process in seconds
+
+        Returns:
+            Tuple of:
+                - Blendshapes of shape (T, 52)
+                - Head angles of shape (T, 3)
+                - Mean time per step in seconds
+                - Mean real-time factor
+        """
+        # Reset all buffers
+        self.reset()
+        # Apply duration limit if specified
+        if max_audio_duration is not None:
+            max_audio_duration_frames = int(max_audio_duration * SAMPLE_RATE)
+            audio_len = min(len(audio), max_audio_duration_frames)
+        else:
+            audio_len = len(audio)
+
+        audio_stream = AudioStream(
+            audio[:audio_len], min_audio_samples_per_step, max_audio_samples_per_step
+        )
+
+        # Process each chunk
+        outputs = []
+        step_times = []
+        audio_durations = []
+        while audio_stream.can_step:
+            audio_chunk = audio_stream.step()
+            audio_durations.append(audio_chunk.shape[-1] / SAMPLE_RATE)
+            # Process the chunk
+            start_time = time.time()
+            chunk_output = self(
+                np.expand_dims(audio_chunk, 0),
+                audio_stream.can_step,
+                mouth_exaggeration,
+                brow_exaggeration,
+                head_wiggle_exaggeration,
+                unsquinch_fix,
+                eye_contact_fix,
+                exaggerate_above,
+                symmetrize_eyes,
+            )
+            step_times.append(time.time() - start_time)
+            outputs.append(chunk_output)
+
+        # Concatenate all outputs
+        full_output = np.concatenate(outputs, axis=1)
+        mean_step_time = sum(step_times) / len(step_times)
+        mean_rtf = sum(audio_durations) / sum(step_times)
+        time_to_first_sound = step_times[0] + audio_durations[0]
+
+        blendshapes = full_output.squeeze(0)[:, :52]
+        head_angles = unscale_and_uncenter_head_angles(
+            full_output.squeeze(0)[:, 52:], bad_frames=[]
+        )
+
+        return blendshapes, head_angles, mean_step_time, mean_rtf, time_to_first_sound
+
+
+@dataclass
+class ONNXModels:
+    hubert_session: ort.InferenceSession
+    encoder_session: ort.InferenceSession
+    decoder_session: ort.InferenceSession
+
+
+class ONNXInferencePipeline(InferencePipeline):
+    """
+    ONNX version of the inference pipeline.
+    """
+
+    def __init__(
+        self,
+        onnx_models: ONNXModels,
+        max_chunk_size: int,
+        crossfade_size: int,
+        batch_size: int,
+    ):
+        """
+        Initialize ONNX inference pipeline.
+
+        Args:
+            onnx_models: ONNXModels containing hubert and decoder sessions
+            max_chunk_size: Maximum number of frames to process in a single chunk
+            crossfade_size: Number of frames to use for crossfading between chunks
+            batch_size: Batch size for inference
+            device: Device to run inference on
+        """
+        super().__init__(
+            max_chunk_size,
+            crossfade_size,
+            batch_size,
+        )
+        self.onnx_models = onnx_models
+
+    def model_generate(self, src, max_len, initial_context=None):
+        """
+        Generate output sequence using ONNX models.
+        """
+        # Run HuBERT through ONNX
+        src_np = src.astype(np.float32)
+        hubert_out = self.onnx_models.hubert_session.run(
+            None, {"input_values": src_np}
+        )[0]
+        src = self.onnx_models.encoder_session.run(None, {"src": hubert_out})[0]
+
+        if initial_context is not None:
+            decoder_in = initial_context.astype(np.float32)
+        else:
+            decoder_in = np.zeros((src.shape[0], 1, HEAD_LANDMARK_DIM)).astype(
+                np.float32
+            )
+
+        outputs = []
+        for i in range(max_len - decoder_in.shape[1] + 1):
+            # Run decoder step through ONNX
+            next_output = self.onnx_models.decoder_session.run(
+                None,
+                {"src": src.astype(np.float32), "decoder_in": decoder_in},
+            )[0]
+
+            decoder_in = np.concatenate([decoder_in, next_output], axis=1)
+            outputs.append(next_output)
+
+        pred_out = np.concatenate(outputs, axis=1)
+        return pred_out
+
+
+def init_pipeline(
+    hubert_onnx_path: Path,
+    encoder_onnx_path: Path,
+    decoder_onnx_path: Path,
+    device: str = "cpu",
+    chunk_size: int = 90,
+    crossfade_size: int = 5,
+    batch_size: int = 1,
+) -> Union[InferencePipeline, ONNXInferencePipeline]:
+    """
+    Initialize ONNX inference pipeline based on provided paths.
+
+    Args:
+        hubert_onnx_path: Path to ONNX HuBERT model
+        decoder_onnx_path: Path to ONNX decoder model
+        chunk_size: Maximum number of frames per chunk
+        crossfade_size: Number of frames for crossfading
+        batch_size: Batch size for inference
+        device: Device to run on
+
+    Returns:
+        ONNX inference pipeline
+    """
+    # ONNX pipeline
+    providers = (
+        ["CUDAExecutionProvider"] if device == "cuda" else ["CPUExecutionProvider"]
+    )
+
+    hubert_session = ort.InferenceSession(str(hubert_onnx_path), providers=providers)
+    encoder_session = ort.InferenceSession(str(encoder_onnx_path), providers=providers)
+    decoder_session = ort.InferenceSession(str(decoder_onnx_path), providers=providers)
+
+    onnx_models = ONNXModels(hubert_session, encoder_session, decoder_session)
+    return ONNXInferencePipeline(onnx_models, chunk_size, crossfade_size, batch_size)
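
For orientation, a minimal end-to-end sketch of driving this pipeline outside Streamlit (illustrative only, not part of the commit; it assumes the ONNX files under assets/onnx_models/ are available and uses three seconds of silence in place of real speech):

import numpy as np
from model_demo.inference.infer import init_pipeline

pipeline = init_pipeline(
    "assets/onnx_models/hubert.onnx",
    "assets/onnx_models/encoder.onnx",
    "assets/onnx_models/decoder.onnx",
)
audio = np.zeros(3 * 16000, dtype=np.int16)  # 16 kHz mono, as app.py produces via pydub
blendshapes, head_angles, mean_step_time, mean_rtf, ttfs = pipeline.infer_audio_array(
    audio, 32000, 48000
)
print(blendshapes.shape, head_angles.shape)  # (T, 52) and (T, 3) at 30 fps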
model_demo/inference/landmarks.py ADDED
@@ -0,0 +1,115 @@
+from typing import Optional, Tuple, List, Union
+import numpy as np
+from math import e, pi
+
+from model_demo.inference.constants import BLENDSHAPE_NAMES
+
+
+def clean_up_blendshapes(
+    blendshapes: np.ndarray,
+    mouth_exaggeration: float,
+    brow_exaggeration: float,
+    unsquinch_fix: float,
+    eye_contact_fix: float,
+    clear_neutral: bool = False,
+    exaggerate_above: float = 0,
+    symmetrize_eyes: bool = False,
+) -> np.ndarray:
+    """
+    Exaggerate blendshapes by a given factor.
+
+    Args:
+        blendshapes: Blendshape coefficients of shape (B, T, D) or (T, D)
+        mouth_exaggeration / brow_exaggeration: Factors to exaggerate the lower / upper face blendshapes by
+        unsquinch_fix: Factor to reduce eye squint and blink blendshapes by in range [0, 1]
+        eye_contact_fix: Factor to reduce eye look blendshapes by in range [0, 1]
+        clear_neutral: Whether to clear the neutral expression blendshape (set to 0)
+        symmetrize_eyes: Whether to average the left and right eye blink blendshapes
+        exaggerate_above: Values above this threshold are exaggerated up, values below it down
+
+    Returns:
+        Exaggerated blendshape coefficients of shape (B, T, D) or (T, D)
+    """
+
+    def modify_blendshapes(
+        blendshapes: np.ndarray, target_substrings: List[str], factor: float
+    ) -> np.ndarray:
+        if factor != 1:
+            for i, shape in enumerate(BLENDSHAPE_NAMES):
+                if any(substring in shape for substring in target_substrings):
+                    blendshapes_offset = blendshapes[..., i] - exaggerate_above
+                    blendshapes[..., i] = blendshapes_offset * factor + exaggerate_above
+            blendshapes = np.clip(blendshapes, 0.0, 1.0)
+        return blendshapes
+
+    if clear_neutral:
+        blendshapes[..., 0] = 0
+
+    modify_blendshapes(blendshapes, ["mouth", "jaw", "cheek"], mouth_exaggeration)
+    modify_blendshapes(blendshapes, ["brow", "noseSneer", "eye"], brow_exaggeration)
+    if unsquinch_fix > 0:
+        eye_idx = [
+            i
+            for i, name in enumerate(BLENDSHAPE_NAMES)
+            if "eyeSquint" in name or "eyeBlink" in name
+        ]
+        for idx in eye_idx:
+            blendshapes[..., idx] -= unsquinch_fix
+    if eye_contact_fix > 0:
+        eye_idx = [i for i, name in enumerate(BLENDSHAPE_NAMES) if "eyeLook" in name]
+        for idx in eye_idx:
+            blendshapes[..., idx] -= eye_contact_fix
+    if symmetrize_eyes:
+        # average between eyeBlinkLeft and eyeBlinkRight
+        eye_blink_left_index = BLENDSHAPE_NAMES.index("eyeBlinkLeft")
+        eye_blink_right_index = BLENDSHAPE_NAMES.index("eyeBlinkRight")
+        avg_val = (
+            blendshapes[..., eye_blink_left_index]
+            + blendshapes[..., eye_blink_right_index]
+        ) / 2
+        blendshapes[..., eye_blink_left_index] = avg_val
+        blendshapes[..., eye_blink_right_index] = avg_val
+
+    blendshapes = np.clip(blendshapes, 0.0, 1.0)
+
+    return blendshapes
+
+
+def exaggerate_head_wiggle(
+    head_angles: np.ndarray[np.float32], exaggeration_factor: float
+) -> np.ndarray[np.float32]:
+    """
+    Exaggerate head angles by a given factor.
+
+    Args:
+        head_angles: Sequence of pitch, yaw, roll values of shape (temporal_dim, 3)
+        exaggeration_factor: Factor to exaggerate the head angles by
+
+    Returns:
+        Exaggerated head angles of shape (temporal_dim, 3)
+    """
+    return head_angles * exaggeration_factor
+
+
+def unscale_and_uncenter_head_angles(
+    head_angles: np.ndarray[np.float32],
+    mean_pos: Optional[np.ndarray[np.float32]] = None,
+    bad_frames: List[int] = [],
+) -> np.ndarray[np.float32]:
+    """
+    Rescale head angles in range [-1, 1] to [-pi, pi] and uncenter them.
+
+    Args:
+        head_angles: Sequence of pitch, yaw, roll values of shape (temporal_dim, 3)
+        mean_pos: Mean position to offset the head angles of shape (3,)
+        bad_frames: List of indices of frames where face detection failed
+
+    Returns:
+        Array of unscaled and uncentered head angles of shape (temporal_dim, 3)
+    """
+    if mean_pos is None:
+        mean_pos = np.zeros(3).astype(np.float32)
+    good_frames = [i for i in range(head_angles.shape[0]) if i not in bad_frames]
+    head_angles[good_frames] = head_angles[good_frames] + mean_pos
+    head_angles[good_frames] = head_angles[good_frames] * pi
+    return head_angles
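
To make the exaggeration math concrete: each targeted value is shifted by exaggerate_above, scaled by the factor, shifted back, and clipped to [0, 1], so with exaggerate_above = 0.1 and a factor of 2 a jawOpen value of 0.3 becomes (0.3 - 0.1) * 2 + 0.1 = 0.5, while 0.05 is pushed down and clipped to 0.0. A small sketch (illustrative only, not part of the commit):

import numpy as np
from model_demo.inference.constants import BLENDSHAPE_NAMES
from model_demo.inference.landmarks import clean_up_blendshapes

frames = np.zeros((2, len(BLENDSHAPE_NAMES)), dtype=np.float32)
jaw_open = BLENDSHAPE_NAMES.index("jawOpen")
frames[:, jaw_open] = [0.3, 0.05]

out = clean_up_blendshapes(
    frames,
    mouth_exaggeration=2.0,
    brow_exaggeration=1.0,
    unsquinch_fix=0.0,
    eye_contact_fix=0.0,
    exaggerate_above=0.1,
)
print(out[:, jaw_open])  # approximately [0.5, 0.0]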
requirements.txt ADDED
@@ -0,0 +1,3 @@
+pydub
+numpy
+onnxruntime