Merge branch 'master' into huggingface-zero-gpu
Files changed:
- .github/workflows/ci.yml +4 -1
- .github/workflows/publish-docker.yml +37 -0
- .gitignore +2 -0
- LICENSE +201 -0
- README.md +3 -2
- app.py +77 -39
- i18n/translation.yaml +80 -0
- modules/live_portrait/live_portrait_inferencer.py +170 -198
- modules/utils/constants.py +7 -1
- modules/utils/image_helper.py +1 -0
- modules/utils/paths.py +9 -2
- modules/utils/video_helper.py +315 -0
- requirements.txt +7 -1
- tests/test_config.py +63 -2
- tests/test_video_creation.py +39 -0
    	
.github/workflows/ci.yml  CHANGED

@@ -28,8 +28,11 @@ jobs:
         with:
           python-version: ${{ matrix.python }}

+      - name: Install ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg
+
       - name: Install dependencies
-        run: pip install -r requirements.txt pytest
+        run: pip install -r requirements.txt pytest scikit-image moviepy

       - name: Run test
         run: python -m pytest -rs tests
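The CI change installs ffmpeg plus scikit-image and moviepy before running the tests, which suggests the new video tests shell out to ffmpeg. A minimal preflight sketch (illustrative, not code from this repo) of how a test suite can fail fast when ffmpeg is missing:

```
import shutil

def ensure_ffmpeg() -> str:
    """Return the ffmpeg executable path, or raise if it is not on PATH."""
    path = shutil.which("ffmpeg")
    if path is None:
        raise RuntimeError(
            "ffmpeg not found on PATH; on Ubuntu CI: "
            "sudo apt-get update && sudo apt-get install -y ffmpeg"
        )
    return path
```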
    	
.github/workflows/publish-docker.yml  ADDED

@@ -0,0 +1,37 @@
name: Publish to Docker Hub

on:
  push:
    branches:
      - master

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./docker/Dockerfile
          push: true
          tags: ${{ secrets.DOCKER_USERNAME }}/advancedliveportrait-webui:latest

      - name: Log out of Docker Hub
        run: docker logout
    	
.gitignore  CHANGED

@@ -4,5 +4,7 @@ models/
 outputs/
 *.png
 *.jpg
+*.jpeg
+**/__pycache__

 **/.pytest_cache
    	
LICENSE  ADDED

@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2024 jhj0517

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
    	
README.md  CHANGED

@@ -25,7 +25,8 @@ You can try it in Colab
 # Installation And Running
 ### Prerequisite
 1. `3.9` <= `python` <= `3.12` : https://www.python.org/downloads/release/python-3110/
-
+2. **(Optional, only if you're using Nvidia GPU)** CUDA 12.4 : https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows
+3. (Optional, only needed if you use Video Driven) `FFmpeg`:  https://ffmpeg.org/download.html <br> After installing `FFmpeg`, make sure to add the FFmpeg/bin folder to your **system PATH**!
 ## Run Locally
 1. git clone this repository
 ```

@@ -51,7 +52,7 @@ If you're using Windows, right-click the script and then click on ***Run with Powershell***
 ```
 git clone https://github.com/jhj0517/AdvancedLivePortrait-WebUI.git
 ```
-2. Build the 
+2. Build the image
 ```
 docker compose -f docker/docker-compose.yaml build
 ```
    	
app.py  CHANGED

@@ -20,7 +20,7 @@ class App:
         )

     @staticmethod
-    def 
+    def create_expression_parameters():
         return [
             gr.Dropdown(label=_("Model Type"), visible=False, interactive=False,
                         choices=[item.value for item in ModelType], value=ModelType.HUMAN.value),

@@ -38,10 +38,21 @@ class App:
             gr.Slider(label=_("WOO"), minimum=-20, maximum=20, step=0.2, value=0),
             gr.Slider(label=_("Smile"), minimum=-2.0, maximum=2.0, step=0.01, value=0),
             gr.Slider(label=_("Source Ratio"), minimum=0, maximum=1, step=0.01, value=1),
-            gr.Slider(label=_("Sample Ratio"), minimum=-0.2, maximum=1.2, step=0.01, value=1),
-            gr.Dropdown(label=_("Sample Parts"),
+            gr.Slider(label=_("Sample Ratio"), minimum=-0.2, maximum=1.2, step=0.01, value=1, visible=False),
+            gr.Dropdown(label=_("Sample Parts"), visible=False,
                         choices=[part.value for part in SamplePart], value=SamplePart.ALL.value),
-            gr.Slider(label=_("Crop Factor"), minimum=1.5, maximum=2.5, step=0.1, value=
+            gr.Slider(label=_("Face Crop Factor"), minimum=1.5, maximum=2.5, step=0.1, value=2)
+        ]
+
+    @staticmethod
+    def create_video_parameters():
+        return [
+            gr.Dropdown(label=_("Model Type"), visible=False, interactive=False,
+                        choices=[item.value for item in ModelType],
+                        value=ModelType.HUMAN.value),
+            gr.Slider(label=_("First frame eyes alignment factor"), minimum=0, maximum=1, step=0.01, value=1),
+            gr.Slider(label=_("First frame mouth alignment factor"), minimum=0, maximum=1, step=0.01, value=1),
+            gr.Slider(label=_("Face Crop Factor"), minimum=1.5, maximum=2.5, step=0.1, value=2),
         ]

     def launch(self):

@@ -49,41 +60,68 @@ class App:
             with self.i18n:
                 gr.Markdown(REPO_MARKDOWN, elem_id="md_project")

-                with gr.
-                    with gr.
+                with gr.Tabs():
+                    with gr.TabItem(_("Expression Editor")):
+                        with gr.Row():
+                            with gr.Column():
+                                img_ref = gr.Image(label=_("Reference Image"))
+                        with gr.Row():
+                            btn_gen = gr.Button("GENERATE", visible=False)
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=9):
+                                img_out = gr.Image(label=_("Output Image"))
+                            with gr.Column(scale=1):
+                                expression_parameters = self.create_expression_parameters()
+                                btn_openfolder = gr.Button('📂')
+                                with gr.Accordion("Opt in features", visible=False):
+                                    img_sample = gr.Image()
+
+                        params = expression_parameters + [img_ref]
+                        opt_in_features_params = [img_sample]
+
+                        gr.on(
+                            triggers=[param.change for param in params],
+                            fn=self.inferencer.edit_expression,
+                            inputs=params + opt_in_features_params,
+                            outputs=img_out,
+                            show_progress="minimal",
+                            queue=True
+                        )
+
+                        btn_openfolder.click(
+                            fn=lambda: self.open_folder(self.args.output_dir), inputs=None, outputs=None
+                        )
+
+                        btn_gen.click(self.inferencer.edit_expression,
+                                      inputs=params + opt_in_features_params,
+                                      outputs=img_out)
+
+                    with gr.TabItem(_("Video Driven")):
+                        with gr.Row():
+                            img_ref = gr.Image(label=_("Reference Image"))
+                            vid_driven = gr.Video(label=_("Expression Video"))
+                            with gr.Column():
+                                vid_params = self.create_video_parameters()
+
+                        with gr.Row():
+                            btn_gen = gr.Button(_("GENERATE"), variant="primary")
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=9):
+                                vid_out = gr.Video(label=_("Output Video"), scale=9)
+                            with gr.Column(scale=1):
+                                btn_openfolder = gr.Button('📂')
+
+                        params = vid_params + [img_ref, vid_driven]
+
+                        btn_gen.click(
+                            fn=self.inferencer.create_video,
+                            inputs=params,
+                            outputs=vid_out
+                        )
+                        btn_openfolder.click(
+                            fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "videos")),
+                            inputs=None, outputs=None
+                        )

         gradio_launch_args = {
             "inbrowser": self.args.inbrowser,
    	
        i18n/translation.yaml
    CHANGED
    
    | @@ -24,6 +24,14 @@ en: # English | |
| 24 | 
             
              OnlyEyes: OnlyEyes
         | 
| 25 | 
             
              All: All
         | 
| 26 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 27 |  | 
| 28 | 
             
            ko: # Korean
         | 
| 29 | 
             
              Language: 언어
         | 
| @@ -51,6 +59,14 @@ ko: # Korean | |
| 51 | 
             
              OnlyEyes: 눈만
         | 
| 52 | 
             
              All: 전부
         | 
| 53 | 
             
              Value above 5 may appear distorted: 5 이상은 왜곡돼 보일 수 있습니다.
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 54 |  | 
| 55 | 
             
            ja: # Japanese
         | 
| 56 | 
             
              Language: 言語
         | 
| @@ -78,6 +94,14 @@ ja: # Japanese | |
| 78 | 
             
              OnlyEyes: OnlyEyes
         | 
| 79 | 
             
              All: All
         | 
| 80 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 81 |  | 
| 82 | 
             
            es: # Spanish
         | 
| 83 | 
             
              Language: Idioma
         | 
| @@ -105,6 +129,14 @@ es: # Spanish | |
| 105 | 
             
              OnlyEyes: OnlyEyes
         | 
| 106 | 
             
              All: All
         | 
| 107 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 108 |  | 
| 109 | 
             
            fr: # French
         | 
| 110 | 
             
              Language: Langue
         | 
| @@ -132,6 +164,14 @@ fr: # French | |
| 132 | 
             
              OnlyEyes: OnlyEyes
         | 
| 133 | 
             
              All: All
         | 
| 134 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 135 |  | 
| 136 | 
             
            de: # German
         | 
| 137 | 
             
              Language: Sprache
         | 
| @@ -159,6 +199,14 @@ de: # German | |
| 159 | 
             
              OnlyEyes: OnlyEyes
         | 
| 160 | 
             
              All: All
         | 
| 161 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 162 |  | 
| 163 | 
             
            zh: # Chinese
         | 
| 164 | 
             
              Language: 语言
         | 
| @@ -186,6 +234,14 @@ zh: # Chinese | |
| 186 | 
             
              OnlyEyes: OnlyEyes
         | 
| 187 | 
             
              All: All
         | 
| 188 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 189 |  | 
| 190 | 
             
            uk: # Ukrainian
         | 
| 191 | 
             
              Language: Мова
         | 
| @@ -213,6 +269,14 @@ uk: # Ukrainian | |
| 213 | 
             
              OnlyEyes: OnlyEyes
         | 
| 214 | 
             
              All: All
         | 
| 215 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 216 |  | 
| 217 | 
             
            ru: # Russian
         | 
| 218 | 
             
              Language: Язык
         | 
| @@ -240,6 +304,14 @@ ru: # Russian | |
| 240 | 
             
              OnlyEyes: OnlyEyes
         | 
| 241 | 
             
              All: All
         | 
| 242 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 243 |  | 
| 244 | 
             
            tr: # Turkish
         | 
| 245 | 
             
              Language: Dil
         | 
| @@ -267,3 +339,11 @@ tr: # Turkish | |
| 267 | 
             
              OnlyEyes: OnlyEyes
         | 
| 268 | 
             
              All: All
         | 
| 269 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 24 | 
             
              OnlyEyes: OnlyEyes
         | 
| 25 | 
             
              All: All
         | 
| 26 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 27 | 
            +
              Expression Editor: Expression Editor
         | 
| 28 | 
            +
              Video Driven: Video Driven
         | 
| 29 | 
            +
              Expression Video: Expression Video
         | 
| 30 | 
            +
              GENERATE: GENERATE
         | 
| 31 | 
            +
              Output Video: Output Video
         | 
| 32 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 33 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 34 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 35 |  | 
| 36 | 
             
            ko: # Korean
         | 
| 37 | 
             
              Language: 언어
         | 
|  | |
| 59 | 
             
              OnlyEyes: 눈만
         | 
| 60 | 
             
              All: 전부
         | 
| 61 | 
             
              Value above 5 may appear distorted: 5 이상은 왜곡돼 보일 수 있습니다.
         | 
| 62 | 
            +
              Expression Editor: 표정 편집기
         | 
| 63 | 
            +
              Video Driven: 영상 변환
         | 
| 64 | 
            +
              Expression Video: 표정 영상
         | 
| 65 | 
            +
              GENERATE: 생성
         | 
| 66 | 
            +
              Output Video: 결과 영상
         | 
| 67 | 
            +
              First frame mouth alignment factor: 첫 프레임 입 반영 비율
         | 
| 68 | 
            +
              First frame eyes alignment factor: 첫 프레임 눈 반영 비율
         | 
| 69 | 
            +
              Face Crop Factor: 얼굴 크롭 비율
         | 
| 70 |  | 
| 71 | 
             
            ja: # Japanese
         | 
| 72 | 
             
              Language: 言語
         | 
|  | |
| 94 | 
             
              OnlyEyes: OnlyEyes
         | 
| 95 | 
             
              All: All
         | 
| 96 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 97 | 
            +
              Expression Editor: Expression Editor
         | 
| 98 | 
            +
              Video Driven: Video Driven
         | 
| 99 | 
            +
              Expression Video: Expression Video
         | 
| 100 | 
            +
              GENERATE: GENERATE
         | 
| 101 | 
            +
              Output Video: Output Video
         | 
| 102 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 103 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 104 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 105 |  | 
| 106 | 
             
            es: # Spanish
         | 
| 107 | 
             
              Language: Idioma
         | 
|  | |
| 129 | 
             
              OnlyEyes: OnlyEyes
         | 
| 130 | 
             
              All: All
         | 
| 131 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 132 | 
            +
              Expression Editor: Expression Editor
         | 
| 133 | 
            +
              Video Driven: Video Driven
         | 
| 134 | 
            +
              Expression Video: Expression Video
         | 
| 135 | 
            +
              GENERATE: GENERATE
         | 
| 136 | 
            +
              Output Video: Output Video
         | 
| 137 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 138 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 139 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 140 |  | 
| 141 | 
             
            fr: # French
         | 
| 142 | 
             
              Language: Langue
         | 
|  | |
| 164 | 
             
              OnlyEyes: OnlyEyes
         | 
| 165 | 
             
              All: All
         | 
| 166 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 167 | 
            +
              Expression Editor: Expression Editor
         | 
| 168 | 
            +
              Video Driven: Video Driven
         | 
| 169 | 
            +
              Expression Video: Expression Video
         | 
| 170 | 
            +
              GENERATE: GENERATE
         | 
| 171 | 
            +
              Output Video: Output Video
         | 
| 172 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 173 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 174 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 175 |  | 
| 176 | 
             
            de: # German
         | 
| 177 | 
             
              Language: Sprache
         | 
|  | |
| 199 | 
             
              OnlyEyes: OnlyEyes
         | 
| 200 | 
             
              All: All
         | 
| 201 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 202 | 
            +
              Expression Editor: Expression Editor
         | 
| 203 | 
            +
              Video Driven: Video Driven
         | 
| 204 | 
            +
              Expression Video: Expression Video
         | 
| 205 | 
            +
              GENERATE: GENERATE
         | 
| 206 | 
            +
              Output Video: Output Video
         | 
| 207 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 208 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 209 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 210 |  | 
| 211 | 
             
            zh: # Chinese
         | 
| 212 | 
             
              Language: 语言
         | 
|  | |
| 234 | 
             
              OnlyEyes: OnlyEyes
         | 
| 235 | 
             
              All: All
         | 
| 236 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 237 | 
            +
              Expression Editor: Expression Editor
         | 
| 238 | 
            +
              Video Driven: Video Driven
         | 
| 239 | 
            +
              Expression Video: Expression Video
         | 
| 240 | 
            +
              GENERATE: GENERATE
         | 
| 241 | 
            +
              Output Video: Output Video
         | 
| 242 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 243 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 244 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 245 |  | 
| 246 | 
             
            uk: # Ukrainian
         | 
| 247 | 
             
              Language: Мова
         | 
|  | |
| 269 | 
             
              OnlyEyes: OnlyEyes
         | 
| 270 | 
             
              All: All
         | 
| 271 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 272 | 
            +
              Expression Editor: Expression Editor
         | 
| 273 | 
            +
              Video Driven: Video Driven
         | 
| 274 | 
            +
              Expression Video: Expression Video
         | 
| 275 | 
            +
              GENERATE: GENERATE
         | 
| 276 | 
            +
              Output Video: Output Video
         | 
| 277 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 278 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 279 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 280 |  | 
| 281 | 
             
            ru: # Russian
         | 
| 282 | 
             
              Language: Язык
         | 
|  | |
| 304 | 
             
              OnlyEyes: OnlyEyes
         | 
| 305 | 
             
              All: All
         | 
| 306 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 307 | 
            +
              Expression Editor: Expression Editor
         | 
| 308 | 
            +
              Video Driven: Video Driven
         | 
| 309 | 
            +
              Expression Video: Expression Video
         | 
| 310 | 
            +
              GENERATE: GENERATE
         | 
| 311 | 
            +
              Output Video: Output Video
         | 
| 312 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 313 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 314 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
| 315 |  | 
| 316 | 
             
            tr: # Turkish
         | 
| 317 | 
             
              Language: Dil
         | 
|  | |
| 339 | 
             
              OnlyEyes: OnlyEyes
         | 
| 340 | 
             
              All: All
         | 
| 341 | 
             
              Value above 5 may appear distorted: Value above 5 may appear distorted
         | 
| 342 | 
            +
              Expression Editor: Expression Editor
         | 
| 343 | 
            +
              Video Driven: Video Driven
         | 
| 344 | 
            +
              Expression Video: Expression Video
         | 
| 345 | 
            +
              GENERATE: GENERATE
         | 
| 346 | 
            +
              Output Video: Output Video
         | 
| 347 | 
            +
              First frame mouth alignment factor: First frame mouth alignment factor
         | 
| 348 | 
            +
              First frame eyes alignment factor: First frame eyes alignment factor
         | 
| 349 | 
            +
              Face Crop Factor: Face Crop Factor
         | 
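
Note that the new UI strings (Expression Editor, Video Driven, Expression Video, GENERATE, Output Video, the two first-frame alignment factors, and Face Crop Factor) are appended to the zh, uk, ru, and tr locales with the English key doubled as the value, i.e. as untranslated placeholders; the UI will render the English text until real translations land. A minimal sketch for spotting such placeholders, assuming standard PyYAML and the `<lang>: {key: value}` layout shown above:

```python
# Minimal sketch, assuming PyYAML; translation.yaml maps language codes
# to {english_key: translated_value} as in the diff above.
import yaml

with open("i18n/translation.yaml", encoding="utf-8") as f:
    translations = yaml.safe_load(f)

for lang, entries in translations.items():
    # A value identical to its key is an untranslated placeholder.
    placeholders = [k for k, v in entries.items() if k == v]
    if placeholders:
        print(f"{lang}: {len(placeholders)} untranslated, e.g. {placeholders[:3]}")
```
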
    	
        modules/live_portrait/live_portrait_inferencer.py
    CHANGED
    
    | @@ -4,17 +4,19 @@ import cv2 | |
| 4 | 
             
            import time
         | 
| 5 | 
             
            import copy
         | 
| 6 | 
             
            import dill
         | 
|  | |
| 7 | 
             
            from ultralytics import YOLO
         | 
| 8 | 
             
            import safetensors.torch
         | 
| 9 | 
             
            import gradio as gr
         | 
| 10 | 
             
            from gradio_i18n import Translate, gettext as _
         | 
| 11 | 
             
            from ultralytics.utils import LOGGER as ultralytics_logger
         | 
| 12 | 
             
            from enum import Enum
         | 
| 13 | 
            -
            from typing import Union
         | 
| 14 | 
             
            import spaces
         | 
| 15 |  | 
| 16 | 
             
            from modules.utils.paths import *
         | 
| 17 | 
             
            from modules.utils.image_helper import *
         | 
|  | |
| 18 | 
             
            from modules.live_portrait.model_downloader import *
         | 
| 19 | 
             
            from modules.live_portrait.live_portrait_wrapper import LivePortraitWrapper
         | 
| 20 | 
             
            from modules.utils.camera import get_rotation_matrix
         | 
| @@ -33,8 +35,17 @@ class LivePortraitInferencer: | |
| 33 | 
             
                             model_dir: str = MODELS_DIR,
         | 
| 34 | 
             
                             output_dir: str = OUTPUTS_DIR):
         | 
| 35 | 
             
                    self.model_dir = model_dir
         | 
| 36 | 
            -
                    os.makedirs(os.path.join(self.model_dir, "animal"), exist_ok=True)
         | 
| 37 | 
             
                    self.output_dir = output_dir
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 38 | 
             
                    self.model_config = load_yaml(MODEL_CONFIG)["model_params"]
         | 
| 39 |  | 
| 40 | 
             
                    self.appearance_feature_extractor = None
         | 
| @@ -121,7 +132,7 @@ class LivePortraitInferencer: | |
| 121 | 
             
                    )
         | 
| 122 | 
             
                    self.stitching_retargeting_module = {"stitching": self.stitching_retargeting_module}
         | 
| 123 |  | 
| 124 | 
            -
                    if self.pipeline is None:
         | 
| 125 | 
             
                        self.pipeline = LivePortraitWrapper(
         | 
| 126 | 
             
                            InferenceConfig(),
         | 
| 127 | 
             
                            self.appearance_feature_extractor,
         | 
| @@ -137,26 +148,24 @@ class LivePortraitInferencer: | |
| 137 | 
             
                @spaces.GPU
         | 
| 138 | 
             
                def edit_expression(self,
         | 
| 139 | 
             
                                    model_type: str = ModelType.HUMAN.value,
         | 
| 140 | 
            -
                                    rotate_pitch=0,
         | 
| 141 | 
            -
                                    rotate_yaw=0,
         | 
| 142 | 
            -
                                    rotate_roll=0,
         | 
| 143 | 
            -
                                    blink=0,
         | 
| 144 | 
            -
                                    eyebrow=0,
         | 
| 145 | 
            -
                                    wink=0,
         | 
| 146 | 
            -
                                    pupil_x=0,
         | 
| 147 | 
            -
                                    pupil_y=0,
         | 
| 148 | 
            -
                                    aaa=0,
         | 
| 149 | 
            -
                                    eee=0,
         | 
| 150 | 
            -
                                    woo=0,
         | 
| 151 | 
            -
                                    smile=0,
         | 
| 152 | 
            -
                                    src_ratio=1,
         | 
| 153 | 
            -
                                    sample_ratio=1,
         | 
| 154 | 
            -
                                    sample_parts= | 
| 155 | 
            -
                                    crop_factor= | 
| 156 | 
            -
                                    src_image=None,
         | 
| 157 | 
            -
                                    sample_image=None,
         | 
| 158 | 
            -
                                    motion_link=None,
         | 
| 159 | 
            -
                                    add_exp=None):
         | 
| 160 | 
             
                    if isinstance(model_type, ModelType):
         | 
| 161 | 
             
                        model_type = model_type.value
         | 
| 162 | 
             
                    if model_type not in [mode.value for mode in ModelType]:
         | 
| @@ -168,200 +177,159 @@ class LivePortraitInferencer: | |
| 168 | 
             
                        )
         | 
| 169 |  | 
| 170 | 
             
                    try:
         | 
| 171 | 
            -
                         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 172 |  | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
                             | 
| 176 | 
            -
                             | 
| 177 | 
            -
             | 
| 178 | 
            -
                             | 
| 179 | 
            -
             | 
| 180 | 
            -
             | 
| 181 | 
            -
             | 
| 182 | 
            -
                             | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 185 | 
            -
             | 
| 186 | 
            -
             | 
| 187 | 
            -
             | 
| 188 | 
            -
             | 
| 189 | 
            -
             | 
| 190 | 
            -
             | 
| 191 | 
            -
             | 
| 192 | 
            -
             | 
| 193 | 
            -
             | 
| 194 | 
            -
             | 
| 195 | 
            -
             | 
| 196 | 
            -
             | 
| 197 | 
            -
             | 
| 198 | 
            -
             | 
| 199 | 
            -
                                 | 
| 200 | 
            -
             | 
| 201 | 
            -
                                 | 
| 202 | 
            -
             | 
| 203 | 
            -
             | 
| 204 | 
            -
             | 
| 205 | 
            -
             | 
| 206 | 
            -
             | 
| 207 | 
            -
                             | 
| 208 | 
            -
             | 
| 209 | 
            -
                             | 
| 210 | 
            -
             | 
| 211 | 
            -
             | 
| 212 | 
            -
             | 
| 213 | 
            -
                             | 
| 214 | 
            -
             | 
| 215 | 
            -
             | 
| 216 | 
            -
             | 
| 217 | 
            -
             | 
| 218 | 
            -
             | 
| 219 | 
            -
             | 
| 220 | 
            -
             | 
| 221 | 
            -
             | 
| 222 | 
            -
             | 
| 223 | 
            -
             | 
| 224 | 
            -
                        new_rotate = get_rotation_matrix(s_info['pitch'] + es.r[0], s_info['yaw'] + es.r[1],
         | 
| 225 | 
            -
                                                         s_info['roll'] + es.r[2])
         | 
| 226 | 
            -
                        x_d_new = (s_info['scale'] * (1 + es.s)) * ((s_exp + es.e) @ new_rotate) + s_info['t']
         | 
| 227 | 
            -
             | 
| 228 | 
            -
                        x_d_new = self.pipeline.stitching(psi.x_s_user, x_d_new)
         | 
| 229 | 
            -
             | 
| 230 | 
            -
                        crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, x_d_new)
         | 
| 231 | 
            -
                        crop_out = self.pipeline.parse_output(crop_out['out'])[0]
         | 
| 232 | 
            -
             | 
| 233 | 
            -
                        crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb), cv2.INTER_LINEAR)
         | 
| 234 | 
            -
                        out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(np.uint8)
         | 
| 235 | 
            -
             | 
| 236 | 
            -
                        temp_out_img_path, out_img_path = get_auto_incremental_file_path(TEMP_DIR, "png"), get_auto_incremental_file_path(OUTPUTS_DIR, "png")
         | 
| 237 | 
            -
                        save_image(numpy_array=crop_out, output_path=temp_out_img_path)
         | 
| 238 | 
            -
                        save_image(numpy_array=out, output_path=out_img_path)
         | 
| 239 | 
            -
             | 
| 240 | 
            -
                        new_editor_link.append(es)
         | 
| 241 | 
            -
             | 
| 242 | 
            -
                        return out
         | 
| 243 | 
             
                    except Exception as e:
         | 
| 244 | 
             
                        raise
         | 
| 245 |  | 
| 246 | 
             
                @spaces.GPU
         | 
| 247 | 
             
                def create_video(self,
         | 
| 248 | 
            -
                                  | 
| 249 | 
            -
                                  | 
| 250 | 
            -
                                  | 
| 251 | 
            -
                                  | 
| 252 | 
            -
                                  | 
| 253 | 
            -
                                  | 
| 254 | 
            -
                                  | 
| 255 | 
            -
                                  | 
| 256 | 
            -
             | 
| 257 | 
            -
             | 
| 258 | 
            -
             | 
| 259 | 
            -
             | 
| 260 | 
            -
                        return None, None
         | 
| 261 | 
            -
                    src_length = 1
         | 
| 262 | 
            -
             | 
| 263 | 
            -
                    if src_images is None:
         | 
| 264 | 
            -
                        if motion_link is not None:
         | 
| 265 | 
            -
                            self.psi_list = [motion_link[0]]
         | 
| 266 | 
            -
                        else:
         | 
| 267 | 
            -
                            return None, None
         | 
| 268 | 
            -
             | 
| 269 | 
            -
                    if src_images is not None:
         | 
| 270 | 
            -
                        src_length = len(src_images)
         | 
| 271 | 
            -
                        if id(src_images) != id(self.src_images) or self.crop_factor != crop_factor:
         | 
| 272 | 
            -
                            self.crop_factor = crop_factor
         | 
| 273 | 
            -
                            self.src_images = src_images
         | 
| 274 | 
            -
                            if 1 < src_length:
         | 
| 275 | 
            -
                                self.psi_list = self.prepare_source(src_images, crop_factor, True, tracking_src_vid)
         | 
| 276 | 
            -
                            else:
         | 
| 277 | 
            -
                                self.psi_list = [self.prepare_source(src_images, crop_factor)]
         | 
| 278 |  | 
| 279 | 
            -
                     | 
| 280 | 
            -
             | 
| 281 | 
            -
                        return None,None
         | 
| 282 | 
            -
                    cmd_idx = 0
         | 
| 283 |  | 
| 284 | 
            -
             | 
| 285 | 
            -
             | 
| 286 | 
            -
             | 
| 287 | 
            -
             | 
| 288 | 
            -
                            self.driving_values = self.prepare_driving_video(driving_images)
         | 
| 289 | 
            -
                        driving_length = len(self.driving_values)
         | 
| 290 |  | 
| 291 | 
            -
             | 
| 292 |  | 
| 293 | 
            -
             | 
| 294 | 
            -
                         | 
| 295 |  | 
| 296 | 
            -
             | 
| 297 | 
            -
             | 
| 298 | 
            -
             | 
| 299 | 
            -
             | 
|  | |
|  | |
| 300 |  | 
| 301 | 
            -
             | 
| 302 | 
            -
                    for i in range(total_length):
         | 
| 303 |  | 
| 304 | 
            -
                         | 
| 305 | 
            -
             | 
| 306 | 
            -
             | 
| 307 | 
            -
                            s_es = ExpressionSet(erst=(s_info['kp'] + s_info['exp'], torch.Tensor([0, 0, 0]), s_info['scale'], s_info['t']))
         | 
| 308 |  | 
| 309 | 
            -
                         | 
|  | |
|  | |
| 310 |  | 
| 311 | 
            -
             | 
| 312 | 
            -
             | 
| 313 | 
            -
             | 
| 314 | 
            -
             | 
| 315 | 
            -
                                c_i_es.add(cmd.es)
         | 
| 316 | 
            -
                                c_i_es.sub(c_o_es)
         | 
| 317 | 
            -
                            elif 0 < cmd.keep:
         | 
| 318 | 
            -
                                cmd.keep -= 1
         | 
| 319 |  | 
| 320 | 
            -
             | 
| 321 |  | 
| 322 | 
            -
             | 
| 323 | 
            -
             | 
| 324 | 
            -
             | 
| 325 | 
            -
                                    c_o_es = ExpressionSet(es=c_i_es)
         | 
| 326 | 
            -
                                    cmd = cmd_list[cmd_idx]
         | 
| 327 | 
            -
                                    c_o_es.div(cmd.change)
         | 
| 328 | 
            -
                        elif 0 < cmd_length:
         | 
| 329 | 
            -
                            new_es.add(c_i_es)
         | 
| 330 |  | 
| 331 | 
            -
             | 
| 332 | 
            -
             | 
| 333 | 
            -
                            d_i_r = torch.Tensor([d_i_info['pitch'], d_i_info['yaw'], d_i_info['roll']])#.float().to(device="cuda:0")
         | 
| 334 |  | 
| 335 | 
            -
             | 
| 336 | 
            -
             | 
| 337 |  | 
| 338 | 
            -
             | 
| 339 | 
            -
             | 
|  | |
| 340 |  | 
| 341 | 
            -
             | 
| 342 | 
            -
             | 
| 343 | 
            -
             | 
|  | |
|  | |
|  | |
| 344 |  | 
| 345 | 
            -
             | 
| 346 | 
            -
             | 
| 347 | 
            -
             | 
| 348 | 
            -
             | 
| 349 | 
            -
                        crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, d_new)
         | 
| 350 | 
            -
                        crop_out = self.pipeline.parse_output(crop_out['out'])[0]
         | 
| 351 |  | 
| 352 | 
            -
             | 
| 353 | 
            -
             | 
| 354 | 
            -
                        out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(
         | 
| 355 | 
            -
                            np.uint8)
         | 
| 356 | 
            -
                        out_list.append(out)
         | 
| 357 |  | 
| 358 | 
            -
             | 
| 359 |  | 
| 360 | 
            -
             | 
| 361 | 
            -
                        return None
         | 
| 362 |  | 
| 363 | 
            -
             | 
| 364 | 
            -
                     | 
|  | |
| 365 |  | 
| 366 | 
             
                def download_if_no_models(self,
         | 
| 367 | 
             
                                          model_type: str = ModelType.HUMAN.value,
         | 
| @@ -535,7 +503,6 @@ class LivePortraitInferencer: | |
| 535 | 
             
                @staticmethod
         | 
| 536 | 
             
                def retargeting(delta_out, driving_exp, factor, idxes):
         | 
| 537 | 
             
                    for idx in idxes:
         | 
| 538 | 
            -
                        # delta_out[0, idx] -= src_exp[0, idx] * factor
         | 
| 539 | 
             
                        delta_out[0, idx] += driving_exp[0, idx] * factor
         | 
| 540 |  | 
| 541 | 
             
                @staticmethod
         | 
| @@ -560,8 +527,15 @@ class LivePortraitInferencer: | |
| 560 |  | 
| 561 | 
             
                @spaces.GPU
         | 
| 562 | 
             
                def prepare_src_image(self, img):
         | 
| 563 | 
            -
                     | 
| 564 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 565 | 
             
                    if h != input_shape[0] or w != input_shape[1]:
         | 
| 566 | 
             
                        if 256 < h: interpolation = cv2.INTER_AREA
         | 
| 567 | 
             
                        else: interpolation = cv2.INTER_LINEAR
         | 
| @@ -632,11 +606,9 @@ class LivePortraitInferencer: | |
| 632 | 
             
                    return psi_list
         | 
| 633 |  | 
| 634 | 
             
                def prepare_driving_video(self, face_images):
         | 
| 635 | 
            -
                    print("Prepare driving video...")
         | 
| 636 | 
            -
                    f_img_np = (face_images * 255).byte().numpy()
         | 
| 637 | 
            -
             | 
| 638 | 
             
                    out_list = []
         | 
| 639 | 
            -
                    for f_img in  | 
| 640 | 
             
                        i_d = self.prepare_src_image(f_img)
         | 
| 641 | 
             
                        d_info = self.pipeline.get_kp_info(i_d)
         | 
| 642 | 
             
                        out_list.append(d_info)
         | 
|  | |
| 4 | 
             
            import time
         | 
| 5 | 
             
            import copy
         | 
| 6 | 
             
            import dill
         | 
| 7 | 
            +
            import torch
         | 
| 8 | 
             
            from ultralytics import YOLO
         | 
| 9 | 
             
            import safetensors.torch
         | 
| 10 | 
             
            import gradio as gr
         | 
| 11 | 
             
            from gradio_i18n import Translate, gettext as _
         | 
| 12 | 
             
            from ultralytics.utils import LOGGER as ultralytics_logger
         | 
| 13 | 
             
            from enum import Enum
         | 
| 14 | 
            +
            from typing import Union, List, Dict, Tuple, Optional
         | 
| 15 | 
             
            import spaces
         | 
| 16 |  | 
| 17 | 
             
            from modules.utils.paths import *
         | 
| 18 | 
             
            from modules.utils.image_helper import *
         | 
| 19 | 
            +
            from modules.utils.video_helper import *
         | 
| 20 | 
             
            from modules.live_portrait.model_downloader import *
         | 
| 21 | 
             
            from modules.live_portrait.live_portrait_wrapper import LivePortraitWrapper
         | 
| 22 | 
             
            from modules.utils.camera import get_rotation_matrix
         | 
|  | |
| 35 | 
             
                             model_dir: str = MODELS_DIR,
         | 
| 36 | 
             
                             output_dir: str = OUTPUTS_DIR):
         | 
| 37 | 
             
                    self.model_dir = model_dir
         | 
|  | |
| 38 | 
             
                    self.output_dir = output_dir
         | 
| 39 | 
            +
                    relative_dirs = [
         | 
| 40 | 
            +
                        os.path.join(self.model_dir, "animal"),
         | 
| 41 | 
            +
                        os.path.join(self.output_dir, "videos"),
         | 
| 42 | 
            +
                        os.path.join(self.output_dir, "temp"),
         | 
| 43 | 
            +
                        os.path.join(self.output_dir, "temp", "video_frames"),
         | 
| 44 | 
            +
                        os.path.join(self.output_dir, "temp", "video_frames", "out"),
         | 
| 45 | 
            +
                    ]
         | 
| 46 | 
            +
                    for dir_path in relative_dirs:
         | 
| 47 | 
            +
                        os.makedirs(dir_path, exist_ok=True)
         | 
| 48 | 
            +
             | 
| 49 | 
             
                    self.model_config = load_yaml(MODEL_CONFIG)["model_params"]
         | 
| 50 |  | 
| 51 | 
             
                    self.appearance_feature_extractor = None
         | 
|  | |
| 132 | 
             
                    )
         | 
| 133 | 
             
                    self.stitching_retargeting_module = {"stitching": self.stitching_retargeting_module}
         | 
| 134 |  | 
| 135 | 
            +
                    if self.pipeline is None or model_type != self.model_type:
         | 
| 136 | 
             
                        self.pipeline = LivePortraitWrapper(
         | 
| 137 | 
             
                            InferenceConfig(),
         | 
| 138 | 
             
                            self.appearance_feature_extractor,
         | 
|  | |
| 148 | 
             
                @spaces.GPU
         | 
| 149 | 
             
                def edit_expression(self,
         | 
| 150 | 
             
                                    model_type: str = ModelType.HUMAN.value,
         | 
| 151 | 
            +
                                    rotate_pitch: float = 0,
         | 
| 152 | 
            +
                                    rotate_yaw: float = 0,
         | 
| 153 | 
            +
                                    rotate_roll: float = 0,
         | 
| 154 | 
            +
                                    blink: float = 0,
         | 
| 155 | 
            +
                                    eyebrow: float = 0,
         | 
| 156 | 
            +
                                    wink: float = 0,
         | 
| 157 | 
            +
                                    pupil_x: float = 0,
         | 
| 158 | 
            +
                                    pupil_y: float = 0,
         | 
| 159 | 
            +
                                    aaa: float = 0,
         | 
| 160 | 
            +
                                    eee: float = 0,
         | 
| 161 | 
            +
                                    woo: float = 0,
         | 
| 162 | 
            +
                                    smile: float = 0,
         | 
| 163 | 
            +
                                    src_ratio: float = 1,
         | 
| 164 | 
            +
                                    sample_ratio: float = 1,
         | 
| 165 | 
            +
                                    sample_parts: str = SamplePart.ALL.value,
         | 
| 166 | 
            +
                                    crop_factor: float = 2.3,
         | 
| 167 | 
            +
                                    src_image: Optional[str] = None,
         | 
| 168 | 
            +
                                sample_image: Optional[str] = None) -> Optional[np.ndarray]:
         | 
|  | |
|  | |
| 169 | 
             
                    if isinstance(model_type, ModelType):
         | 
| 170 | 
             
                        model_type = model_type.value
         | 
| 171 | 
             
                    if model_type not in [mode.value for mode in ModelType]:
         | 
|  | |
| 177 | 
             
                        )
         | 
| 178 |  | 
| 179 | 
             
                    try:
         | 
| 180 | 
            +
                        with torch.autocast(device_type=self.device, enabled=(self.device == "cuda")):
         | 
| 181 | 
            +
                            rotate_yaw = -rotate_yaw
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                            if src_image is not None:
         | 
| 184 | 
            +
                                if id(src_image) != id(self.src_image) or self.crop_factor != crop_factor:
         | 
| 185 | 
            +
                                    self.crop_factor = crop_factor
         | 
| 186 | 
            +
                                    self.psi = self.prepare_source(src_image, crop_factor)
         | 
| 187 | 
            +
                                    self.src_image = src_image
         | 
| 188 | 
            +
                            else:
         | 
| 189 | 
            +
                                return None
         | 
| 190 |  | 
| 191 | 
            +
                            psi = self.psi
         | 
| 192 | 
            +
                            s_info = psi.x_s_info
         | 
| 193 | 
            +
                            #delta_new = copy.deepcopy()
         | 
| 194 | 
            +
                            s_exp = s_info['exp'] * src_ratio
         | 
| 195 | 
            +
                            s_exp[0, 5] = s_info['exp'][0, 5]
         | 
| 196 | 
            +
                            s_exp += s_info['kp']
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                            es = ExpressionSet()
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                            if isinstance(sample_image, np.ndarray) and sample_image.size > 0:  # .size avoids ambiguous array truthiness
         | 
| 201 | 
            +
                                if id(self.sample_image) != id(sample_image):
         | 
| 202 | 
            +
                                    self.sample_image = sample_image
         | 
| 203 | 
            +
                                    d_image_np = (sample_image * 255).byte().numpy()
         | 
| 204 | 
            +
                                    d_face = self.crop_face(d_image_np[0], 1.7)
         | 
| 205 | 
            +
                                    i_d = self.prepare_src_image(d_face)
         | 
| 206 | 
            +
                                    self.d_info = self.pipeline.get_kp_info(i_d)
         | 
| 207 | 
            +
                                    self.d_info['exp'][0, 5, 0] = 0
         | 
| 208 | 
            +
                                    self.d_info['exp'][0, 5, 1] = 0
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                                # "OnlyExpression", "OnlyRotation", "OnlyMouth", "OnlyEyes", "All"
         | 
| 211 | 
            +
                                if sample_parts in (SamplePart.ONLY_EXPRESSION.value, SamplePart.ALL.value):
         | 
| 212 | 
            +
                                    es.e += self.d_info['exp'] * sample_ratio
         | 
| 213 | 
            +
                                if sample_parts in (SamplePart.ONLY_ROTATION.value, SamplePart.ALL.value):
         | 
| 214 | 
            +
                                    rotate_pitch += self.d_info['pitch'] * sample_ratio
         | 
| 215 | 
            +
                                    rotate_yaw += self.d_info['yaw'] * sample_ratio
         | 
| 216 | 
            +
                                    rotate_roll += self.d_info['roll'] * sample_ratio
         | 
| 217 | 
            +
                                elif sample_parts == SamplePart.ONLY_MOUTH.value:
         | 
| 218 | 
            +
                                    self.retargeting(es.e, self.d_info['exp'], sample_ratio, (14, 17, 19, 20))
         | 
| 219 | 
            +
                                elif sample_parts == SamplePart.ONLY_EYES.value:
         | 
| 220 | 
            +
                                    self.retargeting(es.e, self.d_info['exp'], sample_ratio, (1, 2, 11, 13, 15, 16))
         | 
| 221 | 
            +
             | 
| 222 | 
            +
                            es.r = self.calc_fe(es.e, blink, eyebrow, wink, pupil_x, pupil_y, aaa, eee, woo, smile,
         | 
| 223 | 
            +
                                                rotate_pitch, rotate_yaw, rotate_roll)
         | 
| 224 | 
            +
             | 
| 225 | 
            +
                            new_rotate = get_rotation_matrix(s_info['pitch'] + es.r[0], s_info['yaw'] + es.r[1],
         | 
| 226 | 
            +
                                                             s_info['roll'] + es.r[2])
         | 
| 227 | 
            +
                            x_d_new = (s_info['scale'] * (1 + es.s)) * ((s_exp + es.e) @ new_rotate) + s_info['t']
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                            x_d_new = self.pipeline.stitching(psi.x_s_user, x_d_new)
         | 
| 230 | 
            +
             | 
| 231 | 
            +
                            crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, x_d_new)
         | 
| 232 | 
            +
                            crop_out = self.pipeline.parse_output(crop_out['out'])[0]
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                            crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb), cv2.INTER_LINEAR)
         | 
| 235 | 
            +
                            out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(np.uint8)
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                            temp_out_img_path, out_img_path = get_auto_incremental_file_path(TEMP_DIR, "png"), get_auto_incremental_file_path(OUTPUTS_DIR, "png")
         | 
| 238 | 
            +
                            save_image(numpy_array=crop_out, output_path=temp_out_img_path)
         | 
| 239 | 
            +
                            save_image(numpy_array=out, output_path=out_img_path)
         | 
| 240 | 
            +
             | 
| 241 | 
            +
                            return out
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 242 | 
             
                    except Exception as e:
         | 
| 243 | 
             
                        raise
         | 
| 244 |  | 
| 245 | 
             
                @spaces.GPU
         | 
| 246 | 
             
                def create_video(self,
         | 
| 247 | 
            +
                                 model_type: str = ModelType.HUMAN.value,
         | 
| 248 | 
            +
                                 retargeting_eyes: float = 1,
         | 
| 249 | 
            +
                                 retargeting_mouth: float = 1,
         | 
| 250 | 
            +
                                 crop_factor: float = 2.3,
         | 
| 251 | 
            +
                                 src_image: Optional[str] = None,
         | 
| 252 | 
            +
                                 driving_vid_path: Optional[str] = None,
         | 
| 253 | 
            +
                                 progress: gr.Progress = gr.Progress()
         | 
| 254 | 
            +
                                 ):
         | 
| 255 | 
            +
                    if self.pipeline is None or model_type != self.model_type:
         | 
| 256 | 
            +
                        self.load_models(
         | 
| 257 | 
            +
                            model_type=model_type
         | 
| 258 | 
            +
                        )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 259 |  | 
| 260 | 
            +
                    try:
         | 
| 261 | 
            +
                        vid_info = get_video_info(vid_input=driving_vid_path)
         | 
|  | |
|  | |
| 262 |  | 
| 263 | 
            +
                        if src_image is not None:
         | 
| 264 | 
            +
                            if id(src_image) != id(self.src_image) or self.crop_factor != crop_factor:
         | 
| 265 | 
            +
                                self.crop_factor = crop_factor
         | 
| 266 | 
            +
                                self.src_image = src_image
         | 
|  | |
|  | |
| 267 |  | 
| 268 | 
            +
                                self.psi_list = [self.prepare_source(src_image, crop_factor)]
         | 
| 269 |  | 
| 270 | 
            +
                    progress(0, desc="Extracting frames from the video...")
         | 
| 271 | 
            +
                        driving_images, vid_sound = extract_frames(driving_vid_path, os.path.join(self.output_dir, "temp", "video_frames")), extract_sound(driving_vid_path)
         | 
| 272 |  | 
| 273 | 
            +
                        driving_length = 0
         | 
| 274 | 
            +
                        if driving_images is not None:
         | 
| 275 | 
            +
                            if id(driving_images) != id(self.driving_images):
         | 
| 276 | 
            +
                                self.driving_images = driving_images
         | 
| 277 | 
            +
                                self.driving_values = self.prepare_driving_video(driving_images)
         | 
| 278 | 
            +
                            driving_length = len(self.driving_values)
         | 
| 279 |  | 
| 280 | 
            +
                        total_length = len(driving_images)
         | 
|  | |
| 281 |  | 
| 282 | 
            +
                        c_i_es = ExpressionSet()
         | 
| 283 | 
            +
                        c_o_es = ExpressionSet()
         | 
| 284 | 
            +
                        d_0_es = None
         | 
|  | |
| 285 |  | 
| 286 | 
            +
                        psi = None
         | 
| 287 | 
            +
                        with torch.autocast(device_type=self.device, enabled=(self.device == "cuda")):
         | 
| 288 | 
            +
                            for i in range(total_length):
         | 
| 289 |  | 
| 290 | 
            +
                                if i == 0:
         | 
| 291 | 
            +
                                    psi = self.psi_list[i]
         | 
| 292 | 
            +
                                    s_info = psi.x_s_info
         | 
| 293 | 
            +
                                    s_es = ExpressionSet(erst=(s_info['kp'] + s_info['exp'], torch.Tensor([0, 0, 0]), s_info['scale'], s_info['t']))
         | 
|  | |
|  | |
|  | |
|  | |
| 294 |  | 
| 295 | 
            +
                                new_es = ExpressionSet(es=s_es)
         | 
| 296 |  | 
| 297 | 
            +
                                if i < driving_length:
         | 
| 298 | 
            +
                                    d_i_info = self.driving_values[i]
         | 
| 299 | 
            +
                                    d_i_r = torch.Tensor([d_i_info['pitch'], d_i_info['yaw'], d_i_info['roll']]) # .float().to(device="cuda:0")
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 300 |  | 
| 301 | 
            +
                                    if d_0_es is None:
         | 
| 302 | 
            +
                                        d_0_es = ExpressionSet(erst=(d_i_info['exp'], d_i_r, d_i_info['scale'], d_i_info['t']))
         | 
|  | |
| 303 |  | 
| 304 | 
            +
                                        self.retargeting(s_es.e, d_0_es.e, retargeting_eyes, (11, 13, 15, 16))
         | 
| 305 | 
            +
                                        self.retargeting(s_es.e, d_0_es.e, retargeting_mouth, (14, 17, 19, 20))
         | 
| 306 |  | 
| 307 | 
            +
                                    new_es.e += d_i_info['exp'] - d_0_es.e
         | 
| 308 | 
            +
                                    new_es.r += d_i_r - d_0_es.r
         | 
| 309 | 
            +
                                    new_es.t += d_i_info['t'] - d_0_es.t
         | 
| 310 |  | 
| 311 | 
            +
                                r_new = get_rotation_matrix(
         | 
| 312 | 
            +
                                    s_info['pitch'] + new_es.r[0], s_info['yaw'] + new_es.r[1], s_info['roll'] + new_es.r[2])
         | 
| 313 | 
            +
                                d_new = new_es.s * (new_es.e @ r_new) + new_es.t
         | 
| 314 | 
            +
                                d_new = self.pipeline.stitching(psi.x_s_user, d_new)
         | 
| 315 | 
            +
                                crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, d_new)
         | 
| 316 | 
            +
                                crop_out = self.pipeline.parse_output(crop_out['out'])[0]
         | 
| 317 |  | 
| 318 | 
            +
                                crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb),
         | 
| 319 | 
            +
                                                                    cv2.INTER_LINEAR)
         | 
| 320 | 
            +
                                out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(
         | 
| 321 | 
            +
                                    np.uint8)
         | 
|  | |
|  | |
| 322 |  | 
| 323 | 
            +
                                out_frame_path = get_auto_incremental_file_path(os.path.join(self.output_dir, "temp", "video_frames", "out"), "png")
         | 
| 324 | 
            +
                                save_image(out, out_frame_path)
         | 
|  | |
|  | |
|  | |
| 325 |  | 
| 326 | 
            +
                                progress(i / total_length, desc=f"Generating frame {i}/{total_length}...")
         | 
| 327 |  | 
| 328 | 
            +
                            video_path = create_video_from_frames(TEMP_VIDEO_OUT_FRAMES_DIR, frame_rate=vid_info.frame_rate, output_dir=os.path.join(self.output_dir, "videos"))
         | 
|  | |
| 329 |  | 
| 330 | 
            +
                            return video_path
         | 
| 331 | 
            +
                    except Exception as e:
         | 
| 332 | 
            +
                        raise
         | 
| 333 |  | 
| 334 | 
             
                def download_if_no_models(self,
         | 
| 335 | 
             
                                          model_type: str = ModelType.HUMAN.value,
         | 
|  | |
| 503 | 
             
                @staticmethod
         | 
| 504 | 
             
                def retargeting(delta_out, driving_exp, factor, idxes):
         | 
| 505 | 
             
                    for idx in idxes:
         | 
|  | |
| 506 | 
             
                        delta_out[0, idx] += driving_exp[0, idx] * factor
         | 
| 507 |  | 
| 508 | 
             
                @staticmethod
         | 
|  | |
| 527 |  | 
| 528 | 
             
                @spaces.GPU
         | 
| 529 | 
             
                def prepare_src_image(self, img):
         | 
| 530 | 
            +
                    if isinstance(img, str):
         | 
| 531 | 
            +
                        img = image_path_to_array(img)
         | 
| 532 | 
            +
             | 
| 533 | 
            +
                    if len(img.shape) <= 3:
         | 
| 534 | 
            +
                        img = img[np.newaxis, ...]
         | 
| 535 | 
            +
             | 
| 536 | 
            +
                    d, h, w, c = img.shape
         | 
| 537 | 
            +
                img = img[0]  # take the first image along the batch dimension
         | 
| 538 | 
            +
                    input_shape = [256, 256]
         | 
| 539 | 
             
                    if h != input_shape[0] or w != input_shape[1]:
         | 
| 540 | 
             
                        if 256 < h: interpolation = cv2.INTER_AREA
         | 
| 541 | 
             
                        else: interpolation = cv2.INTER_LINEAR
         | 
|  | |
| 606 | 
             
                    return psi_list
         | 
| 607 |  | 
| 608 | 
             
                def prepare_driving_video(self, face_images):
         | 
| 609 | 
            +
                    # print("Prepare driving video...")
         | 
|  | |
|  | |
| 610 | 
             
                    out_list = []
         | 
| 611 | 
            +
                    for f_img in face_images:
         | 
| 612 | 
             
                        i_d = self.prepare_src_image(f_img)
         | 
| 613 | 
             
                        d_info = self.pipeline.get_kp_info(i_d)
         | 
| 614 | 
             
                        out_list.append(d_info)
         | 
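
The new create_video path transfers motion relative to the driving video's first frame: the first driving frame's expression/rotation/translation is cached as d_0_es, the retargeting factors blend its eye (indices 11, 13, 15, 16) and mouth (indices 14, 17, 19, 20) keypoints into the source once, and every subsequent frame only adds the delta d_i - d_0 on top of the source keypoints. A standalone NumPy sketch of that idea (the function and shapes here are illustrative, not the module's API):

```python
# Illustrative sketch of first-frame-relative motion transfer, mirroring
# the delta logic in create_video above. Pure NumPy; names are placeholders.
import numpy as np

EYES = [11, 13, 15, 16]
MOUTH = [14, 17, 19, 20]

def transfer_relative(src_exp, driving_exps, eyes_factor=1.0, mouth_factor=1.0):
    d0 = driving_exps[0]
    base = src_exp.copy()
    # First-frame alignment: blend the driving video's initial eye/mouth
    # pose into the source, scaled by the alignment factors.
    base[EYES] += d0[EYES] * eyes_factor
    base[MOUTH] += d0[MOUTH] * mouth_factor
    # Each output frame is the source pose plus the driving delta vs. frame 0,
    # so the animated face starts from its own expression instead of jumping
    # to the driving video's first pose.
    return [base + (d - d0) for d in driving_exps]

frames = transfer_relative(np.zeros((21, 3)),
                           [np.random.randn(21, 3) * 0.01 for _ in range(3)])
```
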
    	
        modules/utils/constants.py
    CHANGED
    
    | @@ -31,4 +31,10 @@ GRADIO_CSS = """ | |
| 31 | 
             
            #blink_slider .md.svelte-7ddecg.chatbot.prose {
         | 
| 32 | 
             
                font-size: 0.7em; 
         | 
| 33 | 
             
            }
         | 
| 34 | 
            -
            """
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 31 | 
             
            #blink_slider .md.svelte-7ddecg.chatbot.prose {
         | 
| 32 | 
             
                font-size: 0.7em; 
         | 
| 33 | 
             
            }
         | 
| 34 | 
            +
            """
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            SOUND_FILE_EXT = ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.m4a', '.wma']
         | 
| 37 | 
            +
            IMAGE_FILE_EXT = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']
         | 
| 38 | 
            +
            VIDEO_FILE_EXT = ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.mpeg', '.mpg', '.m4v', '.3gp', '.ts', '.vob', '.gif']
         | 
| 39 | 
            +
            TRANSPARENT_VIDEO_FILE_EXT = ['.webm', '.mov', '.gif']
         | 
| 40 | 
            +
            SUPPORTED_VIDEO_FILE_EXT = ['.mp4', '.mov', '.webm', '.gif']
         | 
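
The new extension lists give the video pipeline one place to gate inputs; note that .gif appears in both VIDEO_FILE_EXT and TRANSPARENT_VIDEO_FILE_EXT, and SUPPORTED_VIDEO_FILE_EXT is the narrower set actually accepted for output. A small sketch of how they can classify an incoming path; classify_input itself is illustrative, not part of the module:

```python
# Illustrative helper built on the new constants; not part of the module.
from pathlib import Path

from modules.utils.constants import (SOUND_FILE_EXT, IMAGE_FILE_EXT,
                                     VIDEO_FILE_EXT, SUPPORTED_VIDEO_FILE_EXT)

def classify_input(path: str) -> str:
    ext = Path(path).suffix.lower()
    if ext in IMAGE_FILE_EXT:
        return "image"
    if ext in VIDEO_FILE_EXT:
        # Recognized as video, but only a subset is supported end to end.
        return "video" if ext in SUPPORTED_VIDEO_FILE_EXT else "unsupported video"
    if ext in SOUND_FILE_EXT:
        return "sound"
    return "unknown"

print(classify_input("driving.mp4"))  # video
print(classify_input("driving.avi"))  # unsupported video
```
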
    	
        modules/utils/image_helper.py
    CHANGED
    
    | @@ -56,6 +56,7 @@ def calc_crop_limit(center, img_size, crop_size): | |
| 56 | 
             
            def save_image(numpy_array: np.ndarray, output_path: str):
         | 
| 57 | 
             
                out = Image.fromarray(numpy_array)
         | 
| 58 | 
             
                out.save(output_path, compress_level=1, format="png")
         | 
|  | |
| 59 |  | 
| 60 |  | 
| 61 | 
             
            def image_path_to_array(image_path: str) -> np.ndarray:
         | 
|  | |
| 56 | 
             
            def save_image(numpy_array: np.ndarray, output_path: str):
         | 
| 57 | 
             
                out = Image.fromarray(numpy_array)
         | 
| 58 | 
             
                out.save(output_path, compress_level=1, format="png")
         | 
| 59 | 
            +
                return output_path
         | 
| 60 |  | 
| 61 |  | 
| 62 | 
             
            def image_path_to_array(image_path: str) -> np.ndarray:
         | 
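
save_image now returns output_path, so per-frame saves can be chained straight into path bookkeeping instead of tracking the path separately. A brief usage sketch (the zeroed frame is a placeholder, and the target directory is assumed to exist, e.g. via init_dirs()):

```python
# Usage sketch for the new save_image return value; the frame is a placeholder.
import numpy as np

from modules.utils.image_helper import save_image
from modules.utils.paths import (get_auto_incremental_file_path,
                                 TEMP_VIDEO_OUT_FRAMES_DIR)

frame = np.zeros((256, 256, 3), dtype=np.uint8)  # placeholder frame
out_path = save_image(frame, get_auto_incremental_file_path(TEMP_VIDEO_OUT_FRAMES_DIR, "png"))
print(out_path)  # e.g. .../outputs/temp/video_frames/out/00000.png
```
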
    	
        modules/utils/paths.py
    CHANGED
    
    | @@ -6,7 +6,10 @@ PROJECT_ROOT_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), ".." | |
| 6 | 
             
            MODELS_DIR = os.path.join(PROJECT_ROOT_DIR, "models")
         | 
| 7 | 
             
            MODELS_ANIMAL_DIR = os.path.join(MODELS_DIR, "animal")
         | 
| 8 | 
             
            OUTPUTS_DIR = os.path.join(PROJECT_ROOT_DIR, "outputs")
         | 
|  | |
| 9 | 
             
            TEMP_DIR = os.path.join(OUTPUTS_DIR, "temp")
         | 
|  | |
|  | |
| 10 | 
             
            EXP_OUTPUT_DIR = os.path.join(OUTPUTS_DIR, "exp_data")
         | 
| 11 | 
             
            MODEL_CONFIG = os.path.join(PROJECT_ROOT_DIR, "modules", "config", "models.yaml")
         | 
| 12 | 
             
            MODEL_PATHS = {
         | 
| @@ -31,7 +34,7 @@ I18N_YAML_PATH = os.path.join(PROJECT_ROOT_DIR, "i18n", "translation.yaml") | |
| 31 |  | 
| 32 |  | 
| 33 | 
             
            def get_auto_incremental_file_path(dir_path: str, extension: str, prefix: str = ""):
         | 
| 34 | 
            -
                counter =  | 
| 35 | 
             
                while True:
         | 
| 36 | 
             
                    if prefix:
         | 
| 37 | 
             
                        filename = f"{prefix}_{counter:05d}.{extension}"
         | 
| @@ -39,6 +42,7 @@ def get_auto_incremental_file_path(dir_path: str, extension: str, prefix: str = | |
| 39 | 
             
                        filename = f"{counter:05d}.{extension}"
         | 
| 40 | 
             
                    full_path = os.path.join(dir_path, filename)
         | 
| 41 | 
             
                    if not os.path.exists(full_path):
         | 
|  | |
| 42 | 
             
                        return full_path
         | 
| 43 | 
             
                    counter += 1
         | 
| 44 |  | 
| @@ -50,7 +54,10 @@ def init_dirs(): | |
| 50 | 
             
                    MODELS_ANIMAL_DIR,
         | 
| 51 | 
             
                    OUTPUTS_DIR,
         | 
| 52 | 
             
                    EXP_OUTPUT_DIR,
         | 
| 53 | 
            -
                    TEMP_DIR
         | 
|  | |
|  | |
|  | |
| 54 | 
             
                ]:
         | 
| 55 | 
             
                    os.makedirs(dir_path, exist_ok=True)
         | 
| 56 |  | 
|  | |
| 6 | 
             
            MODELS_DIR = os.path.join(PROJECT_ROOT_DIR, "models")
         | 
| 7 | 
             
            MODELS_ANIMAL_DIR = os.path.join(MODELS_DIR, "animal")
         | 
| 8 | 
             
            OUTPUTS_DIR = os.path.join(PROJECT_ROOT_DIR, "outputs")
         | 
| 9 | 
            +
            OUTPUTS_VIDEOS_DIR = os.path.join(OUTPUTS_DIR, "videos")
         | 
| 10 | 
             
            TEMP_DIR = os.path.join(OUTPUTS_DIR, "temp")
         | 
| 11 | 
            +
            TEMP_VIDEO_FRAMES_DIR = os.path.join(TEMP_DIR, "video_frames")
         | 
| 12 | 
            +
            TEMP_VIDEO_OUT_FRAMES_DIR = os.path.join(TEMP_VIDEO_FRAMES_DIR, "out")
         | 
| 13 | 
             
            EXP_OUTPUT_DIR = os.path.join(OUTPUTS_DIR, "exp_data")
         | 
| 14 | 
             
            MODEL_CONFIG = os.path.join(PROJECT_ROOT_DIR, "modules", "config", "models.yaml")
         | 
| 15 | 
             
            MODEL_PATHS = {
         | 
|  | |
| 34 |  | 
| 35 |  | 
| 36 | 
             
            def get_auto_incremental_file_path(dir_path: str, extension: str, prefix: str = ""):
         | 
| 37 | 
            +
                counter = len(os.listdir(dir_path))
         | 
| 38 | 
             
                while True:
         | 
| 39 | 
             
                    if prefix:
         | 
| 40 | 
             
                        filename = f"{prefix}_{counter:05d}.{extension}"
         | 
|  | |
| 42 | 
             
                        filename = f"{counter:05d}.{extension}"
         | 
| 43 | 
             
                    full_path = os.path.join(dir_path, filename)
         | 
| 44 | 
             
                    if not os.path.exists(full_path):
         | 
| 45 | 
            +
                        full_path = os.path.normpath(full_path)
         | 
| 46 | 
             
                        return full_path
         | 
| 47 | 
             
                    counter += 1
         | 
| 48 |  | 
|  | |
| 54 | 
             
                    MODELS_ANIMAL_DIR,
         | 
| 55 | 
             
                    OUTPUTS_DIR,
         | 
| 56 | 
             
                    EXP_OUTPUT_DIR,
         | 
| 57 | 
            +
                    TEMP_DIR,
         | 
| 58 | 
            +
                    TEMP_VIDEO_FRAMES_DIR,
         | 
| 59 | 
            +
                    TEMP_VIDEO_OUT_FRAMES_DIR,
         | 
| 60 | 
            +
                    OUTPUTS_VIDEOS_DIR
         | 
| 61 | 
             
                ]:
         | 
| 62 | 
             
                    os.makedirs(dir_path, exist_ok=True)
         | 
| 63 |  | 
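
Seeding the counter with len(os.listdir(dir_path)) makes picking the next frame index roughly O(1) per save instead of probing from a fixed start, which matters when create_video writes thousands of frames; the while loop still guarantees uniqueness if the guess collides. One caveat: unrelated files in the directory inflate the seed and leave gaps in the numbering, which is harmless for uniqueness. A quick behavioral sketch in a throwaway directory:

```python
# Behavioral sketch of the seeded counter, using a throwaway directory.
import os
import tempfile

from modules.utils.paths import get_auto_incremental_file_path

with tempfile.TemporaryDirectory() as d:
    for name in ("00000.png", "00001.png"):
        open(os.path.join(d, name), "w").close()
    # The counter starts at len(listdir) == 2, so 00002.png is returned
    # without re-probing 00000/00001.
    print(get_auto_incremental_file_path(d, "png"))
```
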
    	
        modules/utils/video_helper.py
    ADDED
    
    | @@ -0,0 +1,315 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import subprocess
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            from typing import List, Optional, Union
         | 
| 4 | 
            +
            import cv2
         | 
| 5 | 
            +
            from PIL import Image
         | 
| 6 | 
            +
            import numpy as np
         | 
| 7 | 
            +
            from dataclasses import dataclass
         | 
| 8 | 
            +
            import re
         | 
| 9 | 
            +
            from pathlib import Path
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            from modules.utils.constants import SOUND_FILE_EXT, VIDEO_FILE_EXT, IMAGE_FILE_EXT
         | 
| 12 | 
            +
            from modules.utils.paths import (TEMP_VIDEO_FRAMES_DIR, TEMP_VIDEO_OUT_FRAMES_DIR, OUTPUTS_VIDEOS_DIR,
         | 
| 13 | 
            +
                                             get_auto_incremental_file_path)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            @dataclass
         | 
| 17 | 
            +
            class VideoInfo:
         | 
| 18 | 
            +
                num_frames: Optional[int] = None
         | 
| 19 | 
            +
                frame_rate: Optional[int] = None
         | 
| 20 | 
            +
                duration: Optional[float] = None
         | 
| 21 | 
            +
                has_sound: Optional[bool] = None
         | 
| 22 | 
            +
                codec: Optional[str] = None
         | 
| 23 | 
            +
             | 
| 24 | 
            +
             | 
| 25 | 
            +
            def extract_frames(
         | 
| 26 | 
            +
                vid_input: str,
         | 
| 27 | 
            +
                output_temp_dir: str = TEMP_VIDEO_FRAMES_DIR,
         | 
| 28 | 
            +
                start_number: int = 0,
         | 
| 29 | 
            +
                clean: bool = True
         | 
| 30 | 
            +
            ):
         | 
| 31 | 
            +
                """
         | 
| 32 | 
            +
                Extract frames as JPG files and save them into output_temp_dir. Requires FFmpeg to be installed.
         | 
| 33 | 
            +
                """
         | 
| 34 | 
            +
                if clean:
         | 
| 35 | 
            +
                    clean_temp_dir(temp_dir=output_temp_dir)
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                os.makedirs(output_temp_dir, exist_ok=True)
         | 
| 38 | 
            +
                output_path = os.path.join(output_temp_dir, "%05d.jpg")
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                command = [
         | 
| 41 | 
            +
                    'ffmpeg',
         | 
| 42 | 
            +
                    '-loglevel', 'error',
         | 
| 43 | 
            +
                    '-y',  # Enable overwriting
         | 
| 44 | 
            +
                    '-i', vid_input,
         | 
| 45 | 
            +
                    '-qscale:v', '2',
         | 
| 46 | 
            +
                    '-vf', 'scale=iw:ih',
         | 
| 47 | 
            +
                    '-start_number', str(start_number),
         | 
| 48 | 
            +
                    f'{output_path}'
         | 
| 49 | 
            +
                ]
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                try:
         | 
| 52 | 
            +
                    subprocess.run(command, check=True)
         | 
| 53 | 
            +
                    print(f"Video frames extracted to \"{os.path.normpath(output_temp_dir)}\"")
         | 
| 54 | 
            +
                except subprocess.CalledProcessError as e:
         | 
| 55 | 
            +
                    print("Error occurred while extracting frames from the video")
         | 
| 56 | 
            +
                    raise RuntimeError(f"An error occurred: {str(e)}")
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                return get_frames_from_dir(output_temp_dir)
         | 
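|  | 
            +
             | 
|  | 
            +
            # Usage sketch (input path illustrative): extract_frames("driving.mp4") leaves
         | 
|  | 
            +
            # 00000.jpg, 00001.jpg, ... in TEMP_VIDEO_FRAMES_DIR and returns their sorted paths.
         | 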
| 59 | 
            +
             | 
| 60 | 
            +
             | 
| 61 | 
            +
            def extract_sound(
         | 
| 62 | 
            +
                vid_input: str,
         | 
| 63 | 
            +
                output_temp_dir: str = TEMP_VIDEO_FRAMES_DIR,
         | 
| 64 | 
            +
            ):
         | 
| 65 | 
            +
                """
         | 
| 66 | 
            +
                Extract audio from a video file and save it as a separate sound file. This needs FFmpeg installed.
         | 
| 67 | 
            +
                """
         | 
| 68 | 
            +
                if Path(vid_input).suffix == ".gif":
         | 
| 69 | 
            +
                    print("Sound extracting process has passed because gif has no sound")
         | 
| 70 | 
            +
                    return None
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                os.makedirs(output_temp_dir, exist_ok=True)
         | 
| 73 | 
            +
                output_path = os.path.join(output_temp_dir, "sound.mp3")
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                command = [
         | 
| 76 | 
            +
                    'ffmpeg',
         | 
| 77 | 
            +
                    '-loglevel', 'error',
         | 
| 78 | 
            +
                    '-y',  # Enable overwriting
         | 
| 79 | 
            +
                    '-i', vid_input,
         | 
| 80 | 
            +
                    '-vn',
         | 
| 81 | 
            +
                    output_path
         | 
| 82 | 
            +
                ]
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                try:
         | 
| 85 | 
            +
                    subprocess.run(command, check=True)
         | 
| 86 | 
            +
                except subprocess.CalledProcessError as e:
         | 
| 87 | 
            +
                    print(f"Warning: Failed to extract sound from the video: {e}")
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                return output_path
         | 
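|  | 
            +
             | 
|  | 
            +
            # Usage sketch (path illustrative): extract_sound("talking.mp4") returns
         | 
|  | 
            +
            # os.path.join(TEMP_VIDEO_FRAMES_DIR, "sound.mp3") on success, or None for .gif inputs.
         | 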
| 90 | 
            +
             | 
| 91 | 
            +
             | 
| 92 | 
            +
            def get_video_info(vid_input: str) -> VideoInfo:
         | 
| 93 | 
            +
                """
         | 
| 94 | 
            +
                Extract video information using ffmpeg.
         | 
| 95 | 
            +
                """
         | 
| 96 | 
            +
                command = [
         | 
| 97 | 
            +
                    'ffmpeg',
         | 
| 98 | 
            +
                    '-i', vid_input,
         | 
| 99 | 
            +
                    '-map', '0:v:0',
         | 
| 100 | 
            +
                    '-c', 'copy',
         | 
| 101 | 
            +
                    '-f', 'null',
         | 
| 102 | 
            +
                    '-'
         | 
| 103 | 
            +
                ]
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                try:
         | 
| 106 | 
            +
                    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
         | 
| 107 | 
            +
                                            encoding='utf-8', errors='replace', check=True)
         | 
| 108 | 
            +
                    output = result.stderr
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                    num_frames = None
         | 
| 111 | 
            +
                    frame_rate = None
         | 
| 112 | 
            +
                    duration = None
         | 
| 113 | 
            +
                    has_sound = False
         | 
| 114 | 
            +
                    codec = None
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                    for line in output.splitlines():
         | 
| 117 | 
            +
                        if 'Stream #0:0' in line and 'Video:' in line:
         | 
| 118 | 
            +
                            fps_match = re.search(r'(\d+(?:\.\d+)?) fps', line)
         | 
| 119 | 
            +
                            if fps_match:
         | 
| 120 | 
            +
                                frame_rate = float(fps_match.group(1))
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                            codec_match = re.search(r'Video: (\w+)', line)
         | 
| 123 | 
            +
                            if codec_match:
         | 
| 124 | 
            +
                                codec = codec_match.group(1)
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                        elif 'Duration:' in line:
         | 
| 127 | 
            +
                            duration_match = re.search(r'Duration: (\d{2}):(\d{2}):(\d{2}\.\d{2})', line)
         | 
| 128 | 
            +
                            if duration_match:
         | 
| 129 | 
            +
                                h, m, s = map(float, duration_match.groups())
         | 
| 130 | 
            +
                                duration = h * 3600 + m * 60 + s
         | 
| 131 | 
            +
             | 
| 132 | 
            +
                        elif 'Stream' in line and 'Audio:' in line:
         | 
| 133 | 
            +
                            has_sound = True
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                    if frame_rate and duration:
         | 
| 136 | 
            +
                        num_frames = int(frame_rate * duration)
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                    print(f"Video info - frame_rate: {frame_rate}, duration: {duration}, total frames: {num_frames}")
         | 
| 139 | 
            +
                    return VideoInfo(
         | 
| 140 | 
            +
                        num_frames=num_frames,
         | 
| 141 | 
            +
                        frame_rate=frame_rate,
         | 
| 142 | 
            +
                        duration=duration,
         | 
| 143 | 
            +
                        has_sound=has_sound,
         | 
| 144 | 
            +
                        codec=codec
         | 
| 145 | 
            +
                    )
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                except subprocess.CalledProcessError as e:
         | 
| 148 | 
            +
                    print("Error occurred while getting info from the video")
         | 
| 149 | 
            +
                    return VideoInfo()
         | 
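|  | 
            +
             | 
|  | 
            +
            # Usage sketch: get_video_info("driving.mp4") might yield, e.g.,
         | 
|  | 
            +
            # VideoInfo(num_frames=150, frame_rate=25.0, duration=6.0, has_sound=True, codec="h264").
         | 
|  | 
            +
            # The fields are parsed from ffmpeg's stderr banner, so treat them as best-effort.
         | 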
| 150 | 
            +
             | 
| 151 | 
            +
             | 
| 152 | 
            +
            def create_video_from_frames(
         | 
| 153 | 
            +
                frames_dir: str,
         | 
| 154 | 
            +
                frame_rate: Optional[float] = None,
         | 
| 155 | 
            +
                sound_path: Optional[str] = None,
         | 
| 156 | 
            +
                output_dir: Optional[str] = None,
         | 
| 157 | 
            +
                output_mime_type: Optional[str] = None,
         | 
| 158 | 
            +
            ):
         | 
| 159 | 
            +
                """
         | 
| 160 | 
            +
                Create a video from the frames in frames_dir and save it under output_dir. This needs FFmpeg installed.
         | 
| 161 | 
            +
                """
         | 
| 162 | 
            +
                if not os.path.exists(frames_dir):
         | 
| 163 | 
            +
                    raise "frames_dir does not exist"
         | 
| 164 | 
            +
                frames_dir = os.path.normpath(frames_dir)
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                if output_dir is None:
         | 
| 167 | 
            +
                    output_dir = OUTPUTS_VIDEOS_DIR
         | 
| 168 | 
            +
                os.makedirs(output_dir, exist_ok=True)
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                frame_img_mime_type = ".png"
         | 
| 171 | 
            +
                pix_format = "yuv420p"
         | 
| 172 | 
            +
                vid_codec, audio_codec = "libx264", "aac"
         | 
| 173 | 
            +
             | 
| 174 | 
            +
                if output_mime_type is None:
         | 
| 175 | 
            +
                    output_mime_type = ".mp4"
         | 
| 176 | 
            +
             | 
| 177 | 
            +
                output_mime_type = output_mime_type.lower()
         | 
| 178 | 
            +
                if output_mime_type == ".mov":
         | 
| 179 | 
            +
                    pix_format = "yuva444p10le"
         | 
| 180 | 
            +
                    vid_codec, audio_codec = "prores_ks", "aac"
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                elif output_mime_type == ".webm":
         | 
| 183 | 
            +
                    pix_format = "yuva420p"
         | 
| 184 | 
            +
                    vid_codec, audio_codec = "libvpx-vp9", "libvorbis"
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                elif output_mime_type == ".gif":
         | 
| 187 | 
            +
                    pix_format = None
         | 
| 188 | 
            +
                    vid_codec, audio_codec = "gif", None
         | 
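|  | 
            +
                # The .mov and .webm branches pick alpha-capable pixel formats (prores_ks, VP9),
         | 
|  | 
            +
                # presumably so transparency in the rendered frames survives encoding.
         | 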
| 189 | 
            +
             | 
| 190 | 
            +
                output_path = get_auto_incremental_file_path(output_dir, output_mime_type.replace(".", ""))
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                if sound_path is None:
         | 
| 193 | 
            +
                    temp_sound = os.path.normpath(os.path.join(TEMP_VIDEO_FRAMES_DIR, "sound.mp3"))
         | 
| 194 | 
            +
                    if os.path.exists(temp_sound):
         | 
| 195 | 
            +
                        sound_path = temp_sound
         | 
| 196 | 
            +
             | 
| 197 | 
            +
                if frame_rate is None:
         | 
| 198 | 
            +
                    frame_rate = 25  # Default frame rate for ffmpeg
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                command = [
         | 
| 201 | 
            +
                    'ffmpeg',
         | 
| 202 | 
            +
                    '-loglevel', 'error',
         | 
| 203 | 
            +
                    '-y',
         | 
| 204 | 
            +
                    '-framerate', str(frame_rate),
         | 
| 205 | 
            +
                    '-i', os.path.join(frames_dir, f"%05d{frame_img_mime_type}"),
         | 
| 206 | 
            +
                ]
         | 
| 207 | 
            +
             | 
| 208 | 
            +
                use_sound = output_mime_type != ".gif" and sound_path is not None
         | 
| 209 | 
            +
                if use_sound:
         | 
| 210 | 
            +
                    # ffmpeg treats options placed before an -i as input options, so the audio
         | 
| 211 | 
            +
                    # input must be declared here, ahead of the output options that follow
         | 
| 212 | 
            +
                    command += ['-i', sound_path]
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                command += ['-c:v', vid_codec]
         | 
| 215 | 
            +
                if pix_format:
         | 
| 216 | 
            +
                    # codecs such as libx264 require even dimensions, so crop to the nearest even size
         | 
| 217 | 
            +
                    command += ['-vf', 'crop=trunc(iw/2)*2:trunc(ih/2)*2']
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                if output_mime_type == ".gif":
         | 
| 220 | 
            +
                    command += [
         | 
| 221 | 
            +
                        "-filter_complex", "[0:v] palettegen=reserve_transparent=on [p]; [0:v][p] paletteuse",
         | 
| 222 | 
            +
                        "-loop", "0"
         | 
| 223 | 
            +
                    ]
         | 
| 224 | 
            +
                else:
         | 
| 225 | 
            +
                    command += ['-pix_fmt', pix_format]
         | 
| 226 | 
            +
             | 
| 227 | 
            +
                if use_sound:
         | 
| 228 | 
            +
                    command += ['-c:a', audio_codec, '-strict', 'experimental', '-b:a', '192k', '-shortest']
         | 
| 229 | 
            +
                command += [output_path]  # the output file must come last in an ffmpeg command
         | 
| 230 | 
            +
                try:
         | 
| 231 | 
            +
                    subprocess.run(command, check=True)
         | 
| 232 | 
            +
                except subprocess.CalledProcessError as e:
         | 
| 233 | 
            +
                    print(f"Error occurred while creating video from frames")
         | 
| 234 | 
            +
                    raise
         | 
| 235 | 
            +
                return output_path
         | 
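|  | 
            +
             | 
|  | 
            +
            # Usage sketch (values illustrative): after rendering frames into TEMP_VIDEO_OUT_FRAMES_DIR,
         | 
|  | 
            +
            # create_video_from_frames(TEMP_VIDEO_OUT_FRAMES_DIR, frame_rate=info.frame_rate)
         | 
|  | 
            +
            # returns an auto-numbered path such as OUTPUTS_VIDEOS_DIR/00000.mp4.
         | 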
| 236 | 
            +
             | 
| 237 | 
            +
             | 
| 238 | 
            +
            def create_video_from_numpy_list(frame_list: List[np.ndarray],
         | 
| 239 | 
            +
                                             frame_rate: Optional[float] = None,
         | 
| 240 | 
            +
                                             sound_path: Optional[str] = None,
         | 
| 241 | 
            +
                                             output_dir: Optional[str] = None
         | 
| 242 | 
            +
                                             ):
         | 
| 243 | 
            +
                if output_dir is None:
         | 
| 244 | 
            +
                    output_dir = OUTPUTS_VIDEOS_DIR
         | 
| 245 | 
            +
                os.makedirs(output_dir, exist_ok=True)
         | 
| 246 | 
            +
                output_path = get_auto_incremental_file_path(output_dir, "mp4")
         | 
| 247 | 
            +
             | 
| 248 | 
            +
                if frame_rate is None:
         | 
| 249 | 
            +
                    frame_rate = 25
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                if sound_path is None:
         | 
| 252 | 
            +
                    temp_sound = os.path.join(TEMP_VIDEO_FRAMES_DIR, "sound.mp3")
         | 
| 253 | 
            +
                    if os.path.exists(temp_sound):
         | 
| 254 | 
            +
                        sound_path = temp_sound
         | 
| 255 | 
            +
             | 
| 256 | 
            +
                height, width, layers = frame_list[0].shape
         | 
| 257 | 
            +
                fourcc = cv2.VideoWriter.fourcc(*'mp4v')
         | 
| 258 | 
            +
             | 
| 259 | 
            +
                out = cv2.VideoWriter(output_path, fourcc, frame_rate, (width, height))
         | 
| 260 | 
            +
             | 
| 261 | 
            +
                for frame in frame_list:
         | 
| 262 | 
            +
                    out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
         | 
| 263 | 
            +
             | 
| 264 | 
            +
                out.release()
         | 
|  | 
            +
             | 
|  | 
            +
                # Note: cv2.VideoWriter cannot mux audio, so sound_path is effectively unused here
         | 
|  | 
            +
                return output_path
         | 
| 265 | 
            +
             | 
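|  | 
            +
            # Usage sketch: create_video_from_numpy_list(out_frame_list, frame_rate=info.frame_rate)
         | 
|  | 
            +
            # writes an .mp4 via OpenCV; unlike the ffmpeg path above, it cannot attach audio.
         | 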
| 266 | 
            +
             | 
| 267 | 
            +
            def get_frames_from_dir(vid_dir: str,
         | 
| 268 | 
            +
                                    available_extensions: Optional[Union[List, str]] = None,
         | 
| 269 | 
            +
                                    as_numpy: bool = False) -> List:
         | 
| 270 | 
            +
                """Get image file paths list from the dir"""
         | 
| 271 | 
            +
                if available_extensions is None:
         | 
| 272 | 
            +
                    available_extensions = [".jpg", ".jpeg", ".JPG", ".JPEG"]
         | 
| 273 | 
            +
             | 
| 274 | 
            +
                if isinstance(available_extensions, str):
         | 
| 275 | 
            +
                    available_extensions = [available_extensions]
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                frame_names = [
         | 
| 278 | 
            +
                    p for p in os.listdir(vid_dir)
         | 
| 279 | 
            +
                    if os.path.splitext(p)[-1] in available_extensions
         | 
| 280 | 
            +
                ]
         | 
| 281 | 
            +
                if not frame_names:
         | 
| 282 | 
            +
                    return []
         | 
| 283 | 
            +
                frame_names.sort(key=lambda x: int(os.path.splitext(x)[0]))
         | 
| 284 | 
            +
             | 
| 285 | 
            +
                frames = [os.path.join(vid_dir, name) for name in frame_names]
         | 
| 286 | 
            +
                if as_numpy:
         | 
| 287 | 
            +
                    frames = [np.array(Image.open(frame)) for frame in frames]
         | 
| 288 | 
            +
             | 
| 289 | 
            +
                return frames
         | 
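|  | 
            +
             | 
|  | 
            +
            # Usage sketch: get_frames_from_dir(TEMP_VIDEO_FRAMES_DIR, as_numpy=True) returns the
         | 
|  | 
            +
            # extracted frames as RGB numpy arrays, sorted by their numeric file names.
         | 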
| 290 | 
            +
             | 
| 291 | 
            +
             | 
| 292 | 
            +
            def clean_temp_dir(temp_dir: Optional[str] = None):
         | 
| 293 | 
            +
                """Removes media files from the video frames directory."""
         | 
| 294 | 
            +
                if temp_dir is None:
         | 
| 295 | 
            +
                    temp_dir = TEMP_VIDEO_FRAMES_DIR
         | 
| 296 | 
            +
                    temp_out_dir = TEMP_VIDEO_OUT_FRAMES_DIR
         | 
| 297 | 
            +
                else:
         | 
| 298 | 
            +
                    temp_out_dir = os.path.join(temp_dir, "out")
         | 
| 299 | 
            +
             | 
| 300 | 
            +
                clean_files_with_extension(temp_dir, SOUND_FILE_EXT)
         | 
| 301 | 
            +
                clean_files_with_extension(temp_dir, IMAGE_FILE_EXT)
         | 
| 302 | 
            +
             | 
| 303 | 
            +
                if os.path.exists(temp_out_dir):
         | 
| 304 | 
            +
                    clean_files_with_extension(temp_out_dir, IMAGE_FILE_EXT)
         | 
| 305 | 
            +
             | 
| 306 | 
            +
             | 
| 307 | 
            +
            def clean_files_with_extension(dir_path: str, extensions: List):
         | 
| 308 | 
            +
                """Remove files with the given extensions from the directory."""
         | 
| 309 | 
            +
                if not os.path.isdir(dir_path):
         | 
|  | 
            +
                    return  # nothing to clean on a fresh run
         | 
|  | 
            +
                for filename in os.listdir(dir_path):
         | 
| 310 | 
            +
                    if filename.lower().endswith(tuple(extensions)):
         | 
| 311 | 
            +
                        file_path = os.path.join(dir_path, filename)
         | 
| 312 | 
            +
                        try:
         | 
| 313 | 
            +
                            os.remove(file_path)
         | 
| 314 | 
            +
                        except Exception as e:
         | 
| 315 | 
            +
                            print("Error while removing image files")
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -13,4 +13,10 @@ ultralytics | |
| 13 | 
             
            tyro
         | 
| 14 | 
             
            dill
         | 
| 15 | 
             
            gradio
         | 
| 16 | 
            -
            gradio-i18n
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 13 | 
             
            tyro
         | 
| 14 | 
             
            dill
         | 
| 15 | 
             
            gradio
         | 
| 16 | 
            +
            gradio-i18n
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            # Tests
         | 
| 20 | 
            +
            # pytest
         | 
| 21 | 
            +
            # scikit-image
         | 
| 22 | 
            +
            # moviepy
         | 
    	
        tests/test_config.py
    CHANGED
    
    | @@ -4,13 +4,18 @@ import os | |
| 4 | 
             
            import torch
         | 
| 5 | 
             
            import functools
         | 
| 6 | 
             
            import numpy as np
         | 
|  | |
|  | |
|  | |
| 7 |  | 
| 8 | 
             
            from modules.utils.paths import *
         | 
| 9 |  | 
| 10 |  | 
| 11 | 
             
            TEST_IMAGE_URL = "https://github.com/microsoft/onnxjs-demo/raw/master/src/assets/EmotionSampleImages/sad_baby.jpg"
         | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
|  | |
|  | |
| 14 | 
             
            TEST_EXPRESSION_AAA = 100
         | 
| 15 |  | 
| 16 |  | 
| @@ -40,6 +45,62 @@ def are_images_different(image1_path: str, image2_path: str): | |
| 40 | 
             
                    return True
         | 
| 41 |  | 
| 42 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 43 | 
             
            @functools.lru_cache
         | 
| 44 | 
             
            def is_cuda_available():
         | 
| 45 | 
             
                return torch.cuda.is_available()
         | 
|  | |
| 4 | 
             
            import torch
         | 
| 5 | 
             
            import functools
         | 
| 6 | 
             
            import numpy as np
         | 
| 7 | 
            +
            import cv2
         | 
| 8 | 
            +
            from skimage.metrics import structural_similarity as compare_ssim
         | 
| 9 | 
            +
            from moviepy.editor import VideoFileClip
         | 
| 10 |  | 
| 11 | 
             
            from modules.utils.paths import *
         | 
| 12 |  | 
| 13 |  | 
| 14 | 
             
            TEST_IMAGE_URL = "https://github.com/microsoft/onnxjs-demo/raw/master/src/assets/EmotionSampleImages/sad_baby.jpg"
         | 
| 15 | 
            +
            TEST_VIDEO_URL = "https://github.com/jhj0517/sample-medias/raw/master/vids/human-face/expression01_short.mp4"
         | 
| 16 | 
            +
            TEST_IMAGE_PATH = os.path.normpath(os.path.join(PROJECT_ROOT_DIR, "tests", "test.png"))
         | 
| 17 | 
            +
            TEST_VIDEO_PATH = os.path.normpath(os.path.join(PROJECT_ROOT_DIR, "tests", "test_expression.mp4"))
         | 
| 18 | 
            +
            TEST_EXPRESSION_OUTPUT_PATH = os.path.normpath(os.path.join(PROJECT_ROOT_DIR, "tests", "edited_expression.png"))
         | 
| 19 | 
             
            TEST_EXPRESSION_AAA = 100
         | 
| 20 |  | 
| 21 |  | 
|  | |
| 45 | 
             
                    return True
         | 
| 46 |  | 
| 47 |  | 
| 48 | 
            +
            def are_videos_different(video1_path: str, video2_path: str):
         | 
| 49 | 
            +
                cap1 = cv2.VideoCapture(video1_path)
         | 
| 50 | 
            +
                cap2 = cv2.VideoCapture(video2_path)
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                while True:
         | 
| 53 | 
            +
                    ret1, frame1 = cap1.read()
         | 
| 54 | 
            +
                    ret2, frame2 = cap2.read()
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    if not ret1 or not ret2:
         | 
| 57 | 
            +
                        if ret1 != ret2:
         | 
| 58 | 
            +
                            return True
         | 
| 59 | 
            +
                        break
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                    if frame1.shape != frame2.shape:
         | 
| 62 | 
            +
                        frame1 = cv2.resize(frame1, (frame2.shape[1], frame2.shape[0]))
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                    score, _ = compare_ssim(frame1, frame2, full=True, channel_axis=2)  # channel_axis supersedes the deprecated multichannel flag
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    if score < 0.99:
         | 
| 67 | 
            +
                        return True
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                cap1.release()
         | 
| 70 | 
            +
                cap2.release()
         | 
| 71 | 
            +
                return False
         | 
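|  | 
            +
             | 
|  | 
            +
            # An SSIM score of 1.0 means identical frames; the 0.99 threshold tolerates minor
         | 
|  | 
            +
            # codec noise while still flagging genuinely different outputs.
         | 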
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 | 
            +
            def validate_video(video_path):
         | 
| 75 | 
            +
                cap = cv2.VideoCapture(video_path)
         | 
| 76 | 
            +
                if not cap.isOpened():
         | 
| 77 | 
            +
                    print("Could not open video file.")
         | 
| 78 | 
            +
                    return False
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                frame_count = 0
         | 
| 81 | 
            +
                while True:
         | 
| 82 | 
            +
                    ret, frame = cap.read()
         | 
| 83 | 
            +
                    if not ret:
         | 
| 84 | 
            +
                        break
         | 
| 85 | 
            +
                    frame_count += 1
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                cap.release()
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                if frame_count == 0:
         | 
| 90 | 
            +
                    print("No frames found in video file.")
         | 
| 91 | 
            +
                    return False
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                return True
         | 
| 94 | 
            +
             | 
| 95 | 
            +
             | 
| 96 | 
            +
            def has_sound(video_path: str):
         | 
| 97 | 
            +
                try:
         | 
| 98 | 
            +
                    video = VideoFileClip(video_path)
         | 
| 99 | 
            +
                    sound_exists = video.audio is not None
         | 
|  | 
            +
                    video.close()  # release the clip's file handle
         | 
|  | 
            +
                    return sound_exists
         | 
| 100 | 
            +
                except Exception as e:
         | 
| 101 | 
            +
                    return False
         | 
| 102 | 
            +
             | 
| 103 | 
            +
             | 
| 104 | 
             
            @functools.lru_cache
         | 
| 105 | 
             
            def is_cuda_available():
         | 
| 106 | 
             
                return torch.cuda.is_available()
         | 
    	
        tests/test_video_creation.py
    ADDED
    
    | @@ -0,0 +1,39 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import pytest
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            from test_config import *
         | 
| 5 | 
            +
            from modules.live_portrait.live_portrait_inferencer import LivePortraitInferencer
         | 
| 6 | 
            +
            from modules.utils.image_helper import save_image
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            @pytest.mark.parametrize(
         | 
| 10 | 
            +
                "input_image,expression_video",
         | 
| 11 | 
            +
                [
         | 
| 12 | 
            +
                    (TEST_IMAGE_PATH, TEST_VIDEO_PATH),
         | 
| 13 | 
            +
                ]
         | 
| 14 | 
            +
            )
         | 
| 15 | 
            +
            def test_video_creation(
         | 
| 16 | 
            +
                input_image: str,
         | 
| 17 | 
            +
                expression_video: str
         | 
| 18 | 
            +
            ):
         | 
| 19 | 
            +
                if not os.path.exists(TEST_IMAGE_PATH):
         | 
| 20 | 
            +
                    download_image(
         | 
| 21 | 
            +
                        TEST_IMAGE_URL,
         | 
| 22 | 
            +
                        TEST_IMAGE_PATH
         | 
| 23 | 
            +
                    )
         | 
| 24 | 
            +
                if not os.path.exists(TEST_VIDEO_PATH):
         | 
| 25 | 
            +
                    download_image(
         | 
| 26 | 
            +
                        TEST_VIDEO_URL,
         | 
| 27 | 
            +
                        TEST_VIDEO_PATH
         | 
| 28 | 
            +
                    )
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                inferencer = LivePortraitInferencer()
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                output_video_path = inferencer.create_video(
         | 
| 33 | 
            +
                    driving_vid_path=expression_video,
         | 
| 34 | 
            +
                    src_image=input_image,
         | 
| 35 | 
            +
                )
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                assert os.path.exists(output_video_path)
         | 
| 38 | 
            +
                assert validate_video(output_video_path)
         | 
| 39 | 
            +
                assert has_sound(output_video_path)
         |