Upload 18 files
- .Dockerignore +2 -0
- .gitignore +7 -0
- LICENSE +21 -0
- app.py +171 -0
- images/sunglasses_1.png +0 -0
- images/sunglasses_2.png +0 -0
- images/sunglasses_3.jpg +0 -0
- images/sunglasses_4.png +0 -0
- images/sunglasses_5.jpg +0 -0
- images/sunglasses_6.png +0 -0
- landmark_detection.py +177 -0
- mediapipe_facedetection.py +0 -0
- mtcnn_facedetection.py +18 -0
- network/__init__.py +1 -0
- network/models/__init__.py +2 -0
- network/models/facexformer.py +392 -0
- network/models/transformer.py +271 -0
- requirements.txt +14 -0
.Dockerignore
ADDED
@@ -0,0 +1,2 @@
facexformer
ckpts
.gitignore
ADDED
@@ -0,0 +1,7 @@
# Exclude model's weights
*.pt
.venv/
.vscode/
**/__pycache__
data
saves
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Kartik Narayan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
app.py
ADDED
@@ -0,0 +1,171 @@
# import streamlit as st
# from streamlit_webrtc import webrtc_streamer
# import torch
# torch.classes.__path__ = []

import sys
import os
from glob import glob
import gradio as gr
from fastrtc import WebRTC
from fastrtc import VideoStreamHandler
from PIL import Image
import landmark_detection
import numpy as np
from time import time
import cv2
from mtcnn_facedetection import detect_faces
from selfie_filter import apply_sunglasses, process_video


radius = 2
filter_img = None


def do_facial_landmark_recognition(
    image: np.ndarray, face_boxes: list[landmark_detection.BoundingBox]
):
    faces = landmark_detection.get_faces(image, face_boxes)
    landmarks_batch = landmark_detection.get_landmarks(faces)

    for i, landmarks in enumerate(landmarks_batch):
        for landmark in landmarks:
            image = cv2.circle(image, landmark, radius, (255, 0, 0), -1)

    return image, landmarks_batch


def do_facial_landmark_recognition_with_mtcnn(image: np.ndarray):
    face_boxes = detect_faces(image)
    return do_facial_landmark_recognition(image, face_boxes)


def video_frame_callback_gradio(frame: np.ndarray):
    flipped = cv2.flip(frame, 1)

    flipped, landmarks_batch = do_facial_landmark_recognition_with_mtcnn(flipped)
    # Apply sunglasses filter
    image = apply_sunglasses(flipped, landmarks_batch, filter_img)

    return image  # , AdditionalOutputs(flipped, flipped)


css = """.my-group {max-width: 600px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""

image_extensions = [
    "*.jpg",
    "*.jpeg",
    "*.png",
    "*.gif",
    "*.bmp",
    "*.tiff",
    "*.webp",
]
all_image_files = []

for ext in image_extensions:
    pattern = os.path.join("images", "**", ext)  # '**' for recursive search
    image_files = glob(pattern, recursive=True)
    all_image_files.extend(image_files)
all_image_files.sort()


with gr.Blocks(css=css) as demo:
    with gr.Column(elem_classes=["my-column"]):
        gr.HTML(
            """
            <h1 style='text-align: center'>
            Live Filter with FaceXFormer
            </h1>
            """
        )
        with gr.Group(elem_classes=["my-group"]):
            selected_filter = gr.Dropdown(
                choices=all_image_files,
                label="Choose filter",
                value="images/sunglasses_1.png",
            )

            def change_filter(filter_path):
                global filter_img
                try:
                    filter_img = cv2.imread(filter_path, cv2.IMREAD_UNCHANGED)
                except Exception:
                    gr.Error("Error opening " + filter_path)

            change_filter(selected_filter.value)

            selected_filter.change(
                change_filter, inputs=[selected_filter], show_progress="full"
            )

        with gr.Group(elem_classes=["my-group"]):
            stream = WebRTC(label="Stream", rtc_configuration=None)
            stream.stream(
                fn=VideoStreamHandler(
                    video_frame_callback_gradio, fps=12, skip_frames=True
                ),
                inputs=[stream],
                outputs=[stream],
                time_limit=None,
            )

        with gr.Group(elem_classes=["my-group"]):
            with gr.Column(elem_classes=["my-column"]):
                gr.HTML(
                    """
                    <h1 style='text-align: center'>
                    Or just apply the filter to a video
                    </h1>
                    """
                )
                input_video = gr.Video(sources="upload", include_audio=False)
                output_video = gr.Video(interactive=False, include_audio=False)
                submit = gr.Button(variant="primary")
            with gr.Column(elem_classes=["my-column"]):
                submit.click(
                    lambda input_path: process_video(input_path, filter_img),
                    inputs=[input_video],
                    outputs=[output_video],
                    show_progress="full",
                )


def test(times=10):
    image = np.array(Image.open("tmp.jpg").resize((512, 512)))
    # faces = ai.get_faces(image)
    start = time()
    frame_times = [None] * times
    for i in range(times):
        before = time()
        do_facial_landmark_recognition_with_mtcnn(image)
        after = time()
        frame_times[i] = after - before
    end = time()

    print(f"Num Images: {times}")
    print(f"Total time: {end - start}")
    print(
        f"Max frametime: {max(frame_times)}, FPS: {1 / max(frame_times)}",
    )
    print(
        f"Min frametime: {min(frame_times)}, FPS: {1 / min(frame_times)}",
    )
    print(
        f"Avg frametime: {sum(frame_times) / len(frame_times)}, FPS: {1 / (sum(frame_times) / len(frame_times))}",
    )


if __name__ == "__main__":
    no_params = 0
    for name, i in landmark_detection.model.named_parameters(recurse=True):
        no_params += i.numel()
        print(name, i.numel())

    print(no_params)
    if "--test" in sys.argv:
        test()
        exit(0)
    else:
        demo.launch()
images/sunglasses_1.png
ADDED
images/sunglasses_2.png
ADDED
images/sunglasses_3.jpg
ADDED
images/sunglasses_4.png
ADDED
images/sunglasses_5.jpg
ADDED
images/sunglasses_6.png
ADDED
landmark_detection.py
ADDED
@@ -0,0 +1,177 @@
import torch
import torchvision
from torchvision.transforms import InterpolationMode
from network.models.facexformer import FaceXFormer
from dataclasses import dataclass
import numpy as np

# import mediapipe as mp
# import cv2


# device = "cuda:0"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32
# weights_path = "ckpts/model.pt"
weights_path = "ckpts/pytorch_model.bin"
# face_model_path = "ckpts/blaze_face_short_range.tflite"

# import mediapipe as mp

# BaseOptions = mp.tasks.BaseOptions
# FaceDetector = mp.tasks.vision.FaceDetector
# FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
# FaceDetectorResult = mp.tasks.vision.FaceDetectorResult
# VisionRunningMode = mp.tasks.vision.RunningMode

# options = FaceDetectorOptions(
#     base_options=BaseOptions(model_asset_path=face_model_path),
#     running_mode=VisionRunningMode.LIVE_STREAM,
# )
# face_detector = FaceDetector.create_from_options(options)

transforms_image = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(
            size=(224, 224), interpolation=InterpolationMode.BICUBIC
        ),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),
    ]
)


def load_model(weights_path):
    model = FaceXFormer().to(device)
    checkpoint = torch.load(weights_path, map_location=device)
    model.load_state_dict(checkpoint)
    # model.load_state_dict(checkpoint["state_dict_backbone"])
    model = model.eval()
    model = model.to(dtype=dtype)
    # model = torch.compile(model, mode="reduce-overhead")
    return model


model = load_model(weights_path)


def adjust_bbox(
    x_min, y_min, x_max, y_max, image_width, image_height, margin_percentage=50
):
    width = x_max - x_min
    height = y_max - y_min

    increase_width = width * (margin_percentage / 100.0) / 2
    increase_height = height * (margin_percentage / 100.0) / 2

    x_min_adjusted = int(max(0, x_min - increase_width))
    y_min_adjusted = int(max(0, y_min - increase_height))
    x_max_adjusted = int(min(image_width, x_max + increase_width))
    y_max_adjusted = int(min(image_height, y_max + increase_height))

    return x_min_adjusted, y_min_adjusted, x_max_adjusted, y_max_adjusted


def denorm_points(points, h, w, align_corners=False):
    if align_corners:
        denorm_points = (
            (points + 1) / 2 * torch.tensor([w - 1, h - 1]).to(points).view(1, 1, 2)
        )
    else:
        denorm_points = (
            (points + 1) * torch.tensor([w, h]).to(points).view(1, 1, 2) - 1
        ) / 2

    return denorm_points


@dataclass
class BoundingBox:
    x_min: int
    y_min: int
    x_max: int
    y_max: int


@dataclass
class FaceImg:
    image: np.ndarray
    x_min: int
    y_min: int


def get_faces_img(img: np.ndarray, boxes: list[BoundingBox]):
    if boxes is None or len(boxes) == 0:
        return []
    results: list[FaceImg] = []
    for box in boxes:
        x_min, y_min, x_max, y_max = box.x_min, box.y_min, box.x_max, box.y_max

        # Padding
        x_min, y_min, x_max, y_max = adjust_bbox(
            x_min, y_min, x_max, y_max, img.shape[1], img.shape[0]
        )
        image = img[y_min:y_max, x_min:x_max]
        results.append(FaceImg(image, int(x_min), int(y_min)))

    return results


@dataclass
class Face:
    image: torch.Tensor
    x_min: int
    y_min: int
    original_w: int
    original_h: int


def get_faces(img: np.ndarray, boxes: list[BoundingBox]):
    images = get_faces_img(img, boxes)
    images = [
        Face(
            transforms_image(face_image.image),
            face_image.x_min,
            face_image.y_min,
            face_image.image.shape[1],
            face_image.image.shape[0],
        )
        for face_image in images
    ]
    return images


def get_landmarks(faces: list[Face]):
    if len(faces) == 0:
        return []

    images = torch.stack([face.image for face in faces]).to(device=device, dtype=dtype)

    tasks = torch.tensor([1] * len(faces), device=device, dtype=dtype)
    with torch.inference_mode():
        # with torch.amp.autocast("cuda"):
        (
            batch_landmarks,
            headposes,
            attributes,
            visibilities,
            ages,
            genders,
            races,
            segs,
        ) = model.predict(images, None, tasks)
    batch_denormed = [
        denorm_points(landmarks, face.original_h, face.original_w)[0]
        for landmarks, face in zip(batch_landmarks.view(-1, 68, 2), faces)
    ]

    results = []
    for landmarks, face in zip(batch_denormed, faces):
        results.append(
            [(int(x + face.x_min), int(y + face.y_min)) for x, y in landmarks]
        )

    return results
mediapipe_facedetection.py
ADDED
File without changes
mtcnn_facedetection.py
ADDED
@@ -0,0 +1,18 @@
from landmark_detection import device, BoundingBox
from facenet_pytorch import MTCNN
import numpy as np

mtcnn = MTCNN(keep_all=True, device=device).eval()


def detect_faces(img) -> list[BoundingBox]:
    boxes, probs = mtcnn.detect(img)
    return [
        BoundingBox(
            x_min=int(box[0]),
            y_min=int(box[1]),
            x_max=int(box[2]),
            y_max=int(box[3]),
        )
        for box in boxes
    ] if boxes is not None else []
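Taken together, landmark_detection.py and mtcnn_facedetection.py form the per-frame pipeline that app.py runs on every video frame. A minimal usage sketch, not part of the uploaded files: it assumes the FaceXFormer checkpoint has already been downloaded to ckpts/pytorch_model.bin (importing landmark_detection loads it), and photo.jpg is a placeholder name for any local image containing a face.

# Sketch only: MTCNN face boxes -> FaceXFormer landmarks -> drawn points.
# "photo.jpg" and the output filename are placeholders.
import cv2
import landmark_detection
from mtcnn_facedetection import detect_faces

image = cv2.cvtColor(cv2.imread("photo.jpg"), cv2.COLOR_BGR2RGB)

boxes = detect_faces(image)                          # list[BoundingBox] from MTCNN
faces = landmark_detection.get_faces(image, boxes)   # padded crops, resized/normalized to 224x224
landmarks_batch = landmark_detection.get_landmarks(faces)  # 68 (x, y) points per face, in image coords

for landmarks in landmarks_batch:
    for x, y in landmarks:
        cv2.circle(image, (x, y), 2, (255, 0, 0), -1)
cv2.imwrite("photo_landmarks.jpg", cv2.cvtColor(image, cv2.COLOR_RGB2BGR))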
network/__init__.py
ADDED
@@ -0,0 +1 @@
from .models import FaceXFormer
network/models/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .transformer import TwoWayTransformer, LayerNorm2d
from .facexformer import FaceXFormer
network/models/facexformer.py
ADDED
@@ -0,0 +1,392 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from typing import Any, Optional, Tuple, Type
from torchvision.models import swin_b, convnext_base
from .transformer import TwoWayTransformer, LayerNorm2d
from transformers.utils.generic import ModelOutput


class MLP(nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        if self.sigmoid_output:
            x = F.sigmoid(x)
        return x


class FaceDecoder(nn.Module):
    def __init__(
        self,
        *,
        transformer_dim: int,
        transformer: nn.Module,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:

        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.landmarks_token = nn.Embedding(1, transformer_dim)
        self.pose_token = nn.Embedding(1, transformer_dim)
        self.attribute_token = nn.Embedding(1, transformer_dim)
        self.visibility_token = nn.Embedding(1, transformer_dim)
        self.age_token = nn.Embedding(1, transformer_dim)
        self.gender_token = nn.Embedding(1, transformer_dim)
        self.race_token = nn.Embedding(1, transformer_dim)
        self.mask_tokens = nn.Embedding(11, transformer_dim)

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(
                transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
            ),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(
                transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
            ),
            activation(),
        )

        self.output_hypernetwork_mlps = MLP(
            transformer_dim, transformer_dim, transformer_dim // 8, 3
        )

        self.landmarks_prediction_head = MLP(transformer_dim, transformer_dim, 136, 3)
        self.pose_prediction_head = MLP(transformer_dim, transformer_dim, 3, 3)
        self.attribute_prediction_head = MLP(transformer_dim, transformer_dim, 40, 3)
        self.visibility_prediction_head = MLP(transformer_dim, transformer_dim, 29, 3)
        self.age_prediction_head = MLP(transformer_dim, transformer_dim, 8, 3)
        self.gender_prediction_head = MLP(transformer_dim, transformer_dim, 2, 3)
        self.race_prediction_head = MLP(transformer_dim, transformer_dim, 5, 3)

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        output_tokens = torch.cat(
            [
                self.landmarks_token.weight,
                self.pose_token.weight,
                self.attribute_token.weight,
                self.visibility_token.weight,
                self.age_token.weight,
                self.gender_token.weight,
                self.race_token.weight,
                self.mask_tokens.weight,
            ],
            dim=0,
        )
        tokens = output_tokens.unsqueeze(0).expand(image_embeddings.size(0), -1, -1)

        src = image_embeddings
        pos_src = image_pe.expand(image_embeddings.size(0), -1, -1, -1)
        b, c, h, w = src.shape

        hs, src = self.transformer(src, pos_src, tokens)

        landmarks_token_out = hs[:, 0, :]
        pose_token_out = hs[:, 1, :]
        attribute_token_out = hs[:, 2, :]
        visibility_token_out = hs[:, 3, :]
        age_token_out = hs[:, 4, :]
        gender_token_out = hs[:, 5, :]
        race_token_out = hs[:, 6, :]
        mask_token_out = hs[:, 7:, :]

        landmark_output = self.landmarks_prediction_head(landmarks_token_out)
        headpose_output = self.pose_prediction_head(pose_token_out)
        attribute_output = self.attribute_prediction_head(attribute_token_out)
        visibility_output = self.visibility_prediction_head(visibility_token_out)
        age_output = self.age_prediction_head(age_token_out)
        gender_output = self.gender_prediction_head(gender_token_out)
        race_output = self.race_prediction_head(race_token_out)

        src = src.transpose(1, 2).view(b, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in = self.output_hypernetwork_mlps(mask_token_out)
        b, c, h, w = upscaled_embedding.shape
        seg_output = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        return (
            landmark_output,
            headpose_output,
            attribute_output,
            visibility_output,
            age_output,
            gender_output,
            race_output,
            seg_output,
        )


class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for a grid of the specified size."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        coords = coords_input.clone()
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C


class FaceXFormerMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.proj = nn.Linear(input_dim, 256)  # 128, 256, 512, 1024 => 256

    def forward(self, hidden_states: torch.Tensor):
        hidden_states = hidden_states.flatten(2).transpose(1, 2)
        hidden_states = self.proj(hidden_states)
        return hidden_states


class FaceXFormer(nn.Module):
    def __init__(self):
        super(FaceXFormer, self).__init__()

        # Backbone: Swin-B
        swin_v2 = swin_b(weights="IMAGENET1K_V1")
        self.backbone = torch.nn.Sequential(*(list(swin_v2.children())[:-1]))
        self.backbone.requires_grad_(False)

        # # Backbone: ConvNext-B
        # convnext_v2 = convnext_base(weights='IMAGENET1K_V1')
        # self.backbone = torch.nn.Sequential(
        #     *(list(convnext_v2.children())[:-1]))

        self.target_layer_names = ["0.1", "0.3", "0.5", "0.7"]
        self.multi_scale_features = []

        embed_dim = 1024
        out_chans = 256

        self.pe_layer = PositionEmbeddingRandom(out_chans // 2)

        for name, module in self.backbone.named_modules():
            if name in self.target_layer_names:
                module.register_forward_hook(self.save_features_hook(name))

        self.face_decoder = FaceDecoder(
            transformer_dim=256,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=256,
                mlp_dim=2048,
                num_heads=8,
            ),
        )

        num_encoder_blocks = 4
        hidden_sizes = [128, 256, 512, 1024]
        decoder_hidden_size = 256

        mlps = []
        for i in range(num_encoder_blocks):
            mlp = FaceXFormerMLP(input_dim=hidden_sizes[i])
            mlps.append(mlp)
        self.linear_c = nn.ModuleList(mlps)

        self.linear_fuse = nn.Conv2d(
            in_channels=decoder_hidden_size * num_encoder_blocks,  # 1024
            out_channels=decoder_hidden_size,  # 256
            kernel_size=1,
            bias=False,
        )

    def save_features_hook(self, name):
        def hook(module, input, output):
            self.multi_scale_features.append(output.permute(0, 3, 1, 2).contiguous())

        return hook

    def predict(self, x, labels, tasks):
        self.multi_scale_features.clear()

        _, _, h, w = x.shape
        features = self.backbone(x).squeeze()

        batch_size = self.multi_scale_features[-1].shape[0]
        all_hidden_states = ()
        for encoder_hidden_state, mlp in zip(self.multi_scale_features, self.linear_c):

            height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
            encoder_hidden_state = mlp(encoder_hidden_state)
            encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
            encoder_hidden_state = encoder_hidden_state.reshape(
                batch_size, -1, height, width
            )
            encoder_hidden_state = nn.functional.interpolate(
                encoder_hidden_state,
                size=self.multi_scale_features[0].size()[2:],
                mode="bilinear",
                align_corners=False,
            )
            all_hidden_states += (encoder_hidden_state,)

        fused_states = self.linear_fuse(torch.cat(all_hidden_states[::-1], dim=1))
        image_pe = self.pe_layer(
            (fused_states.shape[2], fused_states.shape[3])
        ).unsqueeze(0)

        (
            landmark_output,
            headpose_output,
            attribute_output,
            visibility_output,
            age_output,
            gender_output,
            race_output,
            seg_output,
        ) = self.face_decoder(image_embeddings=fused_states, image_pe=image_pe)

        segmentation_indices = tasks == 0
        seg_output = seg_output[segmentation_indices]

        landmarks_indices = tasks == 1
        landmark_output = landmark_output[landmarks_indices]

        headpose_indices = tasks == 2
        headpose_output = headpose_output[headpose_indices]

        attribute_indices = tasks == 3
        attribute_output = attribute_output[attribute_indices]

        age_indices = tasks == 4
        age_output = age_output[age_indices]
        gender_output = gender_output[age_indices]
        race_output = race_output[age_indices]

        visibility_indices = tasks == 5
        visibility_output = visibility_output[visibility_indices]

        return (
            landmark_output,
            headpose_output,
            attribute_output,
            visibility_output,
            age_output,
            gender_output,
            race_output,
            seg_output,
        )

    def loss(
        self, predictions: torch.Tensor, labels: torch.Tensor, num_items_in_batch=None
    ):
        # print(predictions.shape)
        # print(labels.shape)
        # print("predic:", predictions)
        # print("labels:", labels)
        # Used L2 loss for now
        loss = torch.nn.functional.mse_loss(predictions, labels, reduction="sum")
        if num_items_in_batch:
            loss /= num_items_in_batch
        return loss

    def forward(self, x, labels, num_items_in_batch=None):
        self.multi_scale_features.clear()

        _, _, h, w = x.shape
        features = self.backbone(x).squeeze()

        batch_size = self.multi_scale_features[-1].shape[0]
        all_hidden_states = ()
        for encoder_hidden_state, mlp in zip(self.multi_scale_features, self.linear_c):

            height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
            encoder_hidden_state = mlp(encoder_hidden_state)
            encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
            encoder_hidden_state = encoder_hidden_state.reshape(
                batch_size, -1, height, width
            )
            encoder_hidden_state = nn.functional.interpolate(
                encoder_hidden_state,
                size=self.multi_scale_features[0].size()[2:],
                mode="bilinear",
                align_corners=False,
            )
            all_hidden_states += (encoder_hidden_state,)

        fused_states = self.linear_fuse(torch.cat(all_hidden_states[::-1], dim=1))
        image_pe = self.pe_layer(
            (fused_states.shape[2], fused_states.shape[3])
        ).unsqueeze(0)

        (
            landmark_output,
            headpose_output,
            attribute_output,
            visibility_output,
            age_output,
            gender_output,
            race_output,
            seg_output,
        ) = self.face_decoder(image_embeddings=fused_states, image_pe=image_pe)

        # All tasks are landmark prediction
        if labels is not None:
            loss = self.loss(landmark_output.view(-1, 68, 2), labels)
        else:
            loss = None

        return ModelOutput(
            loss=loss,
        )
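network/models/facexformer.py defines the multi-task model; landmark_detection.get_landmarks calls its predict method with task id 1, which selects the landmark head. A small shape-check sketch, not part of the uploaded files: it uses dummy inputs and freshly initialized heads (only the ImageNet Swin-B backbone weights are downloaded), so the values are meaningless and only the shapes matter.

# Sketch only: verify the landmark output shape for a dummy batch of two face crops.
import torch
from network.models.facexformer import FaceXFormer

model = FaceXFormer().eval()          # downloads Swin-B ImageNet weights; task heads are untrained here
images = torch.randn(2, 3, 224, 224)  # dummy batch of normalized 224x224 face crops
tasks = torch.tensor([1, 1])          # task id 1 == facial landmarks

with torch.inference_mode():
    landmarks, *_ = model.predict(images, None, tasks)

print(landmarks.shape)                # torch.Size([2, 136]); .view(-1, 68, 2) gives 68 (x, y) pairs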
network/models/transformer.py
ADDED
@@ -0,0 +1,271 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch import Tensor, nn

import math
from typing import Tuple, Type


class MLPBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.lin2(self.act(self.lin1(x)))


# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class TwoWayTransformer(nn.Module):
    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(i == 0),
                )
            )

        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries
        queries = point_embedding
        keys = image_embedding

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            queries, keys = layer(
                queries=queries,
                keys=keys,
                query_pe=point_embedding,
                key_pe=image_pe,
            )

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class TwoWayAttentionBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        # Self attention block
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer that allows for downscaling the size of the embedding
    after projection to queries, keys, and values.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

    def _recombine_heads(self, x: Tensor) -> Tensor:
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ v
        out = self._recombine_heads(out)
        out = self.out_proj(out)

        return out
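network/models/transformer.py is the SAM-style two-way decoder that FaceDecoder wraps. A quick standalone shape check, not part of the uploaded files, with dummy tensors sized the way FaceXFormer calls it: 256-d embeddings over a 56x56 fused feature map and 7 task tokens plus 11 mask tokens (18 queries).

# Sketch only: confirm the query/key shapes returned by the two-way decoder.
import torch
from network.models.transformer import TwoWayTransformer

decoder = TwoWayTransformer(depth=2, embedding_dim=256, mlp_dim=2048, num_heads=8)

image_embedding = torch.randn(1, 256, 56, 56)  # B x C x H x W fused features
image_pe = torch.randn(1, 256, 56, 56)         # positional encoding, same shape
tokens = torch.randn(1, 18, 256)               # 7 task tokens + 11 mask tokens

queries, keys = decoder(image_embedding, image_pe, tokens)
print(queries.shape, keys.shape)               # torch.Size([1, 18, 256]) torch.Size([1, 3136, 256])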
requirements.txt
ADDED
@@ -0,0 +1,14 @@
torch
torchaudio
torchvision
git+https://github.com/thng292/facenet-pytorch.git
gradio
fastrtc
streamlit
streamlit-webrtc
opencv-python
huggingface_hub[cli]
transformers[torch]
datasets
mediapipe
deepspeed