Spaces: Running on Zero

Add application file
- __pycache__/body_features.cpython-38.pyc +0 -0
- __pycache__/crop_face.cpython-38.pyc +0 -0
- __pycache__/crop_hands.cpython-38.pyc +0 -0
- __pycache__/dinov2_features.cpython-38.pyc +0 -0
- __pycache__/inference.cpython-38.pyc +0 -0
- __pycache__/kpe_mediapipe.cpython-38.pyc +0 -0
- __pycache__/shubert.cpython-38.pyc +0 -0
- app.py +536 -8
- attention.py +107 -0
- block.py +322 -0
- body_features.py +358 -0
- crop_face.py +415 -0
- crop_hands.py +445 -0
- dinov2_features.py +351 -0
- features.py +115 -0
- inference.py +738 -0
- kpe_mediapipe.py +408 -0
- shubert.py +479 -0
- shubert_inference.py +439 -0
__pycache__/body_features.cpython-38.pyc
ADDED
Binary file (10.2 kB)

__pycache__/crop_face.cpython-38.pyc
ADDED
Binary file (12.1 kB)

__pycache__/crop_hands.cpython-38.pyc
ADDED
Binary file (12.1 kB)

__pycache__/dinov2_features.cpython-38.pyc
ADDED
Binary file (10.6 kB)

__pycache__/inference.cpython-38.pyc
ADDED
Binary file (19.7 kB)

__pycache__/kpe_mediapipe.cpython-38.pyc
ADDED
Binary file (12 kB)

__pycache__/shubert.cpython-38.pyc
ADDED
Binary file (8.64 kB)
app.py
CHANGED
@@ -1,14 +1,542 @@

(This replaces the previous 14-line app.py. The removed lines are mostly truncated in this view; they included "import torch" and a stub function definition under the @spaces.GPU decorator.)

New contents of app.py:

import gradio as gr
import os
import tempfile
import huggingface_hub
import shutil
import logging
import traceback
from features import SHuBERTProcessor
import spaces

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set writable cache directories
def setup_cache_directories():
    """Set up cache directories with proper error handling"""
    try:
        cache_dirs = {
            'MPLCONFIGDIR': '/tmp/matplotlib',
            'TRANSFORMERS_CACHE': '/tmp/huggingface',
            'HF_HOME': '/tmp/huggingface',
            'FONTCONFIG_PATH': '/tmp/fontconfig',
            'TORCH_HOME': '/tmp/torch',  # PyTorch cache directory
        }

        for env_var, path in cache_dirs.items():
            os.environ[env_var] = path
            os.makedirs(path, exist_ok=True, mode=0o777)
            logger.info(f"Cache directory created: {env_var} = {path}")

        # Also set XDG_CACHE_HOME to override default .cache location
        os.environ['XDG_CACHE_HOME'] = '/tmp/cache'
        os.makedirs('/tmp/cache', exist_ok=True, mode=0o777)
        logger.info(f"Cache directory created: XDG_CACHE_HOME = /tmp/cache")

        # Clear any existing PyTorch Hub cache to avoid corruption issues
        torch_hub_dir = '/tmp/torch/hub'
        if os.path.exists(torch_hub_dir):
            shutil.rmtree(torch_hub_dir)
            logger.info("Cleared existing PyTorch Hub cache")
        os.makedirs(torch_hub_dir, exist_ok=True, mode=0o777)
        logger.info(f"Created clean PyTorch Hub cache directory: {torch_hub_dir}")

        # Copy updated DINOv2 files to torch cache after clearing
        # This ensures they're available when PyTorch Hub downloads the repo
        try:
            src_dir = os.path.dirname(os.path.abspath(__file__))
            target_dir = '/tmp/torch/hub/facebookresearch_dinov2_main/dinov2/layers'

            for filename in ['attention.py', 'block.py']:
                src_path = os.path.join(src_dir, filename)
                if os.path.exists(src_path):
                    # We'll copy these after the initial hub download
                    logger.info(f"Found {filename} in project directory - will copy after hub download")
                else:
                    logger.warning(f"Could not find {filename} in project directory")
        except Exception as e:
            logger.warning(f"Error preparing DINOv2 files: {e}")

        return True
    except Exception as e:
        logger.error(f"Error creating cache directories: {str(e)}")
        return False

# Configuration for Hugging Face Spaces
MODEL_REPO = "ShesterG/SHuBERT"
TOKEN = os.environ.get('HF_TOKEN')

def validate_environment():
    """Validate required environment variables and setup"""
    if not TOKEN:
        raise ValueError("HF_TOKEN environment variable not set. This is required to access private model repository.")

    # Check available disk space
    free_space = shutil.disk_usage('/').free / (1024*1024*1024)  # GB
    logger.info(f"Available disk space: {free_space:.2f} GB")

    if free_space < 2:  # Less than 2GB
        logger.warning("Low disk space available. This may cause issues.")

    return True

def download_models():
    """Download all required models from Hugging Face Hub with enhanced error handling"""
    logger.info("Starting model download process...")

    try:
        # Validate environment first
        validate_environment()

        logger.info("Downloading entire models folder...")

        # Download the entire models folder
        models_path = huggingface_hub.snapshot_download(
            repo_id=MODEL_REPO,
            allow_patterns="models/*",  # Download everything in models folder
            token=TOKEN,
            cache_dir=os.environ['TRANSFORMERS_CACHE']
        )

        # Build config dict with expected file paths
        config = {
            'yolov8_model_path': os.path.join(models_path, "models/yolov8n.pt"),
            'dino_face_model_path': os.path.join(models_path, "models/dinov2face.pth"),
            'dino_hands_model_path': os.path.join(models_path, "models/dinov2hand.pth"),
            'mediapipe_face_model_path': os.path.join(models_path, "models/face_landmarker_v2_with_blendshapes.task"),
            'mediapipe_hands_model_path': os.path.join(models_path, "models/hand_landmarker.task"),
            'shubert_model_path': os.path.join(models_path, "models/checkpoint_836_400000.pt"),
            'slt_model_config': os.path.join(models_path, "models/byt5_base/config.json"),
            'slt_model_checkpoint': os.path.join(models_path, "models/checkpoint-11625"),
            'slt_tokenizer_checkpoint': os.path.join(models_path, "models/byt5_base"),
            'temp_dir': 'temp'
        }

        # Verify all required files and folders exist
        logger.info("Verifying downloaded files...")
        missing_files = []

        for key, path in config.items():
            if key == 'temp_dir':  # Skip temp_dir check
                continue

            if not os.path.exists(path):
                missing_files.append(f"{key}: {path}")
                logger.error(f"Missing: {path}")
            else:
                logger.info(f"✓ Found: {path}")

        if missing_files:
            logger.error(f"Missing {len(missing_files)} required files/folders:")
            for missing in missing_files:
                logger.error(f"  - {missing}")
            raise FileNotFoundError(f"Required files not found: {missing_files}")

        logger.info("All models downloaded and verified successfully!")
        logger.info(f"Models root path: {models_path}")

        return config

    except Exception as e:
        logger.error(f"Error downloading models: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")

        # Additional debugging info
        try:
            cache_contents = os.listdir(os.environ['TRANSFORMERS_CACHE'])
            logger.info(f"Cache directory contents: {cache_contents}")
        except:
            logger.error("Cannot access cache directory")

        return None

def initialize_processor(config):
    """Initialize SHuBERT processor with error handling"""
    try:
        logger.info("Initializing SHuBERT processor...")
        processor = SHuBERTProcessor(config)
        logger.info("SHuBERT processor initialized successfully!")
        return processor
    except Exception as e:
        logger.error(f"Error initializing SHuBERT processor: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None

# Initialize the application
def initialize_app():
    """Initialize the entire application with comprehensive error handling"""
    try:
        # Setup cache directories
        if not setup_cache_directories():
            raise RuntimeError("Failed to setup cache directories")

        # Download models
        config = download_models()
        if config is None:
            raise RuntimeError("Failed to download models")

        # Initialize processor
        processor = initialize_processor(config)
        if processor is None:
            raise RuntimeError("Failed to initialize SHuBERT processor")

        logger.info("Application initialized successfully!")
        return config, processor

    except Exception as e:
        error_msg = f"Application initialization failed: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        raise RuntimeError(error_msg)

# Global variables for application state
config = None
processor = None
initialization_error = None

try:
    config, processor = initialize_app()
except Exception as e:
    initialization_error = str(e)
    logger.error(f"Startup failed: {initialization_error}")

def copy_dinov2_files_if_needed():
    """Copy updated DINOv2 files after PyTorch Hub download if needed"""
    try:
        src_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = '/tmp/torch/hub/facebookresearch_dinov2_main/dinov2/layers'

        # Check if PyTorch Hub has downloaded the repository
        hub_main_dir = '/tmp/torch/hub/facebookresearch_dinov2_main'

        if os.path.exists(hub_main_dir):
            # Ensure the target directory exists
            os.makedirs(target_dir, exist_ok=True)

            files_copied = 0
            for filename in ['attention.py', 'block.py']:
                src_path = os.path.join(src_dir, filename)
                target_path = os.path.join(target_dir, filename)

                if os.path.exists(src_path):
                    # Always overwrite with our robust versions
                    shutil.copy2(src_path, target_path)
                    # Make sure it's readable
                    os.chmod(target_path, 0o644)
                    logger.info(f"Replaced {filename} with robust version (numpy/Python 3.8 compatible)")
                    files_copied += 1
                else:
                    logger.error(f"Source file not found: {src_path}")

            if files_copied > 0:
                # Clear Python's import cache to ensure new files are used
                import importlib
                import sys

                # Remove any cached imports of dinov2 modules
                modules_to_remove = [key for key in sys.modules.keys() if 'dinov2' in key]
                for module in modules_to_remove:
                    del sys.modules[module]
                    logger.info(f"Cleared cached import: {module}")

                logger.info(f"Successfully replaced {files_copied} DINOv2 files with robust versions")
                return True
        else:
            logger.info("PyTorch Hub repository not yet downloaded")
            return False

    except Exception as e:
        logger.error(f"Error copying DINOv2 files: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return False

@spaces.GPU
def process_video(video_file):
    """Process uploaded video file with enhanced error handling"""
    # Check if initialization was successful
    if initialization_error:
        return f"Application initialization failed: {initialization_error}\n\nPlease check the logs for more details."

    if processor is None:
        return "Error: Model not initialized properly. Please check the logs."

    if video_file is None:
        return "Please upload a video file."

    logger.info(f"=== Starting video processing ===")
    logger.info(f"Video file input: {video_file}")
    logger.info(f"Video file type: {type(video_file)}")

    try:
        # Create temp directory with proper permissions
        temp_dir = config['temp_dir']
        os.makedirs(temp_dir, exist_ok=True, mode=0o777)
        logger.info(f"Temp directory: {temp_dir}")

        # Generate unique filename to avoid conflicts
        import time
        timestamp = str(int(time.time() * 1000))
        file_extension = '.mp4'  # Default extension

        # Try to get original extension if available
        try:
            if hasattr(video_file, 'name') and video_file.name:
                file_extension = os.path.splitext(video_file.name)[1] or '.mp4'
            elif isinstance(video_file, str):
                file_extension = os.path.splitext(video_file)[1] or '.mp4'
        except:
            pass

        temp_video_path = os.path.join(temp_dir, f"video_{timestamp}{file_extension}")
        logger.info(f"Target temp video path: {temp_video_path}")

        # Handle Gradio file upload - video_file is typically a string path to temp file
        logger.info(f"Processing video file: {video_file} (type: {type(video_file)})")

        if isinstance(video_file, str):
            # Gradio provides a file path string
            source_path = video_file

            # Handle both absolute and relative paths
            if not os.path.isabs(source_path):
                # Try current working directory first
                abs_source_path = os.path.abspath(source_path)
                logger.info(f"Converting relative path {source_path} to absolute: {abs_source_path}")
                if os.path.exists(abs_source_path):
                    source_path = abs_source_path
                else:
                    # Try looking in common Gradio temp directories
                    possible_paths = [
                        source_path,
                        os.path.join('/tmp', os.path.basename(source_path)),
                        os.path.join('/tmp/gradio', os.path.basename(source_path)),
                        abs_source_path
                    ]

                    found_path = None
                    for path in possible_paths:
                        logger.info(f"Checking path: {path}")
                        if os.path.exists(path):
                            found_path = path
                            logger.info(f"Found file at: {path}")
                            break

                    if found_path:
                        source_path = found_path
                    else:
                        logger.error(f"Could not find source file in any expected location")
                        logger.error(f"Tried paths: {possible_paths}")
                        raise FileNotFoundError(f"Source video file not found in any expected location: {video_file}")

            logger.info(f"Final source file path: {source_path}")
            logger.info(f"Source file exists: {os.path.exists(source_path)}")

            if os.path.exists(source_path):
                try:
                    # Check source file permissions and size
                    stat_info = os.stat(source_path)
                    logger.info(f"Source file size: {stat_info.st_size} bytes, mode: {oct(stat_info.st_mode)}")

                    # Try to read the file content
                    with open(source_path, 'rb') as src:
                        content = src.read()
                    logger.info(f"Successfully read {len(content)} bytes from source")

                    # Write to destination (with a different name to avoid conflicts)
                    final_temp_path = os.path.join(temp_dir, f"processed_{timestamp}{file_extension}")
                    with open(final_temp_path, 'wb') as dst:
                        dst.write(content)
                    logger.info(f"Successfully wrote to destination: {final_temp_path}")

                    # Update temp_video_path to the final location
                    temp_video_path = final_temp_path

                except PermissionError as e:
                    logger.error(f"Permission error reading source file: {e}")
                    # Try alternative approach - use a completely different temp location
                    try:
                        import tempfile
                        # Create a new temporary file in system temp directory
                        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp:
                            alternative_temp_path = tmp.name

                        logger.info(f"Trying alternative temp path: {alternative_temp_path}")

                        # Try to copy using system copy command as fallback
                        import subprocess
                        result = subprocess.run(['cp', source_path, alternative_temp_path],
                                                capture_output=True, text=True)

                        if result.returncode == 0:
                            logger.info("Successfully copied using system cp command")
                            temp_video_path = alternative_temp_path
                        else:
                            logger.error(f"System cp failed: {result.stderr}")
                            raise PermissionError(f"Cannot read video file due to permission restrictions: {e}")

                    except Exception as e2:
                        logger.error(f"Alternative copy method also failed: {e2}")
                        raise PermissionError(f"Cannot read video file due to permission restrictions: {e}")
            else:
                raise FileNotFoundError(f"Source video file not found: {source_path}")

        elif hasattr(video_file, 'read'):
            # If it's a file-like object with read method
            try:
                content = video_file.read()
                with open(temp_video_path, 'wb') as f:
                    f.write(content)
                logger.info(f"Saved video from file object: {temp_video_path} ({len(content)} bytes)")
            except Exception as e:
                logger.error(f"Error reading from file object: {e}")
                raise ValueError(f"Cannot read from file object: {e}")
        else:
            # Handle other cases - try to extract file path or content
            logger.info(f"Attempting to handle unknown file type: {type(video_file)}")
            try:
                # Check if it has a name attribute (common for file objects)
                if hasattr(video_file, 'name'):
                    source_path = video_file.name
                    logger.info(f"Found name attribute: {source_path}")

                    if os.path.exists(source_path):
                        with open(source_path, 'rb') as src:
                            content = src.read()
                        with open(temp_video_path, 'wb') as dst:
                            dst.write(content)
                        logger.info(f"Successfully copied from name attribute")
                    else:
                        raise FileNotFoundError(f"File from name attribute not found: {source_path}")
                else:
                    logger.error(f"Unsupported video file type: {type(video_file)}")
                    raise ValueError(f"Unsupported video file type: {type(video_file)}")
            except Exception as e:
                logger.error(f"Failed to handle unknown file type: {e}")
                raise ValueError(f"Cannot process video file: {e}")

        # Set proper permissions on the saved file
        os.chmod(temp_video_path, 0o666)

        # Verify file exists and has content
        if not os.path.exists(temp_video_path) or os.path.getsize(temp_video_path) == 0:
            raise ValueError("Video file is empty or could not be saved")

        # Copy DINOv2 files if needed before processing
        # This needs to happen right after PyTorch Hub downloads but before model loading
        logger.info("Ensuring DINOv2 files are ready for processing...")
        copy_dinov2_files_if_needed()

        # Set up a monitoring patch for torch.hub.load to replace files immediately after download
        original_torch_hub_load = None
        try:
            import torch.hub
            original_torch_hub_load = torch.hub.load

            def patched_torch_hub_load(*args, **kwargs):
                logger.info(f"PyTorch Hub load called with: {args[0] if args else 'unknown'}")

                # Call the original function first
                result = original_torch_hub_load(*args, **kwargs)

                # If this was a DINOv2 call, immediately replace the files
                if args and 'dinov2' in str(args[0]):
                    logger.info("DINOv2 downloaded! Immediately replacing with robust versions...")

                    # Try multiple times to ensure files are replaced
                    import time
                    for attempt in range(5):
                        if copy_dinov2_files_if_needed():
                            logger.info("Successfully replaced DINOv2 files!")
                            break
                        else:
                            logger.info(f"Attempt {attempt + 1} failed, retrying in 1 second...")
                            time.sleep(1)

                return result

            # Temporarily patch torch.hub.load
            torch.hub.load = patched_torch_hub_load
            logger.info("Patched torch.hub.load to replace DINOv2 files after download")
        except Exception as e:
            logger.warning(f"Could not patch torch.hub.load: {e}")

        logger.info(f"Processing video: {temp_video_path}")
        try:
            output_text = processor.process_video(temp_video_path)
        finally:
            # Restore original function
            if original_torch_hub_load:
                try:
                    import torch.hub
                    torch.hub.load = original_torch_hub_load
                    logger.info("Restored original torch.hub.load")
                except:
                    pass

        logger.info(f"Video processed successfully. Output: {output_text[:100]}...")

        # Clean up temp file
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
            logger.info("Temporary video file cleaned up")

        return output_text

    except Exception as e:
        logger.error(f"Error processing video: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return f"Error processing video: {str(e)}\n\nPlease check that your video is a valid ASL video under 10 seconds."

# Create Gradio interface
def create_interface():
    """Create the Gradio interface"""
    description = """
Upload an ASL* video to get an English translation. *Sign languages belonging to the same sign language family as ASL (e.g. Ghanaian Sign Language, as well as others listed in Table 7, Row 1 of https://aclanthology.org/2023.findings-emnlp.664.pdf) might also have non-trivial performance, although the model is trained only on ASL data.

This app uses TTIC's foundation model SHuBERT (introduced in an ACL 2025 paper, see http://shubert.pals.ttic.edu).

**Requirements:**
- We recommend that videos be under 60 seconds. Performance for longer videos has not been tested.
- The signer should be the main part of the video. Videos recorded from a phone camera, tablet, or personal computer should work well. Studio recordings where the signer is farther from the camera may not work as well.
- Supported formats: MP4, MOV

**Note:**
- Videos will be deleted after the output is generated.
- Inquiries or feedback? Please email us at [email protected]
"""

    if initialization_error:
        description += f"\n\n:warning: **Initialization Error:** {initialization_error}"

    return gr.Interface(
        fn=process_video,
        inputs=gr.Video(label="ASL Video (under 60 seconds)", format="mp4"),
        # inputs=gr.File(
        #     label="Upload ASL Video (under 60 seconds)",
        #     file_types=[".mp4", ".avi", ".mov", ".webm"],
        #     type="filepath"  # This tells Gradio to provide the file path directly
        # ),
        outputs=gr.Textbox(label="English Translation", lines=5),
        title="ASL Video to English Text Translation",
        description=description,
        article="",
        examples=[],
        allow_flagging="never"
    )

# Create the demo
demo = create_interface()

if __name__ == "__main__":
    # Launch with better configuration for Hugging Face Spaces
    logger.info("Launching Gradio interface...")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
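The torch.hub.load handling in process_video above follows a general patch-then-restore pattern: wrap the original callable, run a post-download hook when the call targets DINOv2, and always restore the original in a finally block. The sketch below shows that pattern in isolation; it is not part of app.py, and the print-based hook is only a stand-in for copy_dinov2_files_if_needed.

import torch.hub

def make_patched_load(original_load, post_hook):
    def patched(*args, **kwargs):
        result = original_load(*args, **kwargs)   # download/load as usual
        if args and "dinov2" in str(args[0]):     # only react to DINOv2 repos
            post_hook()                           # e.g. replace attention.py / block.py
        return result
    return patched

original = torch.hub.load
torch.hub.load = make_patched_load(original, post_hook=lambda: print("hook ran"))
try:
    pass  # code that calls torch.hub.load("facebookresearch/dinov2", ...) would go here
finally:
    torch.hub.load = original  # always restore the unpatched function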
attention.py
ADDED
@@ -0,0 +1,107 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging
import os
import warnings

import torch
from torch import nn, Tensor


logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import memory_efficient_attention, unbind

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (Attention)")
    else:
        warnings.warn("xFormers is disabled (Attention)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False
    warnings.warn("xFormers is not available (Attention)")


try:
    from typing import Optional
    from typing import Union
    FloatOrNone = Union[float, None]
except ImportError:
    FloatOrNone = float | None


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = attn_drop
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def init_weights(
        self, init_attn_std: FloatOrNone = None, init_proj_std: FloatOrNone = None, factor: float = 1.0
    ) -> None:
        init_attn_std = init_attn_std or (self.dim**-0.5)
        init_proj_std = init_proj_std or init_attn_std * factor
        nn.init.normal_(self.qkv.weight, std=init_attn_std)
        nn.init.normal_(self.proj.weight, std=init_proj_std)
        if self.qkv.bias is not None:
            nn.init.zeros_(self.qkv.bias)
        if self.proj.bias is not None:
            nn.init.zeros_(self.proj.bias)

    def forward(self, x: Tensor, is_causal: bool = False) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = torch.unbind(qkv, 2)
        q, k, v = [t.transpose(1, 2) for t in [q, k, v]]
        x = nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=self.attn_drop if self.training else 0, is_causal=is_causal
        )
        x = x.transpose(1, 2).contiguous().view(B, N, C)
        x = self.proj_drop(self.proj(x))
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
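For readers unfamiliar with the fused attention call above, the following standalone sketch (not part of the commit) traces the tensor shapes in Attention.forward: (B, N, C) is projected to qkv, split into per-head tensors, passed through scaled_dot_product_attention (PyTorch 2.x), and merged back to (B, N, C). The dimensions are arbitrary example values.

import torch
import torch.nn.functional as F

B, N, C, num_heads = 2, 16, 64, 8
head_dim = C // num_heads
x = torch.randn(B, N, C)

qkv = torch.nn.Linear(C, C * 3)(x).reshape(B, N, 3, num_heads, head_dim)
q, k, v = torch.unbind(qkv, 2)                    # each (B, N, heads, head_dim)
q, k, v = [t.transpose(1, 2) for t in (q, k, v)]  # each (B, heads, N, head_dim)

out = F.scaled_dot_product_attention(q, k, v)     # fused attention kernel
out = out.transpose(1, 2).contiguous().view(B, N, C)
print(out.shape)  # torch.Size([2, 16, 64])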
block.py
ADDED
@@ -0,0 +1,322 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

import logging
import os
from typing import Callable, List, Any, Tuple, Dict, Optional
import warnings

import torch
from torch import nn, Tensor

from .attention import Attention, MemEffAttention
from .drop_path import DropPath
from .layer_scale import LayerScale
from .mlp import Mlp

try:
    from typing import Optional
    from typing import Union
    FloatOrNone = Union[float, None]
except ImportError:
    FloatOrNone = float | None

logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import fmha, scaled_index_add, index_select_cat

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (Block)")
    else:
        warnings.warn("xFormers is disabled (Block)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False

    warnings.warn("xFormers is not available (Block)")


class Block(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x


class CausalAttentionBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        ffn_ratio: float = 4.0,
        ls_init_value: Optional[float] = None,
        is_causal: bool = True,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        dropout_prob: float = 0.0,
    ):
        super().__init__()

        self.dim = dim
        self.is_causal = is_causal
        self.ls1 = LayerScale(dim, init_values=ls_init_value) if ls_init_value else nn.Identity()
        self.attention_norm = norm_layer(dim)
        self.attention = Attention(dim, num_heads, attn_drop=dropout_prob, proj_drop=dropout_prob)

        self.ffn_norm = norm_layer(dim)
        ffn_hidden_dim = int(dim * ffn_ratio)
        self.feed_forward = Mlp(
            in_features=dim,
            hidden_features=ffn_hidden_dim,
            drop=dropout_prob,
            act_layer=act_layer,
        )

        self.ls2 = LayerScale(dim, init_values=ls_init_value) if ls_init_value else nn.Identity()

    def init_weights(
        self,
        init_attn_std: FloatOrNone = None,
        init_proj_std: FloatOrNone = None,
        init_fc_std: FloatOrNone = None,
        factor: float = 1.0,
    ) -> None:
        init_attn_std = init_attn_std or (self.dim**-0.5)
        init_proj_std = init_proj_std or init_attn_std * factor
        init_fc_std = init_fc_std or (2 * self.dim) ** -0.5
        self.attention.init_weights(init_attn_std, init_proj_std)
        self.attention_norm.reset_parameters()
        nn.init.normal_(self.feed_forward.fc1.weight, std=init_fc_std)
        nn.init.normal_(self.feed_forward.fc2.weight, std=init_proj_std)
        self.ffn_norm.reset_parameters()

    def forward(
        self,
        x: torch.Tensor,
    ):
        x_attn = x + self.ls1(self.attention(self.attention_norm(x), self.is_causal))
        x_ffn = x_attn + self.ls2(self.feed_forward(self.ffn_norm(x_attn)))
        return x_ffn


def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    # 1) extract subset using permutation
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    x_subset = x[brange]

    # 2) apply residual_func to get residual
    residual = residual_func(x_subset)

    x_flat = x.flatten(1)
    residual = residual.flatten(1)

    residual_scale_factor = b / sample_subset_size

    # 3) add the residual
    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    return x_plus_residual.view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlock(Block):
    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
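drop_add_residual_stochastic_depth above implements batch-level stochastic depth: the residual branch runs on a random subset of the batch and the result is scaled by b / subset_size so its expected contribution is unchanged. Below is a condensed standalone sketch of the same idea (not part of the commit; the toy residual_func is illustrative only).

import torch

def drop_add_residual(x, residual_func, sample_drop_ratio=0.3):
    b = x.shape[0]
    subset = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = torch.randperm(b, device=x.device)[:subset]   # random subset of the batch
    residual = residual_func(x[brange]).flatten(1)         # branch runs only on that subset
    scale = b / subset                                     # rescale to keep the expectation
    out = torch.index_add(x.flatten(1), 0, brange, residual.to(x.dtype), alpha=scale)
    return out.view_as(x)

x = torch.randn(8, 16, 64)
y = drop_add_residual(x, residual_func=lambda t: 0.1 * t)
print(y.shape)  # torch.Size([8, 16, 64])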
body_features.py
ADDED
@@ -0,0 +1,358 @@

import cv2
import numpy as np
import os
import pickle
import gzip
from datetime import datetime
from pathlib import Path
import decord
import argparse
import json
import glob
import time
from typing import Dict, List, Optional, Tuple, Union, Any


class PoseProcessor:
    """
    A class for processing pose landmarks and converting them to normalized numpy arrays.
    """

    def __init__(self, pose_indices: Optional[List[int]] = None,
                 normalize_keypoints: bool = True, fill_missing_value: float = -9999.0):
        """
        Initialize the PoseProcessor.

        Args:
            pose_indices: List of pose landmark indices to extract.
                Default is [0,11,12,13,14,15,16] (nose, shoulders, elbows, wrists)
            normalize_keypoints: Whether to normalize keypoints to signing space
            fill_missing_value: Value to use for missing keypoints
        """
        self.pose_indices = pose_indices if pose_indices else [0, 11, 12, 13, 14, 15, 16]
        self.normalize_keypoints = normalize_keypoints
        self.fill_missing_value = fill_missing_value

        # Number of coordinates per keypoint (x, y)
        self.coords_per_keypoint = 2
        self.output_shape = (len(self.pose_indices), self.coords_per_keypoint)

    def normalize_pose_keypoints(self, pose_landmarks: List[List[float]]) -> List[List[float]]:
        """
        Normalize pose keypoints to signing space.

        Args:
            pose_landmarks: List of pose landmarks from MediaPipe

        Returns:
            List of normalized pose keypoints
        """
        # Extract relevant landmarks for normalization
        left_shoulder = np.array(pose_landmarks[11][:2])
        right_shoulder = np.array(pose_landmarks[12][:2])
        left_eye = np.array(pose_landmarks[2][:2])
        nose = np.array(pose_landmarks[0][:2])

        # Calculate head unit in normalized space
        head_unit = np.linalg.norm(right_shoulder - left_shoulder) / 2

        # Define signing space dimensions in normalized space
        signing_space_width = 6 * head_unit
        signing_space_height = 7 * head_unit

        # Calculate signing space bounding box in normalized space
        signing_space_top = left_eye[1] - 0.5 * head_unit
        signing_space_bottom = signing_space_top + signing_space_height
        signing_space_left = nose[0] - signing_space_width / 2
        signing_space_right = signing_space_left + signing_space_width

        # Create transformation matrix
        translation_matrix = np.array([[1, 0, -signing_space_left],
                                       [0, 1, -signing_space_top],
                                       [0, 0, 1]])
        scale_matrix = np.array([[1 / signing_space_width, 0, 0],
                                 [0, 1 / signing_space_height, 0],
                                 [0, 0, 1]])
        shift_matrix = np.array([[1, 0, -0.5],
                                 [0, 1, -0.5],
                                 [0, 0, 1]])
        transformation_matrix = shift_matrix @ scale_matrix @ translation_matrix

        # Apply transformation to pose keypoints
        normalized_keypoints = []
        for landmark in pose_landmarks:
            keypoint = np.array([landmark[0], landmark[1], 1])
            normalized_keypoint = transformation_matrix @ keypoint
            normalized_keypoints.append(normalized_keypoint[:2].tolist())

        return normalized_keypoints

    def process_frame_landmarks(self, frame_landmarks: Optional[Dict[str, Any]]) -> np.ndarray:
        """
        Process landmarks for a single frame.

        Args:
            frame_landmarks: Dictionary containing pose landmarks for one frame

        Returns:
            Numpy array of processed pose keypoints
        """
        if frame_landmarks is None or frame_landmarks.get('pose_landmarks') is None:
            # Return missing value array
            return np.full(self.output_shape, self.fill_missing_value).flatten()

        # Get pose landmarks
        pose_landmarks = frame_landmarks['pose_landmarks'][0]

        # Normalize keypoints if required
        if self.normalize_keypoints:
            # Take first 25 landmarks for normalization (MediaPipe pose has 33 total)
            normalized_landmarks = self.normalize_pose_keypoints(pose_landmarks[:25])
        else:
            normalized_landmarks = pose_landmarks

        # Extract only the specified indices
        selected_landmarks = [normalized_landmarks[i] for i in self.pose_indices]

        # Convert to numpy array and flatten
        frame_keypoints = np.array(selected_landmarks).flatten()

        return frame_keypoints

    def process_landmarks_sequence(self, landmarks_data: Dict[int, Any]) -> np.ndarray:
        """
        Process landmarks for an entire sequence (video).

        Args:
            landmarks_data: Dictionary containing landmarks for each frame

        Returns:
            Numpy array of shape (num_frames, num_keypoints * 2)
        """
        # Get number of frames
        if not landmarks_data:
            return np.array([])

        max_frame = max(landmarks_data.keys())
        num_frames = max_frame + 1

        video_pose_landmarks = []
        prev_pose = None

        for i in range(num_frames):
            frame_landmarks = landmarks_data.get(i, None)

            if frame_landmarks is None:
                # Use previous pose if available, otherwise use missing values
                if prev_pose is not None:
                    frame_keypoints = prev_pose
                else:
                    frame_keypoints = np.full(self.output_shape, self.fill_missing_value).flatten()
            else:
                # Process current frame
                frame_keypoints = self.process_frame_landmarks(frame_landmarks)
                if not np.all(frame_keypoints == self.fill_missing_value):
                    prev_pose = frame_keypoints

            video_pose_landmarks.append(frame_keypoints)

        # Convert to numpy array
        video_pose_landmarks = np.array(video_pose_landmarks)

        # Apply any post-processing (like the original code's wrist masking)
        # video_pose_landmarks = self._apply_post_processing(video_pose_landmarks)

        return video_pose_landmarks

    def _apply_post_processing(self, pose_array: np.ndarray) -> np.ndarray:
        """
        Apply post-processing to the pose array.

        Args:
            pose_array: Input pose array

        Returns:
            Post-processed pose array
        """
        # The original code fills left and right wrist with -9999
        # This corresponds to indices 15 and 16 in the original pose landmarks
        # In our selected indices [0,11,12,13,14,15,16], wrists are at positions 5 and 6
        # Each keypoint has 2 coordinates, so wrists are at positions 10-11 and 12-13

        # if len(self.pose_indices) >= 7 and 15 in self.pose_indices and 16 in self.pose_indices:
        #     # Find positions of wrists in our selected indices
        #     left_wrist_idx = self.pose_indices.index(15) * 2  # *2 because each keypoint has x,y
        #     right_wrist_idx = self.pose_indices.index(16) * 2

        #     # Fill wrist coordinates with missing value
        #     pose_array[:, left_wrist_idx:left_wrist_idx+2] = self.fill_missing_value
        #     pose_array[:, right_wrist_idx:right_wrist_idx+2] = self.fill_missing_value

        return pose_array

    def process_landmarks_from_file(self, pose_file_path: str) -> np.ndarray:
        """
        Process landmarks from a JSON file.

        Args:
            pose_file_path: Path to the pose landmarks JSON file

        Returns:
            Numpy array of processed pose keypoints
        """
        try:
            with open(pose_file_path, 'r') as f:
                landmarks_data = json.load(f)

            # Convert string keys to integers
            landmarks_data = {int(k): v for k, v in landmarks_data.items()}

            return self.process_landmarks_sequence(landmarks_data)

        except Exception as e:
            print(f"Error processing {pose_file_path}: {e}")
            return np.array([])

    def process_and_save_landmarks(self, landmarks_data: Dict[int, Any],
                                   output_path: str, filename: str) -> str:
        """
        Process landmarks and save to file.

        Args:
            landmarks_data: Dictionary containing landmarks for each frame
            output_path: Directory to save the processed landmarks
            filename: Name for the output file (without extension)

        Returns:
            Path to the saved file
        """
        # Process landmarks
        processed_landmarks = self.process_landmarks_sequence(landmarks_data)

        # Create output directory if it doesn't exist
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save to file
        save_path = output_dir / f"{filename}.npy"
        np.save(save_path, processed_landmarks)

        return str(save_path)


# Convenience functions for backward compatibility
def process_pose_landmarks(landmarks_data: Dict[int, Any],
                           normalize: bool = True,
                           pose_indices: Optional[List[int]] = None) -> np.ndarray:
    """
    Convenience function to process pose landmarks.

    Args:
        landmarks_data: Dictionary containing landmarks for each frame
        normalize: Whether to normalize keypoints to signing space
        pose_indices: List of pose landmark indices to extract

    Returns:
        Numpy array of processed pose keypoints
    """
    processor = PoseProcessor(pose_indices=pose_indices, normalize_keypoints=normalize)
    return processor.process_landmarks_sequence(landmarks_data)


def keypoints_to_numpy(pose_file: str, pose_emb_path: str):
    """
    Original function for backward compatibility with command-line usage.
    """
    try:
        processor = PoseProcessor()
        processed_landmarks = processor.process_landmarks_from_file(pose_file)

        if processed_landmarks.size > 0:
            # Save the processed landmarks
            video_name = Path(pose_file).stem
            save_path = Path(pose_emb_path) / f"{video_name}.npy"
            save_path.parent.mkdir(parents=True, exist_ok=True)
            np.save(save_path, processed_landmarks)

    except Exception as e:
        print(f"Error processing {pose_file}: {e}")


# Utility functions for batch processing
def get_mp4_files(directory: str) -> List[str]:
    """Get all MP4 files in a directory."""
    if not os.path.exists(directory):
        raise FileNotFoundError(f'Directory not found: {directory}')

    mp4_files = glob.glob(os.path.join(directory, '*.mp4'))
    return [os.path.abspath(file) for file in mp4_files]


def load_file(filename: str):
    """Load a pickled and gzipped file."""
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def is_string_in_file(file_path: str, target_string: str) -> bool:
    """Check if a string exists in a file."""
    try:
        with Path(file_path).open("r") as f:
            for line in f:
                if target_string in line:
                    return True
        return False
    except Exception as e:
        print(f"Error: {e}")
        return False


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the sub_list to work with')
    parser.add_argument('--files_list', type=str, required=True,
                        help='path to the pose file')
    parser.add_argument('--pose_features_path', type=str, required=True,
                        help='path to the pose features file')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='batch size')
    parser.add_argument('--time_limit', type=int, required=True,
                        help='time limit')

    args = parser.parse_args()
    start_time = time.time()

    # Load files list
    fixed_list = load_file(args.files_list)

    # Initialize processor
    processor = PoseProcessor()

    # Process files in batches
    video_batches = [fixed_list[i:i + args.batch_size] for i in range(0, len(fixed_list), args.batch_size)]

    for pose_file in video_batches[args.index]:
        pose_file_path = Path(pose_file)
        output_path = Path(args.pose_features_path) / f"{pose_file_path.stem}.npy"

        if output_path.exists():
            print(f"Skipping {pose_file} - output already exists")

[The remainder of the body_features.py listing (358 lines in total) is cut off in this view.]
|
342 |
+
continue
|
343 |
+
|
344 |
+
current_time = time.time()
|
345 |
+
if current_time - start_time > args.time_limit:
|
346 |
+
print("Time limit reached. Stopping execution.")
|
347 |
+
break
|
348 |
+
|
349 |
+
try:
|
350 |
+
print(f"Processing {pose_file}")
|
351 |
+
keypoints_to_numpy(pose_file, args.pose_features_path)
|
352 |
+
print(f"Successfully processed {pose_file}")
|
353 |
+
except Exception as e:
|
354 |
+
print(f"Error processing {pose_file}: {e}")
|
355 |
+
|
356 |
+
|
357 |
+
if __name__ == "__main__":
|
358 |
+
main()
|
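
For readers driving this module from Python rather than the CLI, a minimal usage sketch follows; the JSON filename and the frame-indexed landmark layout are placeholder assumptions, not files shipped with this Space.

# Hypothetical usage sketch for body_features.py (paths are placeholders).
import json

from body_features import PoseProcessor

processor = PoseProcessor()  # default pose indices and normalization

# assumed: a JSON file mapping frame indices (string keys) to MediaPipe landmarks
with open("example_pose.json", "r") as f:
    landmarks = {int(k): v for k, v in json.load(f).items()}

pose_features = processor.process_landmarks_sequence(landmarks)
# one row per frame, two coordinates per selected keypoint (see the wrist-index comment above)
print(pose_features.shape)
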
crop_face.py
ADDED
@@ -0,0 +1,415 @@
import cv2
import numpy as np
import os
import pickle
import gzip
from datetime import datetime
from pathlib import Path
import decord
import argparse
import json
import time
from typing import Dict, Optional, Tuple, List, Union, Any


class FaceExtractor:
    """
    A class for extracting face regions from videos based on pose and face landmarks.
    Creates face frames with only eyes and mouth visible on grey background.
    """

    def __init__(self, output_size: Tuple[int, int] = (224, 224),
                 scale_factor: float = 1.2, grey_background_color: int = 128):
        """
        Initialize the FaceExtractor.

        Args:
            output_size: Size of the output face frames (width, height)
            scale_factor: Scale factor for bounding box expansion
            grey_background_color: Color value for grey background (0-255)
        """
        self.output_size = output_size
        self.scale_factor = scale_factor
        self.grey_background_color = grey_background_color

        # Face landmark indices for eyes and mouth
        self.left_eye_indices = [69, 168, 156, 118, 54]
        self.right_eye_indices = [168, 299, 347, 336, 301]
        self.mouth_indices = [164, 212, 432, 18]

    def resize_frame(self, frame: np.ndarray, frame_size: Tuple[int, int]) -> Optional[np.ndarray]:
        """Resize frame to specified size."""
        if frame is not None and frame.size > 0:
            return cv2.resize(frame, frame_size, interpolation=cv2.INTER_AREA)
        else:
            return None

    def calculate_bounding_box(self, landmarks: List[List[float]], indices: List[int],
                               image_shape: Tuple[int, int, int]) -> Tuple[int, int, int, int]:
        """Calculate bounding box for specific landmark indices."""
        x_coordinates = [landmarks[i][0] for i in indices]
        y_coordinates = [landmarks[i][1] for i in indices]

        left = min(x_coordinates)
        right = max(x_coordinates)
        top = min(y_coordinates)
        bottom = max(y_coordinates)

        return (int(left * image_shape[1]), int(top * image_shape[0]),
                int(right * image_shape[1]), int(bottom * image_shape[0]))

    def crop_and_paste(self, src: np.ndarray, dst: np.ndarray,
                       src_box: Tuple[int, int, int, int], dst_origin: Tuple[int, int]):
        """Crop region from source and paste to destination."""
        x1, y1, x2, y2 = src_box
        dx, dy = dst_origin
        crop = src[y1:y2, x1:x2]
        crop_height, crop_width = crop.shape[:2]
        dst[dy:dy+crop_height, dx:dx+crop_width] = crop

    def cues_on_grey_background(self, image: np.ndarray, facial_landmarks: List[List[float]]) -> np.ndarray:
        """
        Create face frame with only eyes and mouth visible on grey background.

        Args:
            image: Input image as numpy array
            facial_landmarks: Face landmarks from MediaPipe

        Returns:
            Face frame with eyes and mouth on grey background
        """
        image_shape = image.shape

        # Calculate bounding boxes for facial features
        left_eye_box = self.calculate_bounding_box(facial_landmarks, self.left_eye_indices, image_shape)
        right_eye_box = self.calculate_bounding_box(facial_landmarks, self.right_eye_indices, image_shape)
        mouth_box = self.calculate_bounding_box(facial_landmarks, self.mouth_indices, image_shape)

        # Calculate the overall bounding box
        min_x = min(left_eye_box[0], right_eye_box[0], mouth_box[0])
        min_y = min(left_eye_box[1], right_eye_box[1], mouth_box[1])
        max_x = max(left_eye_box[2], right_eye_box[2], mouth_box[2])
        max_y = max(left_eye_box[3], right_eye_box[3], mouth_box[3])

        # Add padding
        padding = 10
        min_x = max(0, min_x - padding)
        min_y = max(0, min_y - padding)
        max_x = min(image.shape[1], max_x + padding)
        max_y = min(image.shape[0], max_y + padding)

        # Make the crop a square by adjusting either width or height
        width = max_x - min_x
        height = max_y - min_y
        side_length = max(width, height)

        # Adjust to ensure square
        if width < side_length:
            extra = side_length - width
            min_x = max(0, min_x - extra // 2)
            max_x = min(image.shape[1], max_x + extra // 2)

        if height < side_length:
            extra = side_length - height
            min_y = max(0, min_y - extra // 2)
            max_y = min(image.shape[0], max_y + extra // 2)

        # Create grey background image
        grey_background = np.ones((side_length, side_length, 3), dtype=np.uint8) * self.grey_background_color

        # Crop and paste facial features onto grey background
        self.crop_and_paste(image, grey_background, left_eye_box, (left_eye_box[0]-min_x, left_eye_box[1]-min_y))
        self.crop_and_paste(image, grey_background, right_eye_box, (right_eye_box[0]-min_x, right_eye_box[1]-min_y))
        self.crop_and_paste(image, grey_background, mouth_box, (mouth_box[0]-min_x, mouth_box[1]-min_y))

        return grey_background

    def select_face(self, pose_landmarks: List[List[float]], face_landmarks: List[List[List[float]]]) -> List[List[float]]:
        """
        Select the face that is closest to the pose nose landmark.

        Args:
            pose_landmarks: Pose landmarks from MediaPipe
            face_landmarks: List of face landmarks from MediaPipe

        Returns:
            Selected face landmarks
        """
        nose_landmark_from_pose = pose_landmarks[0]  # Nose from pose
        nose_landmarks_from_face = [face_landmarks[i][0] for i in range(len(face_landmarks))]

        # Find closest face based on nose landmark
        distances = [np.linalg.norm(np.array(nose_landmark_from_pose) - np.array(nose_landmark))
                     for nose_landmark in nose_landmarks_from_face]
        closest_nose_index = np.argmin(distances)

        return face_landmarks[closest_nose_index]

    def extract_face_frames(self, video_input, landmarks_data: Dict[int, Any]) -> List[np.ndarray]:
        """
        Extract face frames from video based on landmarks.

        Args:
            video_input: Either a path to video file (str) or a decord.VideoReader object
            landmarks_data: Dictionary containing pose and face landmarks for each frame

        Returns:
            List of face frames as numpy arrays
        """
        # Handle different input types
        if isinstance(video_input, str):
            video_path = Path(video_input)
            if not video_path.exists():
                raise FileNotFoundError(f"Video file not found: {video_input}")
            video = decord.VideoReader(str(video_path))
        # elif hasattr(video_input, '__len__') and hasattr(video_input, '__getitem__'):
        else:
            video = video_input
        # else:
        #     raise TypeError("video_input must be either a file path (str) or a VideoReader object")

        face_frames = []
        prev_face_frame = None
        prev_landmarks = None

        for i in range(len(video)):
            # frame = video[i].asnumpy()
            frame = video[i]
            if hasattr(video, 'seek'):
                video.seek(0)
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Get landmarks for this frame
            frame_landmarks = landmarks_data.get(i, None)

            # Handle missing landmarks
            if frame_landmarks is None:
                if prev_landmarks is not None:
                    frame_landmarks = prev_landmarks
                else:
                    # Use blank frame if no landmarks available
                    face_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))
                    continue
            else:
                prev_landmarks = frame_landmarks

            # Check if pose landmarks exist
            if frame_landmarks.get('pose_landmarks') is None:
                if prev_face_frame is not None:
                    face_frames.append(prev_face_frame)
                else:
                    face_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))
                continue

            # Process face if face landmarks exist
            if frame_landmarks.get('face_landmarks') is not None:
                # Select the face closest to the pose
                selected_face = self.select_face(
                    frame_landmarks['pose_landmarks'][0],
                    frame_landmarks['face_landmarks']
                )

                # Create face frame with cues on grey background
                face_frame = self.cues_on_grey_background(frame_rgb, selected_face)
                face_frame = self.resize_frame(face_frame, self.output_size)
                face_frames.append(face_frame)
                prev_face_frame = face_frame

            elif prev_face_frame is not None:
                face_frames.append(prev_face_frame)
            else:
                # Use blank frame if no face landmarks
                face_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))

        return face_frames

    def extract_and_save_face_video(self, video_input, landmarks_data: Dict[int, Any],
                                    output_dir: str, video_name: Optional[str] = None) -> str:
        """
        Extract face frames and save as video file.

        Args:
            video_input: Either a path to video file (str) or a decord.VideoReader object
            landmarks_data: Dictionary containing pose and face landmarks for each frame
            output_dir: Directory to save the face video
            video_name: Name for output video (auto-generated if not provided)

        Returns:
            Path to the saved face video
        """
        # Handle video input and get FPS
        if isinstance(video_input, str):
            video_path = Path(video_input)
            if not video_path.exists():
                raise FileNotFoundError(f"Video file not found: {video_input}")
            video = decord.VideoReader(str(video_path))
            if video_name is None:
                video_name = video_path.stem
        # elif hasattr(video_input, '__len__') and hasattr(video_input, '__getitem__'):
        else:
            video = video_input
            if video_name is None:
                video_name = "video"
        # else:
        #     raise TypeError("video_input must be either a file path (str) or a VideoReader object")

        fps = video.get_avg_fps() if hasattr(video, 'get_avg_fps') else 30.0

        # Create output directory
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Define output path
        face_video_path = output_path / f"{video_name}_face.mp4"

        # Remove existing file
        if face_video_path.exists():
            face_video_path.unlink()

        # Create video writer
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(str(face_video_path), fourcc, fps, self.output_size)

        try:
            # Extract face frames
            face_frames = self.extract_face_frames(video, landmarks_data)

            # Write frames to video file
            for frame in face_frames:
                writer.write(frame)

        finally:
            # Clean up
            writer.release()
            del writer

        return str(face_video_path)


# Convenience function for backward compatibility
def extract_face_frames(video_input, landmarks_data: Dict[int, Any],
                        output_size: Tuple[int, int] = (224, 224)) -> List[np.ndarray]:
    """
    Convenience function to extract face frames from video.

    Args:
        video_input: Either a path to video file (str) or a decord.VideoReader object
        landmarks_data: Dictionary containing pose and face landmarks for each frame
        output_size: Size of the output face frames (width, height)

    Returns:
        List of face frames as numpy arrays
    """
    extractor = FaceExtractor(output_size=output_size)
    return extractor.extract_face_frames(video_input, landmarks_data)


def video_holistic(video_file: str, face_path: str, problem_file_path: str, pose_path: str):
    """
    Original function for backward compatibility with command-line usage.
    """
    try:
        video = decord.VideoReader(video_file)
        fps = video.get_avg_fps()

        video_name = Path(video_file).stem
        clip_face_path = Path(face_path) / f"{video_name}_face.mp4"
        landmark_json_path = Path(pose_path) / f"{video_name}_pose.json"

        # Load landmarks
        with open(landmark_json_path, 'r') as rd:
            landmarks_data = json.load(rd)

        # Convert string keys to integers
        landmarks_data = {int(k): v for k, v in landmarks_data.items()}

        # Extract face video
        extractor = FaceExtractor()
        extractor.extract_and_save_face_video(video, landmarks_data, face_path, video_name)

    except Exception as e:
        print(f"Error processing {video_file}: {e}")
        with open(problem_file_path, "a") as p:
            p.write(video_file + "\n")


# Utility functions for batch processing
def load_file(filename: str):
    """Load a pickled and gzipped file."""
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def is_string_in_file(file_path: str, target_string: str) -> bool:
    """Check if a string exists in a file."""
    try:
        with Path(file_path).open("r") as f:
            for line in f:
                if target_string in line:
                    return True
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the sub_list to work with')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='batch size')
    parser.add_argument('--time_limit', type=int, required=True,
                        help='time limit')
    parser.add_argument('--files_list', type=str, required=True,
                        help='files list')
    parser.add_argument('--problem_file_path', type=str, required=True,
                        help='problem file path')
    parser.add_argument('--pose_path', type=str, required=True,
                        help='pose path')
    parser.add_argument('--face_path', type=str, required=True,
                        help='face path')

    args = parser.parse_args()
    start_time = time.time()

    # Load files list
    fixed_list = load_file(args.files_list)

    # Create problem file if it doesn't exist
    if not os.path.exists(args.problem_file_path):
        with open(args.problem_file_path, "w") as f:
            f.write("")

    # Process videos in batches
    video_batches = [fixed_list[i:i + args.batch_size] for i in range(0, len(fixed_list), args.batch_size)]

    for video_file in video_batches[args.index]:
        current_time = time.time()
        if current_time - start_time > args.time_limit:
            print("Time limit reached. Stopping execution.")
            break

        video_name = Path(video_file).stem
        clip_face_path = Path(args.face_path) / f"{video_name}_face.mp4"

        if clip_face_path.exists():
            print(f"Skipping {video_file} - output already exists")
            continue
        elif is_string_in_file(args.problem_file_path, video_file):
            print(f"Skipping {video_file} - found in problem file")
            continue
        else:
            try:
                print(f"Processing {video_file}")
                video_holistic(video_file, args.face_path, args.problem_file_path, args.pose_path)
                print(f"Successfully processed {video_file}")
            except Exception as e:
                print(f"Error processing {video_file}: {e}")
                with open(args.problem_file_path, "a") as p:
                    p.write(video_file + "\n")


if __name__ == "__main__":
    main()
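
A minimal usage sketch of the FaceExtractor API above, passing frames as a numpy array the way features.py does; file and directory names are placeholders, not assets of this Space.

# Hypothetical usage sketch for crop_face.py (file names are placeholders).
import json

import decord

from crop_face import FaceExtractor

vr = decord.VideoReader("example.mp4")
frames = vr.get_batch(list(range(len(vr)))).asnumpy()  # frames as a numpy array, as in features.py

with open("example_pose.json", "r") as f:
    landmarks = {int(k): v for k, v in json.load(f).items()}

extractor = FaceExtractor(output_size=(224, 224))
face_video = extractor.extract_and_save_face_video(frames, landmarks, "face_crops", "example")
print(face_video)  # face_crops/example_face.mp4
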
crop_hands.py
ADDED
@@ -0,0 +1,445 @@
import cv2
import numpy as np
import os
import pickle
import gzip
from datetime import datetime
from pathlib import Path
import decord
import argparse
import json
import time
from typing import Dict, Optional, Tuple, List, Union, Any
import tempfile


class HandExtractor:
    """
    A class for extracting hand regions from videos based on pose landmarks.
    """

    def __init__(self, output_size: Tuple[int, int] = (224, 224),
                 scale_factor: float = 1.5, distance_threshold: float = 0.1):
        """
        Initialize the HandExtractor.

        Args:
            output_size: Size of the output hand frames (width, height)
            scale_factor: Scale factor for bounding box expansion
            distance_threshold: Distance threshold for hand-pose matching
        """
        self.output_size = output_size
        self.scale_factor = scale_factor
        self.distance_threshold = distance_threshold

    def resize_frame(self, frame: np.ndarray, frame_size: Tuple[int, int]) -> Optional[np.ndarray]:
        """Resize frame to specified size."""
        if frame is not None and frame.size > 0:
            return cv2.resize(frame, frame_size, interpolation=cv2.INTER_AREA)
        else:
            return None

    def crop_frame(self, image: np.ndarray, bounding_box: Tuple[int, int, int, int]) -> np.ndarray:
        """Crop frame using bounding box."""
        x, y, w, h = bounding_box
        cropped_frame = image[y:y + h, x:x + w]
        return cropped_frame

    def get_bounding_box(self, landmarks: List[List[float]], image_shape: Tuple[int, int, int],
                         scale_factor: float = 1.2) -> Tuple[int, int, int, int]:
        """Get bounding box from landmarks."""
        ih, iw, _ = image_shape
        landmarks_px = np.array([(int(l[0] * iw), int(l[1] * ih)) for l in landmarks])
        center_x, center_y = np.mean(landmarks_px, axis=0, dtype=int)
        xb, yb, wb, hb = cv2.boundingRect(landmarks_px)
        box_size = max(wb, hb)
        half_size = box_size // 2
        x = center_x - half_size
        y = center_y - half_size
        w = box_size
        h = box_size

        w_padding = int((scale_factor - 1) * w / 2)
        h_padding = int((scale_factor - 1) * h / 2)
        x -= w_padding
        y -= h_padding
        w += 2 * w_padding
        h += 2 * h_padding

        return x, y, w, h

    def adjust_bounding_box(self, bounding_box: Tuple[int, int, int, int],
                            image_shape: Tuple[int, int, int]) -> Tuple[int, int, int, int]:
        """Adjust bounding box to fit within image boundaries."""
        x, y, w, h = bounding_box
        ih, iw, _ = image_shape

        # Adjust x-coordinate if the bounding box extends beyond the image's right edge
        if x + w > iw:
            x = iw - w

        # Adjust y-coordinate if the bounding box extends beyond the image's bottom edge
        if y + h > ih:
            y = ih - h

        # Ensure bounding box's x and y coordinates are not negative
        x = max(x, 0)
        y = max(y, 0)

        return x, y, w, h

    def select_hands(self, pose_landmarks: List[List[float]], hand_landmarks: Optional[List[List[List[float]]]],
                     image_shape: Tuple[int, int, int]) -> Tuple[Optional[List[List[float]]], Optional[List[List[float]]]]:
        """
        Select left and right hands from detected hand landmarks based on pose wrist positions.

        Args:
            pose_landmarks: Pose landmarks from MediaPipe
            hand_landmarks: Hand landmarks from MediaPipe
            image_shape: Shape of the image (height, width, channels)

        Returns:
            Tuple of (left_hand_landmarks, right_hand_landmarks)
        """
        if hand_landmarks is None:
            return None, None

        # Get wrist landmarks from pose (indices 15 and 16 for left and right wrists)
        left_wrist_from_pose = pose_landmarks[15]
        right_wrist_from_pose = pose_landmarks[16]

        # Get wrist landmarks from hand detections (index 0 is wrist in hand landmarks)
        wrist_from_hand = [hand_landmarks[i][0] for i in range(len(hand_landmarks))]

        # Match right hand
        right_hand_landmarks = None
        if right_wrist_from_pose is not None:
            minimum_distance = 100
            best_hand_idx = 0
            for i in range(len(hand_landmarks)):
                distance = np.linalg.norm(np.array(right_wrist_from_pose[0:2]) - np.array(wrist_from_hand[i][0:2]))
                if distance < minimum_distance:
                    minimum_distance = distance
                    best_hand_idx = i

            if minimum_distance < self.distance_threshold:
                right_hand_landmarks = hand_landmarks[best_hand_idx]

        # Match left hand
        left_hand_landmarks = None
        if left_wrist_from_pose is not None:
            minimum_distance = 100
            best_hand_idx = 0
            for i in range(len(hand_landmarks)):
                distance = np.linalg.norm(np.array(left_wrist_from_pose[0:2]) - np.array(wrist_from_hand[i][0:2]))
                if distance < minimum_distance:
                    minimum_distance = distance
                    best_hand_idx = i

            if minimum_distance < self.distance_threshold:
                left_hand_landmarks = hand_landmarks[best_hand_idx]

        return left_hand_landmarks, right_hand_landmarks

    def extract_hand_frames(self, video_input, landmarks_data: Dict[int, Any]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """
        Extract hand frames from video based on landmarks.

        Args:
            video_input: Either a path to video file (str) or a decord.VideoReader object
            landmarks_data: Dictionary containing pose and hand landmarks for each frame

        Returns:
            Tuple of (left_hand_frames, right_hand_frames) as lists of numpy arrays
        """
        # Handle different input types
        if isinstance(video_input, str):
            video_path = Path(video_input)
            if not video_path.exists():
                raise FileNotFoundError(f"Video file not found: {video_input}")
            video = decord.VideoReader(str(video_path))
        # elif hasattr(video_input, '__len__') and hasattr(video_input, '__getitem__'):
        else:
            video = video_input
        # else:
        #     raise TypeError("video_input must be either a file path (str) or a VideoReader object")

        left_hand_frames = []
        right_hand_frames = []

        prev_left_frame = None
        prev_right_frame = None
        prev_landmarks = None

        for i in range(len(video)):
            # frame = video[i].asnumpy()
            frame = video[i]
            if hasattr(video, 'seek'):
                video.seek(0)
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Get landmarks for this frame
            frame_landmarks = landmarks_data.get(i, None)

            # Handle missing landmarks
            if frame_landmarks is None:
                if prev_landmarks is not None:
                    frame_landmarks = prev_landmarks
                else:
                    # Use blank frames if no landmarks available
                    left_hand_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))
                    right_hand_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))
                    continue
            else:
                prev_landmarks = frame_landmarks

            # Check if pose landmarks exist
            if frame_landmarks.get('pose_landmarks') is None:
                # Use previous frames or blank frames
                if prev_left_frame is not None:
                    left_hand_frames.append(prev_left_frame)
                else:
                    left_hand_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))

                if prev_right_frame is not None:
                    right_hand_frames.append(prev_right_frame)
                else:
                    right_hand_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))
                continue

            # Select hands based on pose landmarks
            left_hand_landmarks, right_hand_landmarks = self.select_hands(
                frame_landmarks['pose_landmarks'][0],
                frame_landmarks.get('hand_landmarks'),
                frame_rgb.shape
            )

            # Process left hand
            if left_hand_landmarks is not None:
                left_box = self.get_bounding_box(left_hand_landmarks, frame_rgb.shape, self.scale_factor)
                left_box = self.adjust_bounding_box(left_box, frame_rgb.shape)
                left_frame = self.crop_frame(frame_rgb, left_box)
                left_frame = self.resize_frame(left_frame, self.output_size)
                left_hand_frames.append(left_frame)
                prev_left_frame = left_frame
            elif prev_left_frame is not None:
                left_hand_frames.append(prev_left_frame)
            else:
                left_hand_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))

            # Process right hand
            if right_hand_landmarks is not None:
                right_box = self.get_bounding_box(right_hand_landmarks, frame_rgb.shape, self.scale_factor)
                right_box = self.adjust_bounding_box(right_box, frame_rgb.shape)
                right_frame = self.crop_frame(frame_rgb, right_box)
                right_frame = self.resize_frame(right_frame, self.output_size)
                right_hand_frames.append(right_frame)
                prev_right_frame = right_frame
            elif prev_right_frame is not None:
                right_hand_frames.append(prev_right_frame)
            else:
                right_hand_frames.append(np.zeros((*self.output_size, 3), dtype=np.uint8))

        return left_hand_frames, right_hand_frames

    def extract_and_save_hand_videos(self, video_input, landmarks_data: Dict[int, Any],
                                     output_dir: str, video_name: Optional[str] = None) -> Tuple[str, str]:
        """
        Extract hand frames and save as video files.

        Args:
            video_input: Either a path to video file (str) or a decord.VideoReader object
            landmarks_data: Dictionary containing pose and hand landmarks for each frame
            output_dir: Directory to save the hand videos
            video_name: Name for output videos (auto-generated if not provided)

        Returns:
            Tuple of (left_hand_video_path, right_hand_video_path)
        """
        # Handle video input and get FPS
        if isinstance(video_input, str):
            video_path = Path(video_input)
            if not video_path.exists():
                raise FileNotFoundError(f"Video file not found: {video_input}")
            video = decord.VideoReader(str(video_path))
            if video_name is None:
                video_name = video_path.stem
        # elif hasattr(video_input, '__len__') and hasattr(video_input, '__getitem__'):
        else:
            video = video_input
            if video_name is None:
                video_name = "video"
        # else:
        #     raise TypeError("video_input must be either a file path (str) or a VideoReader object")

        fps = video.get_avg_fps() if hasattr(video, 'get_avg_fps') else 30.0

        # Create output directory
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Define output paths
        left_hand_path = output_path / f"{video_name}_hand1.mp4"
        right_hand_path = output_path / f"{video_name}_hand2.mp4"

        # Remove existing files
        if left_hand_path.exists():
            left_hand_path.unlink()
        if right_hand_path.exists():
            right_hand_path.unlink()

        # Create video writers
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        left_writer = cv2.VideoWriter(str(left_hand_path), fourcc, fps, self.output_size)
        right_writer = cv2.VideoWriter(str(right_hand_path), fourcc, fps, self.output_size)

        try:
            # Extract hand frames
            left_frames, right_frames = self.extract_hand_frames(video, landmarks_data)

            # Write frames to video files
            for left_frame, right_frame in zip(left_frames, right_frames):
                left_writer.write(left_frame)
                right_writer.write(right_frame)

        finally:
            # Clean up
            left_writer.release()
            right_writer.release()
            del left_writer
            del right_writer

        return str(left_hand_path), str(right_hand_path)


# Convenience function for backward compatibility
def extract_hand_frames(video_input, landmarks_data: Dict[int, Any],
                        output_size: Tuple[int, int] = (224, 224)) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """
    Convenience function to extract hand frames from video.

    Args:
        video_input: Either a path to video file (str) or a decord.VideoReader object
        landmarks_data: Dictionary containing pose and hand landmarks for each frame
        output_size: Size of the output hand frames (width, height)

    Returns:
        Tuple of (left_hand_frames, right_hand_frames) as lists of numpy arrays
    """
    extractor = HandExtractor(output_size=output_size)
    return extractor.extract_hand_frames(video_input, landmarks_data)


def video_holistic(video_file: str, hand_path: str, problem_file_path: str, pose_path: str):
    """
    Original function for backward compatibility with command-line usage.
    """
    try:
        video = decord.VideoReader(video_file)
        fps = video.get_avg_fps()

        video_name = Path(video_file).stem
        clip_hand1_path = Path(hand_path) / f"{video_name}_hand1.mp4"
        clip_hand2_path = Path(hand_path) / f"{video_name}_hand2.mp4"
        landmark_json_path = Path(pose_path) / f"{video_name}_pose.json"

        # Load landmarks
        with open(landmark_json_path, 'r') as rd:
            landmarks_data = json.load(rd)

        # Convert string keys to integers
        landmarks_data = {int(k): v for k, v in landmarks_data.items()}

        # Extract hand videos
        extractor = HandExtractor()
        extractor.extract_and_save_hand_videos(video, landmarks_data, hand_path, video_name)

    except Exception as e:
        print(f"Error processing {video_file}: {e}")
        with open(problem_file_path, "a") as p:
            p.write(video_file + "\n")


# Utility functions for batch processing
def load_file(filename: str):
    """Load a pickled and gzipped file."""
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def is_string_in_file(file_path: str, target_string: str) -> bool:
    """Check if a string exists in a file."""
    try:
        with Path(file_path).open("r") as f:
            for line in f:
                if target_string in line:
                    return True
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the sub_list to work with')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='batch size')
    parser.add_argument('--time_limit', type=int, required=True,
                        help='time limit')
    parser.add_argument('--files_list', type=str, required=True,
                        help='files list')
    parser.add_argument('--problem_file_path', type=str, required=True,
                        help='problem file path')
    parser.add_argument('--pose_path', type=str, required=True,
                        help='pose path')
    parser.add_argument('--hand_path', type=str, required=True,
                        help='hand path')

    args = parser.parse_args()
    start_time = time.time()

    # Create directories if they do not exist
    Path(args.hand_path).mkdir(parents=True, exist_ok=True)

    # Load files list
    fixed_list = load_file(args.files_list)

    # Create problem file if it doesn't exist
    if not os.path.exists(args.problem_file_path):
        with open(args.problem_file_path, "w") as f:
            f.write("")

    # Process videos in batches
    video_batches = [fixed_list[i:i + args.batch_size] for i in range(0, len(fixed_list), args.batch_size)]

    for video_file in video_batches[args.index]:
        current_time = time.time()
        if current_time - start_time > args.time_limit:
            print("Time limit reached. Stopping execution.")
            break

        video_name = Path(video_file).stem
        clip_hand2_path = Path(args.hand_path) / f"{video_name}_hand2.mp4"

        if clip_hand2_path.exists():
            print(f"Skipping {video_file} - output already exists")
            continue
        elif is_string_in_file(args.problem_file_path, video_file):
            print(f"Skipping {video_file} - found in problem file")
            continue
        else:
            try:
                print(f"Processing {video_file}")
                video_holistic(video_file, args.hand_path, args.problem_file_path, args.pose_path)
                print(f"Successfully processed {video_file}")
            except Exception as e:
                print(f"Error processing {video_file}: {e}")
                with open(args.problem_file_path, "a") as p:
                    p.write(video_file + "\n")


if __name__ == "__main__":
    main()
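
The same pattern applies to the HandExtractor API above; a minimal sketch follows, again with placeholder file names and frames passed as a numpy array as in features.py.

# Hypothetical usage sketch for crop_hands.py (file names are placeholders).
import json

import decord

from crop_hands import HandExtractor

vr = decord.VideoReader("example.mp4")
frames = vr.get_batch(list(range(len(vr)))).asnumpy()

with open("example_pose.json", "r") as f:
    landmarks = {int(k): v for k, v in json.load(f).items()}

extractor = HandExtractor(output_size=(224, 224), scale_factor=1.5)
left_path, right_path = extractor.extract_and_save_hand_videos(frames, landmarks, "hand_crops", "example")
print(left_path, right_path)  # hand_crops/example_hand1.mp4 hand_crops/example_hand2.mp4
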
dinov2_features.py
ADDED
@@ -0,0 +1,351 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from torchvision import transforms
|
4 |
+
from PIL import Image
|
5 |
+
import decord
|
6 |
+
from decord import VideoReader
|
7 |
+
from decord import cpu, gpu
|
8 |
+
import numpy as np
|
9 |
+
import os
|
10 |
+
import pickle
|
11 |
+
import gzip
|
12 |
+
from pathlib import Path
|
13 |
+
import argparse
|
14 |
+
import json
|
15 |
+
import csv
|
16 |
+
import glob
|
17 |
+
import time
|
18 |
+
from typing import List, Union, Optional, Tuple
|
19 |
+
|
20 |
+
|
21 |
+
class DINOEmbedder:
|
22 |
+
"""
|
23 |
+
A class for extracting DINOv2 embeddings from video frames or images.
|
24 |
+
"""
|
25 |
+
|
26 |
+
def __init__(self, dino_model_path: str, batch_size: int = 128, device: Optional[str] = None):
|
27 |
+
"""
|
28 |
+
Initialize the DINOEmbedder.
|
29 |
+
|
30 |
+
Args:
|
31 |
+
dino_model_path: Path to the fine-tuned DINOv2 model
|
32 |
+
batch_size: Batch size for processing frames
|
33 |
+
device: Device to use ('cuda' or 'cpu'). Auto-detected if None
|
34 |
+
"""
|
35 |
+
self.dino_model_path = dino_model_path
|
36 |
+
self.batch_size = batch_size
|
37 |
+
self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
38 |
+
|
39 |
+
# Initialize model
|
40 |
+
self.model = self._load_dino_model()
|
41 |
+
self.model.eval()
|
42 |
+
|
43 |
+
# Initialize transform
|
44 |
+
self.transform = transforms.Compose([
|
45 |
+
transforms.Resize((224, 224)),
|
46 |
+
transforms.ToTensor(),
|
47 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
|
48 |
+
])
|
49 |
+
|
50 |
+
print(f"DINOEmbedder initialized on device: {self.device}")
|
51 |
+
|
52 |
+
def _load_dino_model(self) -> nn.Module:
|
53 |
+
"""Load the fine-tuned DINOv2 model."""
|
54 |
+
# Load the original DINOv2 model with the correct architecture
|
55 |
+
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14_reg', pretrained=False)
|
56 |
+
|
57 |
+
# Load fine-tuned weights
|
58 |
+
pretrained = torch.load(self.dino_model_path, map_location=self.device)
|
59 |
+
|
60 |
+
# Make correct state dict for loading
|
61 |
+
new_state_dict = {}
|
62 |
+
for key, value in pretrained['teacher'].items():
|
63 |
+
if 'dino_head' in key:
|
64 |
+
continue # Skip dino_head layers
|
65 |
+
else:
|
66 |
+
new_key = key.replace('backbone.', '')
|
67 |
+
new_state_dict[new_key] = value
|
68 |
+
|
69 |
+
# Change shape of pos_embed
|
70 |
+
pos_embed = nn.Parameter(torch.zeros(1, 257, 384))
|
71 |
+
model.pos_embed = pos_embed
|
72 |
+
|
73 |
+
# Load state dict
|
74 |
+
model.load_state_dict(new_state_dict, strict=True)
|
75 |
+
|
76 |
+
# Move model to device
|
77 |
+
model.to(self.device)
|
78 |
+
return model
|
79 |
+
|
80 |
+
def _preprocess_frame(self, frame: np.ndarray) -> torch.Tensor:
|
81 |
+
"""Preprocess a single frame."""
|
82 |
+
if isinstance(frame, np.ndarray):
|
83 |
+
image = Image.fromarray(frame)
|
84 |
+
else:
|
85 |
+
image = frame
|
86 |
+
|
87 |
+
tensor = self.transform(image)
|
88 |
+
# Ensure only RGB channels are considered
|
89 |
+
return tensor[:3]
|
90 |
+
|
91 |
+
def _preprocess_frames_batch(self, frames: List[np.ndarray]) -> torch.Tensor:
|
92 |
+
"""Preprocess a batch of frames."""
|
93 |
+
batch_tensors = torch.stack([self._preprocess_frame(frame) for frame in frames])
|
94 |
+
return batch_tensors.to(self.device)
|
95 |
+
|
96 |
+
def extract_embeddings_from_frames(self, frames: List[np.ndarray]) -> np.ndarray:
|
97 |
+
"""
|
98 |
+
Extract DINOv2 embeddings from a list of frames.
|
99 |
+
|
100 |
+
Args:
|
101 |
+
frames: List of frames as numpy arrays
|
102 |
+
|
103 |
+
Returns:
|
104 |
+
Numpy array of embeddings with shape (num_frames, embedding_dim)
|
105 |
+
"""
|
106 |
+
all_embeddings = []
|
107 |
+
|
108 |
+
# Process frames in batches
|
109 |
+
for idx in range(0, len(frames), self.batch_size):
|
110 |
+
batch_frames = frames[idx:idx + self.batch_size]
|
111 |
+
|
112 |
+
# Preprocess batch
|
113 |
+
batch_tensors = self._preprocess_frames_batch(batch_frames)
|
114 |
+
|
115 |
+
# Extract embeddings
|
116 |
+
with torch.no_grad():
|
117 |
+
batch_embeddings = self.model(batch_tensors).cpu().numpy()
|
118 |
+
|
119 |
+
all_embeddings.append(batch_embeddings)
|
120 |
+
|
121 |
+
# Concatenate all embeddings
|
122 |
+
embeddings = np.concatenate(all_embeddings, axis=0)
|
123 |
+
return embeddings
|
124 |
+
|
125 |
+
def extract_embeddings_from_video(self, video_input: Union[str, VideoReader],
|
126 |
+
target_size: Tuple[int, int] = (224, 224)) -> np.ndarray:
|
127 |
+
"""
|
128 |
+
Extract DINOv2 embeddings from a video.
|
129 |
+
|
130 |
+
Args:
|
131 |
+
video_input: Either a path to video file (str) or a VideoReader object
|
132 |
+
target_size: Target size for video frames (width, height)
|
133 |
+
|
134 |
+
Returns:
|
135 |
+
Numpy array of embeddings with shape (num_frames, embedding_dim)
|
136 |
+
"""
|
137 |
+
# Handle different input types
|
138 |
+
if isinstance(video_input, str):
|
139 |
+
video_path = Path(video_input)
|
140 |
+
if not video_path.exists():
|
141 |
+
raise FileNotFoundError(f"Video file not found: {video_input}")
|
142 |
+
try:
|
143 |
+
vr = VideoReader(str(video_path), width=target_size[0], height=target_size[1])
|
144 |
+
except Exception as e:
|
145 |
+
raise RuntimeError(f"Error loading video {video_input}: {e}")
|
146 |
+
# elif hasattr(video_input, 'get_batch'):
|
147 |
+
else:
|
148 |
+
vr = video_input
|
149 |
+
# else:
|
150 |
+
# raise TypeError("video_input must be either a file path (str) or a VideoReader object")
|
151 |
+
|
152 |
+
total_frames = len(vr)
|
153 |
+
all_embeddings = []
|
154 |
+
|
155 |
+
# Process video in batches
|
156 |
+
for idx in range(0, total_frames, self.batch_size):
|
157 |
+
batch_indices = range(idx, min(idx + self.batch_size, total_frames))
|
158 |
+
# batch_frames = vr.get_batch(batch_indices).asnumpy()
|
159 |
+
batch_frames = vr[batch_indices]
|
160 |
+
|
161 |
+
# Preprocess batch
|
162 |
+
batch_tensors = self._preprocess_frames_batch(batch_frames)
|
163 |
+
|
164 |
+
# Extract embeddings
|
165 |
+
with torch.no_grad():
|
166 |
+
batch_embeddings = self.model(batch_tensors).cpu().numpy()
|
167 |
+
|
168 |
+
all_embeddings.append(batch_embeddings)
|
169 |
+
|
170 |
+
# Concatenate all embeddings
|
171 |
+
embeddings = np.concatenate(all_embeddings, axis=0)
|
172 |
+
return embeddings
|
173 |
+
|
174 |
+
def extract_embeddings_from_video_and_save(self, video_path: str, output_folder: str) -> str:
|
175 |
+
"""
|
176 |
+
Extract embeddings from video and save to file.
|
177 |
+
|
178 |
+
Args:
|
179 |
+
video_path: Path to the video file
|
180 |
+
output_folder: Folder to save the embeddings
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
Path to the saved embeddings file
|
184 |
+
"""
|
185 |
+
# Create output folder if it doesn't exist
|
186 |
+
Path(output_folder).mkdir(parents=True, exist_ok=True)
|
187 |
+
|
188 |
+
# Extract embeddings
|
189 |
+
embeddings = self.extract_embeddings_from_video(video_path)
|
190 |
+
|
191 |
+
# Save embeddings
|
192 |
+
video_name = Path(video_path).stem
|
193 |
+
np_path = Path(output_folder) / f"{video_name}.npy"
|
194 |
+
np.save(np_path, embeddings)
|
195 |
+
|
196 |
+
return str(np_path)
|
197 |
+
|
198 |
+
def extract_embedding_from_single_image(self, image: Union[np.ndarray, Image.Image]) -> np.ndarray:
|
199 |
+
"""
|
200 |
+
Extract DINOv2 embedding from a single image.
|
201 |
+
|
202 |
+
Args:
|
203 |
+
image: Image as numpy array or PIL Image
|
204 |
+
|
205 |
+
Returns:
|
206 |
+
Numpy array of embedding with shape (1, embedding_dim)
|
207 |
+
"""
|
208 |
+
# Preprocess image
|
209 |
+
if isinstance(image, np.ndarray):
|
210 |
+
image = Image.fromarray(image)
|
211 |
+
|
212 |
+
tensor = self.transform(image).unsqueeze(0).to(self.device)
|
213 |
+
|
214 |
+
# Extract embedding
|
215 |
+
with torch.no_grad():
|
216 |
+
embedding = self.model(tensor).cpu().numpy()
|
217 |
+
|
218 |
+
return embedding
|
219 |
+
|
220 |
+
|
221 |
+
# Convenience functions for backward compatibility
|
222 |
+
def extract_embeddings_from_frames(frames: List[np.ndarray], dino_model_path: str,
|
223 |
+
batch_size: int = 128) -> np.ndarray:
|
224 |
+
"""
|
225 |
+
Convenience function to extract embeddings from frames.
|
226 |
+
|
227 |
+
Args:
|
228 |
+
frames: List of frames as numpy arrays
|
229 |
+
dino_model_path: Path to the fine-tuned DINOv2 model
|
230 |
+
batch_size: Batch size for processing
|
231 |
+
|
232 |
+
Returns:
|
233 |
+
Numpy array of embeddings
|
234 |
+
"""
|
235 |
+
embedder = DINOEmbedder(dino_model_path, batch_size)
|
236 |
+
return embedder.extract_embeddings_from_frames(frames)
|
237 |
+
|
238 |
+
|
239 |
+
def extract_embeddings_from_video(video_path: str, dino_model_path: str,
|
240 |
+
batch_size: int = 128) -> np.ndarray:
|
241 |
+
"""
|
242 |
+
Convenience function to extract embeddings from video.
|
243 |
+
|
244 |
+
Args:
|
245 |
+
video_path: Path to the video file
|
246 |
+
        dino_model_path: Path to the fine-tuned DINOv2 model
        batch_size: Batch size for processing

    Returns:
        Numpy array of embeddings
    """
    embedder = DINOEmbedder(dino_model_path, batch_size)
    return embedder.extract_embeddings_from_video(video_path)


def video_to_embeddings(video_path: str, output_folder: str, dino_path: str, batch_size: int = 128):
    """
    Original function for backward compatibility with command-line usage.
    """
    try:
        embedder = DINOEmbedder(dino_path, batch_size)
        embedder.extract_embeddings_from_video_and_save(video_path, output_folder)
    except Exception as e:
        print(f'Error processing {video_path}: {e}')


# Utility functions for batch processing
def get_mp4_files(directory: str) -> List[str]:
    """Get all MP4 files in a directory."""
    if not os.path.exists(directory):
        raise FileNotFoundError(f'Directory not found: {directory}')

    mp4_files = glob.glob(os.path.join(directory, '*.mp4'))
    return [os.path.abspath(file) for file in mp4_files]


def load_file(filename: str):
    """Load a pickled and gzipped file."""
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def is_string_in_file(file_path: str, target_string: str) -> bool:
    """Check if a string exists in a file."""
    try:
        with Path(file_path).open("r") as f:
            for line in f:
                if target_string in line:
                    return True
        return False
    except Exception as e:
        print(f"Error: {e}")
        return False


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the sub_list to work with')
    parser.add_argument('--time_limit', type=int, required=True,
                        help='time limit in seconds')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='number of videos to process in this batch')
    parser.add_argument('--files_list', type=str, required=True,
                        help='path to the files list file')
    parser.add_argument('--output_folder', type=str, required=True,
                        help='path to the output folder')
    parser.add_argument('--dino_path', type=str, required=True,
                        help='path to the dino model')

    args = parser.parse_args()
    start_time = time.time()

    # Load files list
    fixed_list = load_file(args.files_list)

    # Create output folder if it doesn't exist
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    # Initialize embedder
    embedder = DINOEmbedder(args.dino_path, batch_size=512)

    # Process videos in batches
    video_batches = [fixed_list[i:i + args.batch_size] for i in range(0, len(fixed_list), args.batch_size)]
    print(f"Total number of video batches: {len(video_batches)}")

    for video_path in video_batches[args.index]:
        current_time = time.time()
        if current_time - start_time > args.time_limit:
            print("Time limit reached. Stopping execution.")
            break

        video_name = Path(video_path).stem
        np_path = Path(args.output_folder) / f"{video_name}.npy"

        if np_path.exists():
            print(f"Skipping {video_path} - output already exists")
            continue
        else:
            try:
                print(f"Processing {video_path}")
                embedder.extract_embeddings_from_video_and_save(video_path, args.output_folder)
                print(f"Successfully processed {video_path}")
            except Exception as e:
                print(f"Error processing {video_path}: {e}")


if __name__ == "__main__":
    main()
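For orientation, a minimal usage sketch of the `video_to_embeddings` wrapper defined above. The clip and checkpoint paths are placeholders, and `DINOEmbedder` is assumed to be the class defined earlier in this file; the `.npy` naming follows the check performed in `main()`.

# Hypothetical paths - substitute a real clip and a fine-tuned DINOv2 checkpoint.
from dinov2_features import video_to_embeddings

video_to_embeddings(
    video_path="recordings/example_clip.mp4",
    output_folder="embeddings_out",
    dino_path="models/dinov2hand.pth",
    batch_size=128,
)
# Per the skip logic in main(), the output is expected at embeddings_out/example_clip.npy.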
features.py
ADDED
@@ -0,0 +1,115 @@
import os
import torch
import numpy as np
import decord
import torch.nn as nn
import json
import cv2
from kpe_mediapipe import video_holistic
from crop_hands import HandExtractor
from crop_face import FaceExtractor
from dinov2_features import extract_embeddings_from_frames
from body_features import process_pose_landmarks
# from shubert import SignHubertModel, SignHubertConfig
from inference import test
import subprocess


class SHuBERTProcessor:

    def __init__(self, config):
        self.config = config

    def process_video(self, video_path):

        # output_file = f"{output_path}/{os.path.basename(video_file)}"

        # # Target FPS is 12.5
        # cmd = [
        #     'ffmpeg',
        #     '-i', video_path,
        #     '-filter:v', 'fps=15',
        #     '-c:v', 'libx264',
        #     '-preset', 'medium',  # Balance between speed and quality
        #     '-crf', '23',  # Quality level (lower is better)
        #     '-y',  # Overwrite output file if it exists
        #     video_path
        # ]

        # try:
        #     subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #     print(f"Saved to {video_path} at 15 fps")
        # except subprocess.CalledProcessError as e:
        #     print(f"Error reading video {video_path}: {e}")

        # Step 1: Subsample frames to approximately the target fps
        signer_video = decord.VideoReader(video_path)

        signer_video_fps = signer_video.get_avg_fps()
        target_fps = 12
        stride = max(1, int(round(signer_video_fps / target_fps)))
        index_list = list(range(0, len(signer_video), stride))
        signer_video = signer_video.get_batch(index_list)
        signer_video = signer_video.asnumpy()

        # Step 2: Extract pose using kpe_mediapipe
        landmarks = video_holistic(
            video_input=signer_video,
            face_model_path=self.config['mediapipe_face_model_path'],
            hand_model_path=self.config['mediapipe_hands_model_path'],
        )

        # Step 3: Extract stream features
        hand_extractor = HandExtractor()
        left_hand_frames, right_hand_frames = hand_extractor.extract_hand_frames(signer_video, landmarks)
        left_hand_embeddings = extract_embeddings_from_frames(left_hand_frames, self.config['dino_hands_model_path'])
        right_hand_embeddings = extract_embeddings_from_frames(right_hand_frames, self.config['dino_hands_model_path'])
        del left_hand_frames, right_hand_frames

        face_extractor = FaceExtractor()
        face_frames = face_extractor.extract_face_frames(signer_video, landmarks)
        face_embeddings = extract_embeddings_from_frames(face_frames, self.config['dino_face_model_path'])
        del face_frames, signer_video

        pose_embeddings = process_pose_landmarks(landmarks)
        del landmarks

        output_text = test(face_embeddings,
                           left_hand_embeddings,
                           right_hand_embeddings,
                           pose_embeddings,
                           self.config['slt_model_config'],
                           self.config['slt_model_checkpoint'],
                           self.config['slt_tokenizer_checkpoint'],
                           self.config['temp_dir'])

        return output_text

if __name__ == "__main__":
    config = {
        'yolov8_model_path': '/share/data/pals/shester/inference/models/yolov8n.pt',
        'dino_face_model_path': '/share/data/pals/shester/inference/models/dinov2face.pth',
        'dino_hands_model_path': '/share/data/pals/shester/inference/models/dinov2hand.pth',
        'mediapipe_face_model_path': '/share/data/pals/shester/inference/models/face_landmarker_v2_with_blendshapes.task',
        'mediapipe_hands_model_path': '/share/data/pals/shester/inference/models/hand_landmarker.task',
        'shubert_model_path': '/share/data/pals/shester/inference/models/checkpoint_836_400000.pt',
        'temp_dir': '/share/data/pals/shester/inference',
        'slt_model_config': '/share/data/pals/shester/inference/models/byt5_base/config.json',
        'slt_model_checkpoint': '/share/data/pals/shester/inference/models/checkpoint-11625',
        'slt_tokenizer_checkpoint': '/share/data/pals/shester/inference/models/byt5_base',
    }

    # input_clip = "/share/data/pals/shester/datasets/openasl/clips_bbox/J-0KHhPS_m4.029676-029733.mp4"
    # input_clip = "/share/data/pals/shester/inference/recordings/sabrin30fps.mp4"
    input_clip = "/share/data/pals/shester/inference/recordings/sabrina30fps.mp4"
    processor = SHuBERTProcessor(config)
    output_text = processor.process_video(input_clip)
    print(f"The English translation is: {output_text}")

# /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/attention.py
# /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/block.py
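As a concrete check on the subsampling in Step 1 of `process_video`, a small sketch with plain arithmetic (no decord; the 30 fps value and frame count are illustrative). It shows how the stride is derived and why a 30 fps input ends up at roughly 15 fps even though `target_fps` is 12:

source_fps = 30.0                                      # e.g. sabrina30fps.mp4
target_fps = 12
stride = max(1, int(round(source_fps / target_fps)))   # round(2.5) -> 2, so stride = 2
index_list = list(range(0, 150, stride))               # 150 source frames -> 75 kept frames
effective_fps = source_fps / stride                    # 15.0
print(stride, len(index_list), effective_fps)

Because the stride is rounded to an integer, the effective frame rate only approximates `target_fps`.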
inference.py
ADDED
@@ -0,0 +1,738 @@
import os
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import warnings

from transformers import (
    ByT5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

from transformers.models.t5 import T5Config
from transformers.models.t5.modeling_t5 import *

from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from torch.nn import CrossEntropyLoss

from collections.abc import Mapping
from dataclasses import dataclass
from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from transformers.utils import PaddingStrategy

from shubert import SignHubertModel, SignHubertConfig

class SignHubertAdapter(nn.Module):
    def __init__(self, channels):
        super().__init__()
        # Adjust intermediate_dim based on number of channels
        intermediate_dim_shubert = 1024

        self.signhubert = SignHubertModel(SignHubertConfig(
            channels=channels,
            intermediate_dim=intermediate_dim_shubert
        ))

    def forward(self, x):
        features = self.signhubert.extract_features(x, padding_mask=None, kmeans_labels=None, mask=False)

        # Extract layer outputs
        layer_outputs = []
        for layer in features['layer_results']:
            layer_output = layer[-1]  # Shape: [B, T, D]
            layer_outputs.append(layer_output)

        # Stack the outputs from all layers
        stacked_features = torch.stack(layer_outputs, dim=1)  # Shape: [B, L, T, D]
        return stacked_features

class LinearAdapter(nn.Module):
    def __init__(self, face_dim, hand_dim, pose_dim, representations_dim, out_dim, extraction_layer, channels):
        super().__init__()

        self.signhubert_adapter = SignHubertAdapter(channels)
        self.layer_weights = nn.Parameter(torch.ones(12))  # Learnable weights for each layer
        self.final_layer = nn.Linear(representations_dim, out_dim)
        self.extraction_layer = extraction_layer

    def forward(self, face_features, left_hand_features, right_hand_features, body_posture_features):
        dtype = torch.float32
        face_features = face_features.to(dtype=dtype)
        left_hand_features = left_hand_features.to(dtype=dtype)
        right_hand_features = right_hand_features.to(dtype=dtype)
        body_posture_features = body_posture_features.to(dtype=dtype)

        batch_size, seq_len = face_features.shape[:2]
        dummy_labels = torch.zeros((seq_len, 1), dtype=dtype, device=face_features.device)

        source = []
        for i in range(batch_size):
            source.append({
                "face": face_features[i],
                "left_hand": left_hand_features[i],
                "right_hand": right_hand_features[i],
                "body_posture": body_posture_features[i],
                "label_face": dummy_labels,
                "label_left_hand": dummy_labels,
                "label_right_hand": dummy_labels,
                "label_body_posture": dummy_labels
            })

        # Get representations from SignHubert
        representations_features = self.signhubert_adapter(source)  # [T, L, B, D]
        representations_features = representations_features.permute(2, 1, 0, 3)  # [B, L, T, D]

        if self.extraction_layer == 0:
            normalized_weights = self.layer_weights
            weighted_representations = representations_features * normalized_weights.view(1, -1, 1, 1)
            representations_for_downstream_task = torch.sum(weighted_representations, dim=1)
        else:
            representations_for_downstream_task = representations_features[:, self.extraction_layer-1, :, :]

        final_output = self.final_layer(representations_for_downstream_task)

        return final_output

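As a side note on the `extraction_layer == 0` branch above: the learnable `layer_weights` simply form a weighted sum over the 12 stacked SHuBERT layer outputs. A minimal self-contained sketch of that operation on random tensors (the shapes are illustrative, not taken from the checkpoint):

import torch

B, L, T, D = 2, 12, 50, 768                 # batch, layers, time steps, hidden dim (illustrative)
layer_outputs = torch.randn(B, L, T, D)
layer_weights = torch.nn.Parameter(torch.ones(L))

# Broadcast one scalar weight per layer, then sum over the layer axis.
weighted = layer_outputs * layer_weights.view(1, -1, 1, 1)   # [B, L, T, D]
pooled = weighted.sum(dim=1)                                  # [B, T, D]
print(pooled.shape)                                           # torch.Size([2, 50, 768])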
class SignLanguageByT5Config(T5Config):
    def __init__(
        self,
        representations_dim=768,
        adapter="linear",
        finetune_signhubert=False,
        face_dim=384,
        hand_dim=384,
        pose_dim=14,
        extraction_layer=0,  # 0 = learnable weighted sum over all layers (see LinearAdapter)
        channels="face,left_hand,right_hand,body_posture",
        **kwargs
    ):
        self.representations_dim = representations_dim
        self.adapter = adapter
        self.finetune_signhubert = finetune_signhubert
        self.face_dim = face_dim
        self.hand_dim = hand_dim
        self.pose_dim = pose_dim
        self.extraction_layer = extraction_layer
        self.channels = channels
        super().__init__(**kwargs)

class SignLanguageByT5Encoder(T5PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Initialize the adapter based on the configuration
        if config.adapter == "linear":
            self.adapter = LinearAdapter(
                config.face_dim,
                config.hand_dim,
                config.pose_dim,
                config.representations_dim,
                config.d_model,
                config.extraction_layer,
                config.channels
            )
        else:
            raise NotImplementedError("Adapter type not implemented.")

        self.is_decoder = config.is_decoder

        # Define the encoder blocks
        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel settings
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
            " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
            " 'block.1': 1, ...}",
            FutureWarning,
        )
        # Check validity of device_map
        self.device_map = (
            get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
        )
        assert_device_map(self.device_map, len(self.block))
        self.model_parallel = True
        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
        self.last_device = "cuda:" + str(max(self.device_map.keys()))
        # Load onto devices
        for k, v in self.device_map.items():
            for layer in v:
                cuda_device = "cuda:" + str(k)
                self.block[layer] = self.block[layer].to(cuda_device)

        # Set embed_tokens to first layer
        self.embed_tokens = self.embed_tokens.to(self.first_device)
        # Set final layer norm to last device
        self.final_layer_norm = self.final_layer_norm.to(self.last_device)

    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.model_parallel = False
        self.device_map = None
        self.first_device = "cpu"
        self.last_device = "cpu"
        for i in range(len(self.block)):
            self.block[i] = self.block[i].to("cpu")
        self.embed_tokens = self.embed_tokens.to("cpu")
        self.final_layer_norm = self.final_layer_norm.to("cpu")
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        face_features=None,
        left_hand_features=None,
        right_hand_features=None,
        pose_features=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # Set default values if not provided
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Use the adapter to convert representation features into embeddings
        inputs_embeds = self.adapter(face_features, left_hand_features, right_hand_features, pose_features)

        input_shape = inputs_embeds.shape[:2]
        batch_size, seq_length = input_shape

        mask_seq_length = seq_length

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        # Initialize past_key_values if not provided
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        # Extend attention mask
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = self.dropout(inputs_embeds)

        # Iterate over each encoder block
        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
            layer_head_mask = head_mask[i]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=None,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                encoder_decoder_position_bias=None,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_head_mask,
                past_key_value=past_key_value,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                present_key_value_states = present_key_value_states + (layer_outputs[1],)

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=None,
        )

class SignLanguageByT5ForConditionalGeneration(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        # Initialize the decoder embedding
        self.decoder_emb = nn.Embedding(config.vocab_size, config.d_model)

        # Initialize the encoder with the custom SignLanguageByT5Encoder
        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = SignLanguageByT5Encoder(encoder_config)

        # Initialize the decoder
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.decoder_emb)

        # Initialize the language modeling head
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel settings
        self.model_parallel = False
        self.device_map = None

    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
            " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
            " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.decoder_emb

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        decoder_attention_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

    def _reorder_cache(self, past_key_values, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from layer past batch dim
            # batch dim of `past` is at 2nd position
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set correct `past` for each of the four key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past

    def forward(
        self,
        face_features=None,
        left_hand_features=None,
        right_hand_features=None,
        pose_features=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        decoder_inputs_embeds=None,
        labels=None,  # Keep this for training compatibility
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # Set default values if not provided
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head masks if needed
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if encoder outputs are not provided
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                face_features=face_features,
                left_hand_features=left_hand_features,
                right_hand_features=right_hand_features,
                pose_features=pose_features,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutputWithPastAndCrossAttentions):
            encoder_outputs = BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Prepare decoder inputs
        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        # Scale sequence output if embeddings are tied
        if self.config.tie_word_embeddings:
            sequence_output = sequence_output * (self.model_dim ** -0.5)

        # Compute language modeling logits
        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def generate(
        self,
        face_features=None,
        left_hand_features=None,
        right_hand_features=None,
        pose_features=None,
        attention_mask=None,
        **kwargs
    ):
        """
        Generate method to handle sign language features and generate output sequences.
        """
        # Compute encoder outputs using sign language features
        encoder_outputs = self.encoder(
            face_features=face_features,
            left_hand_features=left_hand_features,
            right_hand_features=right_hand_features,
            pose_features=pose_features,
            attention_mask=attention_mask,
            return_dict=True
        )

        # Pass encoder outputs to the decoder
        kwargs["encoder_outputs"] = encoder_outputs

        # Generate sequences using the parent class's generate method
        return super().generate(
            attention_mask=attention_mask,
            **kwargs
        )

@dataclass
class SignLanguageT5Collator:
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"

    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors

        face_embeds = [feature["face_features"] for feature in features]
        left_hand_embeds = [feature["left_hand_features"] for feature in features]
        right_hand_embeds = [feature["right_hand_features"] for feature in features]
        pose_embeds = [feature["pose_features"] for feature in features]

        # Padding
        max_len = max([emb.shape[0] for emb in face_embeds])

        def pad_embeds(embeds):
            padded_embeds = []
            for emb in embeds:
                if emb.dim() == 3:  # For 3D tensors (pose features)
                    pad_len = max_len - emb.shape[1]  # padding the second dimension (T)
                    emb_pad = torch.nn.functional.pad(emb, (0, 0, 0, pad_len, 0, 0), value=0)
                else:  # For 2D tensors (face, hand features)
                    pad_len = max_len - emb.shape[0]
                    emb_pad = torch.nn.functional.pad(emb, (0, 0, 0, pad_len), value=0)
                padded_embeds.append(emb_pad)
            return padded_embeds

        padded_face_embeds = pad_embeds(face_embeds)
        padded_left_hand_embeds = pad_embeds(left_hand_embeds)
        padded_right_hand_embeds = pad_embeds(right_hand_embeds)
        padded_pose_embeds = pad_embeds(pose_embeds)

        batch = {}
        batch["face_features"] = torch.stack(padded_face_embeds, dim=0)
        batch["left_hand_features"] = torch.stack(padded_left_hand_embeds, dim=0)
        batch["right_hand_features"] = torch.stack(padded_right_hand_embeds, dim=0)
        batch["pose_features"] = torch.stack(padded_pose_embeds, dim=0)

        # For inference, we don't need decoder_input_ids - the model.generate() will handle this
        # Remove the decoder_input_ids requirement
        return batch

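To illustrate what `pad_embeds` does in the 2D branch of the collator, a short sketch on toy tensors (sizes are illustrative): `F.pad` with spec `(0, 0, 0, pad_len)` leaves the feature dimension untouched and appends `pad_len` zero rows along the time axis.

import torch
import torch.nn.functional as F

# Two "clips" of different length with a 4-dim feature per frame (illustrative sizes).
a = torch.randn(3, 4)   # T = 3
b = torch.randn(5, 4)   # T = 5
max_len = max(a.shape[0], b.shape[0])

# Same pad spec as the 2D branch above: (last dim: 0, 0) then (time dim: 0, pad_len).
a_pad = F.pad(a, (0, 0, 0, max_len - a.shape[0]), value=0)
batch = torch.stack([a_pad, b], dim=0)
print(batch.shape)      # torch.Size([2, 5, 4]) - zero rows appended to the shorter clip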
class TranslationFeatures(torch.utils.data.Dataset):
    def __init__(self, face_embeddings, left_hand_embeddings, right_hand_embeddings, body_posture_embeddings):
        self.face_embeddings = face_embeddings
        self.left_hand_embeddings = left_hand_embeddings
        self.right_hand_embeddings = right_hand_embeddings
        self.body_posture_embeddings = body_posture_embeddings

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        return {
            "face_features": torch.tensor(self.face_embeddings),
            "left_hand_features": torch.tensor(self.left_hand_embeddings),
            "right_hand_features": torch.tensor(self.right_hand_embeddings),
            "pose_features": torch.tensor(self.body_posture_embeddings),
        }

def generate_text_from_features(
    face_embeddings: np.ndarray,
    left_hand_embeddings: np.ndarray,
    right_hand_embeddings: np.ndarray,
    body_posture_embeddings: np.ndarray,
    model_config: str,
    model_checkpoint: str,
    tokenizer_checkpoint: str,
    output_dir: str,
    generation_max_length: int = 2048,
    generation_num_beams: int = 5,
):
    """
    Direct inference function that generates text from sign language features.
    """
    # Load model and tokenizer
    config = SignLanguageByT5Config.from_pretrained(model_config)
    model = SignLanguageByT5ForConditionalGeneration.from_pretrained(
        model_checkpoint,
        # config=config,
        cache_dir=os.path.join(output_dir, "cache"),
    )
    tokenizer = ByT5Tokenizer.from_pretrained(tokenizer_checkpoint)

    # Move model to appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Convert inputs to tensors and move to device
    face_tensor = torch.tensor(face_embeddings, dtype=torch.float32).unsqueeze(0).to(device)
    left_hand_tensor = torch.tensor(left_hand_embeddings, dtype=torch.float32).unsqueeze(0).to(device)
    right_hand_tensor = torch.tensor(right_hand_embeddings, dtype=torch.float32).unsqueeze(0).to(device)
    pose_tensor = torch.tensor(body_posture_embeddings, dtype=torch.float32).unsqueeze(0).to(device)

    # Generate text
    with torch.no_grad():
        generated_ids = model.generate(
            face_features=face_tensor,
            left_hand_features=left_hand_tensor,
            right_hand_features=right_hand_tensor,
            pose_features=pose_tensor,
            max_length=generation_max_length,
            num_beams=generation_num_beams,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode generated text
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text

def test(
    face_embeddings: np.ndarray,
    left_hand_embeddings: np.ndarray,
    right_hand_embeddings: np.ndarray,
    body_posture_embeddings: np.ndarray,
    model_config: str,
    model_checkpoint: str,
    tokenizer_checkpoint: str,
    output_dir: str,
):
    """
    Test function for inference - generates text from sign language features.
    This is a simpler wrapper around the direct inference function.
    """
    return generate_text_from_features(
        face_embeddings=face_embeddings,
        left_hand_embeddings=left_hand_embeddings,
        right_hand_embeddings=right_hand_embeddings,
        body_posture_embeddings=body_posture_embeddings,
        model_config=model_config,
        model_checkpoint=model_checkpoint,
        tokenizer_checkpoint=tokenizer_checkpoint,
        output_dir=output_dir,
    )
kpe_mediapipe.py
ADDED
@@ -0,0 +1,408 @@
1 |
+
import mediapipe as mp
|
2 |
+
from mediapipe.tasks import python
|
3 |
+
from mediapipe.tasks.python import vision
|
4 |
+
import cv2
|
5 |
+
import numpy as np
|
6 |
+
import json
|
7 |
+
from pathlib import Path
|
8 |
+
import decord
|
9 |
+
from typing import Dict, Optional, Tuple, Any
|
10 |
+
|
11 |
+
|
12 |
+
class HolisticDetector:
|
13 |
+
"""
|
14 |
+
A class for detecting face, hand, and pose landmarks in videos using MediaPipe.
|
15 |
+
"""
|
16 |
+
|
17 |
+
def __init__(self, face_model_path: str, hand_model_path: str,
|
18 |
+
min_detection_confidence: float = 0.1,
|
19 |
+
min_hand_detection_confidence: float = 0.05,
|
20 |
+
max_faces: int = 6, max_hands: int = 6):
|
21 |
+
"""
|
22 |
+
Initialize the HolisticDetector with model paths and configuration.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
face_model_path: Path to the face detection model
|
26 |
+
hand_model_path: Path to the hand detection model
|
27 |
+
min_detection_confidence: Minimum confidence for pose detection
|
28 |
+
min_hand_detection_confidence: Minimum confidence for hand detection
|
29 |
+
max_faces: Maximum number of faces to detect
|
30 |
+
max_hands: Maximum number of hands to detect
|
31 |
+
"""
|
32 |
+
self.face_model_path = face_model_path
|
33 |
+
self.hand_model_path = hand_model_path
|
34 |
+
self.min_detection_confidence = min_detection_confidence
|
35 |
+
self.min_hand_detection_confidence = min_hand_detection_confidence
|
36 |
+
self.max_faces = max_faces
|
37 |
+
self.max_hands = max_hands
|
38 |
+
|
39 |
+
self._initialize_detectors()
|
40 |
+
|
41 |
+
def _initialize_detectors(self):
|
42 |
+
"""Initialize the MediaPipe detectors."""
|
43 |
+
# Initialize face detector
|
44 |
+
base_options_face = python.BaseOptions(model_asset_path=self.face_model_path)
|
45 |
+
options_face = vision.FaceLandmarkerOptions(
|
46 |
+
base_options=base_options_face,
|
47 |
+
output_face_blendshapes=True,
|
48 |
+
output_facial_transformation_matrixes=True,
|
49 |
+
num_faces=self.max_faces
|
50 |
+
)
|
51 |
+
self.face_detector = vision.FaceLandmarker.create_from_options(options_face)
|
52 |
+
|
53 |
+
# Initialize hand detector
|
54 |
+
base_options_hand = python.BaseOptions(model_asset_path=self.hand_model_path)
|
55 |
+
options_hand = vision.HandLandmarkerOptions(
|
56 |
+
base_options=base_options_hand,
|
57 |
+
num_hands=self.max_hands,
|
58 |
+
min_hand_detection_confidence=self.min_hand_detection_confidence
|
59 |
+
)
|
60 |
+
self.hand_detector = vision.HandLandmarker.create_from_options(options_hand)
|
61 |
+
|
62 |
+
# Initialize holistic model for pose
|
63 |
+
self.mp_holistic = mp.solutions.holistic.Holistic(
|
64 |
+
min_detection_confidence=self.min_detection_confidence
|
65 |
+
)
|
66 |
+
|
67 |
+
def detect_frame_landmarks(self, image: np.ndarray) -> Tuple[Dict[str, int], Dict[str, Any]]:
|
68 |
+
"""
|
69 |
+
Detect landmarks in a single frame.
|
70 |
+
|
71 |
+
Args:
|
72 |
+
image: Input image as numpy array
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
Tuple of (bounding_boxes_count, landmarks_data)
|
76 |
+
"""
|
77 |
+
results = self.mp_holistic.process(image)
|
78 |
+
|
79 |
+
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
|
80 |
+
face_prediction = self.face_detector.detect(mp_image)
|
81 |
+
hand_prediction = self.hand_detector.detect(mp_image)
|
82 |
+
|
83 |
+
bounding_boxes = {}
|
84 |
+
landmarks_data = {}
|
85 |
+
|
86 |
+
# Process face landmarks
|
87 |
+
if face_prediction.face_landmarks:
|
88 |
+
bounding_boxes['#face'] = len(face_prediction.face_landmarks)
|
89 |
+
landmarks_data['face_landmarks'] = []
|
90 |
+
for face in face_prediction.face_landmarks:
|
91 |
+
landmarks_face = [[landmark.x, landmark.y, landmark.z] for landmark in face]
|
92 |
+
landmarks_data['face_landmarks'].append(landmarks_face)
|
93 |
+
else:
|
94 |
+
bounding_boxes['#face'] = 0
|
95 |
+
landmarks_data['face_landmarks'] = None
|
96 |
+
|
97 |
+
# Process hand landmarks
|
98 |
+
if hand_prediction.hand_landmarks:
|
99 |
+
bounding_boxes['#hands'] = len(hand_prediction.hand_landmarks)
|
100 |
+
landmarks_data['hand_landmarks'] = []
|
101 |
+
for hand in hand_prediction.hand_landmarks:
|
102 |
+
landmarks_hand = [[landmark.x, landmark.y, landmark.z] for landmark in hand]
|
103 |
+
landmarks_data['hand_landmarks'].append(landmarks_hand)
|
104 |
+
else:
|
105 |
+
bounding_boxes['#hands'] = 0
|
106 |
+
landmarks_data['hand_landmarks'] = None
|
107 |
+
|
108 |
+
# Process pose landmarks
|
109 |
+
if results.pose_landmarks:
|
110 |
+
bounding_boxes['#pose'] = 1
|
111 |
+
landmarks_data['pose_landmarks'] = []
|
112 |
+
pose_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in results.pose_landmarks.landmark]
|
113 |
+
landmarks_data['pose_landmarks'].append(pose_landmarks)
|
114 |
+
else:
|
115 |
+
bounding_boxes['#pose'] = 0
|
116 |
+
landmarks_data['pose_landmarks'] = None
|
117 |
+
|
118 |
+
return bounding_boxes, landmarks_data
|
119 |
+
|
120 |
+
def process_video(self, video_input, save_results: bool = False,
|
121 |
+
output_dir: Optional[str] = None, video_name: Optional[str] = None) -> Dict[int, Any]:
|
122 |
+
"""
|
123 |
+
Process a video and extract landmarks from all frames.
|
124 |
+
|
125 |
+
Args:
|
126 |
+
video_input: Either a path to video file (str) or a decord.VideoReader object
|
127 |
+
save_results: Whether to save results to files
|
128 |
+
output_dir: Directory to save results (required if save_results=True)
|
129 |
+
video_name: Name for output files (required if save_results=True and video_input is VideoReader)
|
130 |
+
|
131 |
+
Returns:
|
132 |
+
Dictionary containing landmarks for each frame
|
133 |
+
|
134 |
+
Raises:
|
135 |
+
FileNotFoundError: If video file doesn't exist
|
136 |
+
ValueError: If save_results=True but output_dir is None, or if video_name is None when needed
|
137 |
+
TypeError: If video_input is neither string nor VideoReader
|
138 |
+
"""
|
139 |
+
if save_results and output_dir is None:
|
140 |
+
raise ValueError("output_dir must be provided when save_results=True")
|
141 |
+
|
142 |
+
# Handle different input types
|
143 |
+
if isinstance(video_input, str):
|
144 |
+
# Input is a file path
|
145 |
+
video_path = Path(video_input)
|
146 |
+
if not video_path.exists():
|
147 |
+
raise FileNotFoundError(f"Video file not found: {video_input}")
|
148 |
+
|
149 |
+
try:
|
150 |
+
video = decord.VideoReader(str(video_path))
|
151 |
+
except Exception as e:
|
152 |
+
raise RuntimeError(f"Error loading video {video_input}: {e}")
|
153 |
+
|
154 |
+
file_name = video_path.stem
|
155 |
+
|
156 |
+
# elif hasattr(video_input, '__len__') and hasattr(video_input, '__getitem__'):
|
157 |
+
else:
|
158 |
+
# Input is a VideoReader object or similar
|
159 |
+
video = video_input
|
160 |
+
if save_results and video_name is None:
|
161 |
+
raise ValueError("video_name must be provided when save_results=True and video_input is a VideoReader object")
|
162 |
+
file_name = video_name or "video"
|
163 |
+
|
164 |
+
# else:
|
165 |
+
# raise TypeError("video_input must be either a file path (str) or a VideoReader object")
|
166 |
+
|
167 |
+
result_dict = {}
|
168 |
+
stats = {}
|
169 |
+
|
170 |
+
# Process each frame
|
171 |
+
for i in range(len(video)):
|
172 |
+
try:
|
173 |
+
# frame_rgb = video[i].asnumpy()
|
174 |
+
frame_rgb = video[i]
|
175 |
+
if hasattr(video, 'seek'):
|
176 |
+
video.seek(0)
|
177 |
+
bounding_boxes, landmarks = self.detect_frame_landmarks(frame_rgb)
|
178 |
+
result_dict[i] = landmarks
|
179 |
+
stats[i] = bounding_boxes
|
180 |
+
except Exception as e:
|
181 |
+
print(f"Error processing frame {i}: {e}")
|
182 |
+
result_dict[i] = None
|
183 |
+
stats[i] = {'#face': 0, '#hands': 0, '#pose': 0}
|
184 |
+
|
185 |
+
# Save results if requested
|
186 |
+
if save_results:
|
187 |
+
self._save_results(file_name, result_dict, stats, output_dir)
|
188 |
+
|
189 |
+
return result_dict
|
190 |
+
|
191 |
+
def process_video_frames(self, frames: list, save_results: bool = False,
|
192 |
+
output_dir: Optional[str] = None, video_name: str = "video") -> Dict[int, Any]:
|
193 |
+
"""
|
194 |
+
Process a list of frames and extract landmarks.
|
195 |
+
|
196 |
+
Args:
|
197 |
+
frames: List of frame images as numpy arrays
|
198 |
+
save_results: Whether to save results to files
|
199 |
+
output_dir: Directory to save results (required if save_results=True)
|
200 |
+
video_name: Name for output files
|
201 |
+
|
202 |
+
Returns:
|
203 |
+
Dictionary containing landmarks for each frame
|
204 |
+
"""
|
205 |
+
if save_results and output_dir is None:
|
206 |
+
raise ValueError("output_dir must be provided when save_results=True")
|
207 |
+
|
208 |
+
result_dict = {}
|
209 |
+
stats = {}
|
210 |
+
|
211 |
+
# Process each frame
|
212 |
+
for i, frame in enumerate(frames):
|
213 |
+
try:
|
214 |
+
bounding_boxes, landmarks = self.detect_frame_landmarks(frame)
|
215 |
+
result_dict[i] = landmarks
|
216 |
+
stats[i] = bounding_boxes
|
217 |
+
except Exception as e:
|
218 |
+
print(f"Error processing frame {i}: {e}")
|
219 |
+
result_dict[i] = None
|
220 |
+
stats[i] = {'#face': 0, '#hands': 0, '#pose': 0}
|
221 |
+
|
222 |
+
# Save results if requested
|
223 |
+
if save_results:
|
224 |
+
self._save_results(video_name, result_dict, stats, output_dir)
|
225 |
+
|
226 |
+
return result_dict
|
227 |
+
|
228 |
+
def _save_results(self, video_name: str, landmarks_data: Dict, stats_data: Dict, output_dir: str):
|
229 |
+
"""Save landmarks and stats to JSON files."""
|
230 |
+
output_path = Path(output_dir)
|
231 |
+
output_path.mkdir(parents=True, exist_ok=True)
|
232 |
+
|
233 |
+
# Save landmarks
|
234 |
+
landmarks_file = output_path / f"{video_name}_pose.json"
|
235 |
+
with open(landmarks_file, 'w') as f:
|
236 |
+
json.dump(landmarks_data, f)
|
237 |
+
|
238 |
+
# Save stats
|
239 |
+
stats_file = output_path / f"{video_name}_stats.json"
|
240 |
+
with open(stats_file, 'w') as f:
|
241 |
+
json.dump(stats_data, f)
|
242 |
+
|
243 |
+
def compute_video_stats(self, landmarks_data: Dict) -> Dict[str, Any]:
|
244 |
+
"""
|
245 |
+
Compute statistics from landmarks data.
|
246 |
+
|
247 |
+
Args:
|
248 |
+
landmarks_data: Dictionary containing landmarks for each frame
|
249 |
+
|
250 |
+
Returns:
|
251 |
+
Dictionary containing frame-by-frame stats and maximums
|
252 |
+
"""
|
253 |
+
stats = {}
|
254 |
+
max_counts = {'#face': 0, '#hands': 0, '#pose': 0}
|
255 |
+
|
256 |
+
for frame, landmarks in landmarks_data.items():
|
257 |
+
if landmarks is None:
|
258 |
+
presence = {'#face': 0, '#hands': 0, '#pose': 0}
|
259 |
+
else:
|
260 |
+
presence = {
|
261 |
+
'#face': len(landmarks.get('face_landmarks', [])) if landmarks.get('face_landmarks') else 0,
|
262 |
+
'#hands': len(landmarks.get('hand_landmarks', [])) if landmarks.get('hand_landmarks') else 0,
|
263 |
+
'#pose': len(landmarks.get('pose_landmarks', [])) if landmarks.get('pose_landmarks') else 0
|
264 |
+
}
|
265 |
+
stats[frame] = presence
|
266 |
+
|
267 |
+
# Update max counts
|
268 |
+
for key in max_counts:
|
269 |
+
max_counts[key] = max(max_counts[key], presence[key])
|
270 |
+
|
271 |
+
stats['max'] = max_counts
|
272 |
+
return stats
|
273 |
+
|
274 |
+
|
275 |
+
# Convenience function for backward compatibility and simple usage
|
276 |
+
def video_holistic(video_input, face_model_path: str, hand_model_path: str,
|
277 |
+
save_results: bool = False, output_dir: Optional[str] = None,
|
278 |
+
video_name: Optional[str] = None) -> Dict[int, Any]:
|
279 |
+
"""
|
280 |
+
    Convenience function to process a video and extract holistic landmarks.

    Args:
        video_input: Either a path to a video file (str) or a decord.VideoReader object
        face_model_path: Path to the face detection model
        hand_model_path: Path to the hand detection model
        save_results: Whether to save results to files
        output_dir: Directory to save results
        video_name: Name for output files (required if save_results=True and video_input is a VideoReader)

    Returns:
        Dictionary containing landmarks for each frame
    """
    detector = HolisticDetector(face_model_path, hand_model_path)
    return detector.process_video(video_input, save_results, output_dir, video_name)


# Utility functions for batch processing
def load_file(filename: str):
    """Load a pickled and gzipped file."""
    import pickle
    import gzip
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def is_string_in_file(file_path: str, target_string: str) -> bool:
    """Check if a string exists in a file."""
    try:
        with Path(file_path).open("r") as f:
            for line in f:
                if target_string in line:
                    return True
        return False
    except Exception as e:
        print(f"Error: {e}")
        return False


def main():
    """Main function for command-line usage."""
    import argparse
    import time
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the sub_list to work with')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='batch size')
    parser.add_argument('--pose_path', type=str, required=True,
                        help='path to where the pose data will be saved')
    parser.add_argument('--stats_path', type=str, required=True,
                        help='path to where the stats data will be saved')
    parser.add_argument('--time_limit', type=int, required=True,
                        help='time limit')
    parser.add_argument('--files_list', type=str, required=True,
                        help='files list')
    parser.add_argument('--problem_file_path', type=str, required=True,
                        help='problem file path')
    parser.add_argument('--face_model_path', type=str, required=True,
                        help='face model path')
    parser.add_argument('--hand_model_path', type=str, required=True,
                        help='hand model path')

    args = parser.parse_args()

    start_time = time.time()

    # Initialize detector
    detector = HolisticDetector(args.face_model_path, args.hand_model_path)

    # Load the files list
    fixed_list = load_file(args.files_list)

    # Create folders if they do not exist
    Path(args.pose_path).mkdir(parents=True, exist_ok=True)
    Path(args.stats_path).mkdir(parents=True, exist_ok=True)

    # Create the problem file if it doesn't exist
    if not os.path.exists(args.problem_file_path):
        with open(args.problem_file_path, 'w') as f:
            pass

    # Process videos in batches
    video_batches = [fixed_list[i:i + args.batch_size] for i in range(0, len(fixed_list), args.batch_size)]

    for video_file in video_batches[args.index]:
        current_time = time.time()
        if current_time - start_time > args.time_limit:
            print("Time limit reached. Stopping execution.")
            break

        # Check if output files already exist
        video_name = Path(video_file).stem
        landmark_json_path = Path(args.pose_path) / f"{video_name}_pose.json"
        stats_json_path = Path(args.stats_path) / f"{video_name}_stats.json"

        if landmark_json_path.exists() and stats_json_path.exists():
            print(f"Skipping {video_file} - output files already exist")
            continue
        elif is_string_in_file(args.problem_file_path, video_file):
            print(f"Skipping {video_file} - found in problem file")
            continue
        else:
            try:
                print(f"Processing {video_file}")
                result_dict = detector.process_video(
                    video_file_path=video_file,
                    save_results=True,
                    output_dir=args.pose_path
                )

                # Also save stats separately for compatibility
                stats = detector.compute_video_stats(result_dict)
                with open(stats_json_path, 'w') as f:
                    json.dump(stats, f)

                print(f"Successfully processed {video_file}")

            except Exception as e:
                print(f"Error processing {video_file}: {e}")
                # Add to the problem file
                with open(args.problem_file_path, "a") as p:
                    p.write(video_file + "\n")


if __name__ == "__main__":
    main()
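For reference, a minimal usage sketch (not part of the commit) of the detector API added above. Model and video paths are illustrative placeholders, and the call mirrors the keyword form used in main():

# Usage sketch: run the holistic keypoint extractor on a single clip.
from kpe_mediapipe import HolisticDetector

detector = HolisticDetector("models/face_landmarker.task", "models/hand_landmarker.task")  # placeholder model paths
landmarks = detector.process_video(
    video_file_path="example_clip.mp4",  # placeholder video path
    save_results=False,
)
print(f"Extracted landmarks for {len(landmarks)} frames")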
shubert.py
ADDED
@@ -0,0 +1,479 @@
import logging
from dataclasses import dataclass, field
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist

import numpy as np
import random
import os
import sys

from fairseq.data.data_utils import compute_mask_indices
from fairseq.models import BaseFairseqModel, register_model
from fairseq.models.wav2vec import (
    Wav2Vec2Config,
    TransformerEncoder,
)

# Debug print to show where Wav2Vec2Config is defined
print(f"Wav2Vec2Config is imported from: {Wav2Vec2Config.__module__}")
print(f"Full path: {sys.modules[Wav2Vec2Config.__module__].__file__}")

from fairseq.modules import (
    LayerNorm,
)

logger = logging.getLogger(__name__)


@dataclass
class SignHubertConfig(Wav2Vec2Config):
    # pos_conv_kernel: int = field(default=32)
    conv_pos: int = field(default=32)
    discrete: bool = field(default=False)
    codebook_size: int = field(default=256)
    channels_embed_dim: int = field(default=384)
    channels_pose_embed_dim: int = field(default=14)
    intermediate_dim: int = field(default=1024)  # This will be overridden if needed
    mask_strategy: str = field(default="random")
    channels: str = field(default="face,left_hand,right_hand,body_posture")


@register_model("signhubert_onlyhands", dataclass=SignHubertConfig)
class SignHubertModel(BaseFairseqModel):
    def __init__(self, cfg: SignHubertConfig):
        super().__init__()
        self.cfg = cfg
        # print(cfg)
        self.discrete = cfg.discrete  # since it's hubert this will always be discrete anyways

        self.embed = cfg.encoder_embed_dim  # whether it is small (384), base (768), large, etc.
        self.channel_embed = cfg.channels_embed_dim  # embedding dimension for face, left_hand and right_hand (default: 384)
        self.channel_pose_embed = cfg.channels_pose_embed_dim  # embedding dimension for pose (default: 14)
        self.intermediate_dim = cfg.intermediate_dim  # intermediate dimension before the projection layer to encoder_embed_dim (default: 1024)

        self.channels = cfg.channels.split(",")

        self.post_extract_proj = nn.Linear(cfg.intermediate_dim, cfg.encoder_embed_dim)  # 4 channels concatenated

        self.mask_prob = cfg.mask_prob
        self.mask_selection = cfg.mask_selection
        self.mask_strategy = cfg.mask_strategy
        self.mask_other = cfg.mask_other
        self.mask_length = cfg.mask_length
        self.no_mask_overlap = cfg.no_mask_overlap
        self.mask_min_space = cfg.mask_min_space

        self.mask_channel_prob = cfg.mask_channel_prob
        self.mask_channel_before = cfg.mask_channel_before
        self.mask_channel_selection = cfg.mask_channel_selection
        self.mask_channel_other = cfg.mask_channel_other
        self.mask_channel_length = cfg.mask_channel_length
        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
        self.mask_channel_min_space = cfg.mask_channel_min_space

        self.dropout_input = nn.Dropout(cfg.dropout_input)
        self.dropout_features = nn.Dropout(cfg.dropout_features)

        self.feature_grad_mult = cfg.feature_grad_mult

        self.mask_emb = nn.Parameter(
            torch.FloatTensor(1, 1, 1, cfg.intermediate_dim // len(self.channels)).uniform_()
        )

        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.channel_embed * len(self.channels))

        if "face" in self.channels:
            self.layer_norm_face = LayerNorm(self.channel_embed)
            self.face_proj = nn.Linear(self.channel_embed, cfg.intermediate_dim // len(self.channels))
        if "left_hand" in self.channels:
            self.layer_norm_lhand = LayerNorm(self.channel_embed)
            self.left_hand_proj = nn.Linear(self.channel_embed, cfg.intermediate_dim // len(self.channels))
        if "right_hand" in self.channels:
            self.layer_norm_rhand = LayerNorm(self.channel_embed)
            self.right_hand_proj = nn.Linear(self.channel_embed, cfg.intermediate_dim // len(self.channels))
        if "body_posture" in self.channels:
            self.layer_norm_body = LayerNorm(self.channel_pose_embed)
            self.body_posture_proj = nn.Linear(self.channel_pose_embed, cfg.intermediate_dim // len(self.channels))

        self.codebook_size = cfg.codebook_size  # number of codebook vectors

        self.heads = []
        for i in range(len(self.channels)):
            self.heads.append(nn.Linear(cfg.encoder_embed_dim, cfg.codebook_size))

        self.heads = torch.nn.ModuleList(self.heads)

        # self.heads = torch.nn.ModuleList([
        #     nn.Linear(cfg.encoder_embed_dim, cfg.codebook_size),
        #     nn.Linear(cfg.encoder_embed_dim, cfg.codebook_size),
        #     nn.Linear(cfg.encoder_embed_dim, cfg.codebook_size),
        # ]
        # )

        # # Define separate linear layers for each channel
        # self.face_proj = nn.Linear(self.channel_embed, cfg.intermediate_dim // 4)
        # self.left_hand_proj = nn.Linear(self.channel_embed, cfg.intermediate_dim // 4)
        # self.right_hand_proj = nn.Linear(self.channel_embed, cfg.intermediate_dim // 4)
        # self.body_posture_proj = nn.Linear(self.channel_pose_embed, cfg.intermediate_dim // 4)

    def state_dict(self, destination=None, prefix="", keep_vars=False):
        state = super().state_dict(destination, prefix, keep_vars)
        return state

    @classmethod
    def build_model(cls, cfg: SignHubertConfig, task=None):
        """Build a new model instance."""
        return cls(cfg)

    def apply_mask(
        self,
        x,
        padding_mask,
        mask_indices=None,
        mask_channel_indices=None,
    ):
        B, T, C, D = x.shape

        # Initialize a mask tensor with ones (same shape as x)
        mask = torch.ones_like(x)

        # channel masking
        if self.mask_prob > 0 and self.mask_strategy == "channel":
            if mask_indices is None:
                mask_indices = torch.zeros_like(x[:, :, :, 0], dtype=bool)
                num_channels_to_mask = int(C * self.mask_prob)
                num_channels_to_mask = max(1, num_channels_to_mask)

                for i in range(B):
                    channels_to_mask = np.random.choice(C, num_channels_to_mask, replace=False)
                    mask_indices[i, :, channels_to_mask] = True

            mask[mask_indices.unsqueeze(-1).expand(-1, -1, -1, D)] = 0

        # gloss/time masking
        elif self.mask_prob > 0 and self.mask_strategy == "gloss":
            if mask_indices is None:
                mask_indices_channel = compute_mask_indices(
                    (B, T),
                    padding_mask,
                    self.mask_prob,
                    self.mask_length,
                    self.mask_selection,
                    self.mask_other,
                    min_masks=1,
                    no_overlap=self.no_mask_channel_overlap,
                    min_space=self.mask_min_space,
                    require_same_masks=self.cfg.require_same_masks,
                    mask_dropout=self.cfg.mask_dropout,
                )
                mask_indices_channel = torch.from_numpy(mask_indices_channel).to(x.device)

                # Apply the same mask to all channels
                mask_indices = mask_indices_channel.unsqueeze(2).expand(-1, -1, C)
            mask_indices = mask_indices.unsqueeze(3).expand(-1, -1, -1, D)
            mask[mask_indices] = 0

        # random masking
        elif self.mask_prob > 0 and self.mask_strategy == "random":
            if mask_indices is None:
                mask_indices = compute_mask_indices(
                    (B, T * C),  # Note: T*C instead of T
                    padding_mask,
                    self.mask_prob,
                    self.mask_length,
                    self.mask_selection,
                    self.mask_other,
                    min_masks=1,
                    no_overlap=self.no_mask_channel_overlap,
                    min_space=self.mask_min_space,
                    require_same_masks=self.cfg.require_same_masks,
                    mask_dropout=self.cfg.mask_dropout,
                )
                mask_indices = torch.from_numpy(mask_indices).to(x.device)
                mask_indices = mask_indices.view(B, T, C)
            mask_indices = mask_indices.unsqueeze(3).expand(-1, -1, -1, D)
            mask[mask_indices] = 0
        else:
            raise ValueError(f"unknown mask strategy {self.mask_strategy}")

        # Apply the mask to x and return the masked tensor with the same shape as x
        # x = x * mask
        x = x * mask + self.mask_emb * (1 - mask)

        return x, mask
        # mask is a tensor of shape BxTx4x256 where 0 means the value is masked and 1 means the value is not masked

    def forward(
        self,
        source,
        padding_mask=None,
        mask=True,
        features_only=False,
        layer=None,
        mask_indices=None,
        mask_channel_indices=None,
        padding_count=None,
        kmeans_labels=None,
    ):
        channels_to_use = []
        for c in self.channels:
            if c in source[0]:
                channels_to_use.append(c)

        for c in channels_to_use:
            if c == "face":
                face_features_list = []
                label_face_features_list = []
            elif c == "left_hand":
                left_hand_features_list = []
                label_left_hand_features_list = []
            elif c == "right_hand":
                right_hand_features_list = []
                label_right_hand_features_list = []
            elif c == "body_posture":
                body_posture_features_list = []
                label_body_posture_features_list = []

        # # source is a list of dictionaries with keys "face", "left_hand", "right_hand", "body_posture"
        # face_features_list = []
        # left_hand_features_list = []
        # right_hand_features_list = []
        # body_posture_features_list = []
        # label_face_features_list = []
        # label_left_hand_features_list = []
        # label_right_hand_features_list = []
        # label_body_posture_features_list = []

        # for sample in source:
        #     face_features_list.append(sample["face"])  # Tx384
        #     left_hand_features_list.append(sample["left_hand"])  # Tx384
        #     right_hand_features_list.append(sample["right_hand"])  # Tx384
        #     body_posture_features_list.append(sample["body_posture"])  # Tx14
        #     label_face_features_list.append(sample["label_face"])  # Tx1
        #     label_left_hand_features_list.append(sample["label_left_hand"])  # Tx1
        #     label_right_hand_features_list.append(sample["label_right_hand"])  # Tx1
        #     label_body_posture_features_list.append(sample["label_body_posture"])  # Tx1

        for sample in source:
            for c in channels_to_use:
                if c == "face":
                    face_features_list.append(sample["face"])  # Tx384
                    label_face_features_list.append(sample["label_face"])  # Tx1
                elif c == "left_hand":
                    left_hand_features_list.append(sample["left_hand"])  # Tx384
                    label_left_hand_features_list.append(sample["label_left_hand"])  # Tx1
                elif c == "right_hand":
                    right_hand_features_list.append(sample["right_hand"])  # Tx384
                    label_right_hand_features_list.append(sample["label_right_hand"])  # Tx1
                elif c == "body_posture":
                    body_posture_features_list.append(sample["body_posture"])  # Tx14
                    label_body_posture_features_list.append(sample["label_body_posture"])  # Tx1

        # face_features = torch.stack(face_features_list)  # BxTx384
        # left_hand_features = torch.stack(left_hand_features_list)  # BxTx384
        # right_hand_features = torch.stack(right_hand_features_list)  # BxTx384
        # body_posture_features = torch.stack(body_posture_features_list)  # BxTx14
        # face_labels = torch.stack(label_face_features_list)  # BxTx1
        # left_hand_labels = torch.stack(label_left_hand_features_list)  # BxTx1
        # right_hand_labels = torch.stack(label_right_hand_features_list)  # BxTx1
        # body_posture_labels = torch.stack(label_body_posture_features_list)  # BxTx1

        # # Apply layer normalization to each part
        # face_features = self.layer_norm_face(face_features)  # BxTx384
        # left_hand_features = self.layer_norm_lhand(left_hand_features)  # BxTx384
        # right_hand_features = self.layer_norm_rhand(right_hand_features)  # BxTx384
        # body_posture_features = self.layer_norm_body(body_posture_features)  # BxTx14

        # # Apply separate linear projections for each channel
        # face_features = self.face_proj(face_features)  # BxTx256
        # left_hand_features = self.left_hand_proj(left_hand_features)  # BxTx256
        # right_hand_features = self.right_hand_proj(right_hand_features)  # BxTx256
        # body_posture_features = self.body_posture_proj(body_posture_features)  # BxTx256

        features_list = []
        labels_list = []

        for c in channels_to_use:
            if c == "face":
                face_features = torch.stack(face_features_list)  # BxTx384
                face_labels = torch.stack(label_face_features_list)  # BxTx1
                face_features = self.layer_norm_face(face_features)  # BxTx384
                face_features = self.face_proj(face_features)  # BxTx256
                features_list.append(face_features)
                labels_list.append(face_labels)
            elif c == "left_hand":
                left_hand_features = torch.stack(left_hand_features_list)  # BxTx384
                left_hand_labels = torch.stack(label_left_hand_features_list)  # BxTx1
                left_hand_features = self.layer_norm_lhand(left_hand_features)  # BxTx384
                left_hand_features = self.left_hand_proj(left_hand_features)  # BxTx256
                features_list.append(left_hand_features)
                labels_list.append(left_hand_labels)
            elif c == "right_hand":
                right_hand_features = torch.stack(right_hand_features_list)  # BxTx384
                right_hand_labels = torch.stack(label_right_hand_features_list)  # BxTx1
                right_hand_features = self.layer_norm_rhand(right_hand_features)  # BxTx384
                right_hand_features = self.right_hand_proj(right_hand_features)  # BxTx256
                features_list.append(right_hand_features)
                labels_list.append(right_hand_labels)
            elif c == "body_posture":
                body_posture_features = torch.stack(body_posture_features_list)  # BxTx14
                body_posture_labels = torch.stack(label_body_posture_features_list)  # BxTx1
                body_posture_features = self.layer_norm_body(body_posture_features)  # BxTx14
                body_posture_features = self.body_posture_proj(body_posture_features)  # BxTx256
                features_list.append(body_posture_features)
                labels_list.append(body_posture_labels)

        # concatenate the projected features to have dimension BxTxCxD where C=4 and D=256
        # features = torch.stack(
        #     [
        #         face_features,
        #         left_hand_features,
        #         right_hand_features,
        #         body_posture_features
        #     ],
        #     dim=2)  # BxTx4x256

        features = torch.stack(features_list, dim=2)  # BxTx4x256

        if mask:
            x, mask_indices = self.apply_mask(
                features,
                padding_mask,
                mask_indices=mask_indices,
                mask_channel_indices=mask_channel_indices,
            )
            # mask_indices is a tensor of shape BxTx4x256 where 0 means the value is masked and 1 means the value is not masked
        else:
            x = features
            mask_indices = None

        x = self.dropout_input(x)  # BxTx4x256

        x = x.view(x.size(0), x.size(1), -1)  # BxTx1024
        if self.post_extract_proj is not None:
            x = self.post_extract_proj(x)  # BxTx768

        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
            layer=layer,
        )

        if features_only:
            return {
                "x": x,
                "padding_mask": padding_mask,
                "layer_results": layer_results,
            }

        result = {
            "losses": {},
        }

        # use linear heads to compute the discrete prediction for each channel and make it into a single tensor of shape BxTxCxcodebook_size
        predictions = []
        for i, head in enumerate(self.heads):
            channel_pred = head(x)  # BxTxcodebook_size
            predictions.append(channel_pred)
        predictions = torch.stack(predictions, dim=2)  # BxTx4xcodebook_size

        # labels = torch.stack(
        #     [
        #         face_labels,
        #         left_hand_labels,
        #         right_hand_labels,
        #         body_posture_labels
        #     ],
        #     dim=2)  # BxTx4x1

        labels = torch.stack(labels_list, dim=2)  # BxTx4x1
        # print(f"predictions shape: {predictions.shape} and labels shape: {labels.shape}")

        predictions_flat = predictions.view(-1, self.codebook_size)  # Shape: (B * T * C, codebook_size)
        labels_flat = labels.view(-1)  # Shape: (B * T * C)

        # Ensure labels are of correct shape
        labels_flat = labels_flat.squeeze(-1)  # Remove the last dimension if it's size 1

        # Correct the mask_indices to match the shape of predictions_flat
        mask_indices_reduced = mask_indices.any(dim=-1)  # Reduce mask to (B, T, C) by collapsing the last dimension
        mask_indices_flat = mask_indices_reduced.view(-1)  # Flatten to match the shape of (B * T * C)

        # Calculate the loss only for the masked positions (where mask_indices_flat is zero)
        masked_loss = F.cross_entropy(
            predictions_flat[mask_indices_flat == 0],
            labels_flat[mask_indices_flat == 0],
            reduction='none'
        )

        # Store the result
        result['losses']['kmeans_loss'] = masked_loss

        if "sample_size" not in result:
            result['sample_size'] = masked_loss.numel()

        return result

    @staticmethod
    def compute_var(y):
        y = y.view(-1, y.size(-1))
        if dist.is_initialized():
            zc = torch.tensor(y.size(0)).cuda()
            zs = y.sum(dim=0)
            zss = (y ** 2).sum(dim=0)

            dist.all_reduce(zc)
            dist.all_reduce(zs)
            dist.all_reduce(zss)

            var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
            return torch.sqrt(var + 1e-6).mean()
        else:
            return torch.sqrt(y.var(dim=0) + 1e-6).mean()

    def extract_features(
        self, source, padding_mask, kmeans_labels, mask=False, layer=None
    ):
        res = self.forward(
            source,
            padding_mask,
            mask=mask,
            features_only=True,
            layer=layer,
            kmeans_labels=kmeans_labels,
        )
        return res

    def remove_pretraining_modules(self, last_layer=None):
        self.heads = None
        self.final_proj = None
        if last_layer is not None:
            self.encoder.layers = nn.ModuleList(
                l for i, l in enumerate(self.encoder.layers) if i <= last_layer
            )
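For reference, a minimal sketch (not part of the commit) of how the model above can be exercised end to end. It assumes fairseq is installed (the file imports it at module level), uses the default SignHubertConfig sizes, and feeds random tensors shaped like a single 16-frame clip:

# Usage sketch: mask-free feature extraction on dummy inputs.
import torch
from shubert import SignHubertConfig, SignHubertModel

cfg = SignHubertConfig()
model = SignHubertModel(cfg).eval()

T = 16  # frames
sample = {
    "face": torch.randn(T, cfg.channels_embed_dim),
    "left_hand": torch.randn(T, cfg.channels_embed_dim),
    "right_hand": torch.randn(T, cfg.channels_embed_dim),
    "body_posture": torch.randn(T, cfg.channels_pose_embed_dim),
    # Dummy labels; only needed to match the expected input format.
    "label_face": torch.zeros(T, 1),
    "label_left_hand": torch.zeros(T, 1),
    "label_right_hand": torch.zeros(T, 1),
    "label_body_posture": torch.zeros(T, 1),
}

with torch.no_grad():
    out = model.extract_features([sample], padding_mask=None, kmeans_labels=None, mask=False)

print(out["x"].shape)             # final encoder output for the one-clip batch
print(len(out["layer_results"]))  # one entry per transformer layer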
shubert_inference.py
ADDED
@@ -0,0 +1,439 @@
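The file below wraps SHuBERT feature extraction (SHubertProcessor) and ByT5-based text generation (SHuBERTTextGenerator). A minimal usage sketch (not part of the commit) follows; checkpoint and .npy paths are placeholders:

# Usage sketch for the classes defined in this file.
import numpy as np
from shubert_inference import SHubertProcessor, SHuBERTTextGenerator

processor = SHubertProcessor("checkpoints/shubert.pt")  # placeholder checkpoint path
features = processor.process_embeddings_from_files(
    "feats/clip_face.npy", "feats/clip_left_hand.npy",
    "feats/clip_right_hand.npy", "feats/clip_pose.npy",
)
print(features.shape)  # (num_layers, num_frames, feature_dim)

generator = SHuBERTTextGenerator("checkpoints/shubert.pt")
text = generator.generate_text(
    np.load("feats/clip_face.npy"), np.load("feats/clip_left_hand.npy"),
    np.load("feats/clip_right_hand.npy"), np.load("feats/clip_pose.npy"),
)
print(text)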
import torch
import numpy as np
import csv
import os
from tqdm import tqdm
import argparse
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, Any
from examples.shubert.models.shubert import SHubertModel, SHubertConfig
from transformers import ByT5Tokenizer, ByT5ForConditionalGeneration


class SHubertProcessor:
    """
    A class for processing multi-modal embeddings through the SHubert model.
    """

    def __init__(self, checkpoint_path: str, device: Optional[str] = None):
        """
        Initialize the SHubertProcessor.

        Args:
            checkpoint_path: Path to the SHubert model checkpoint
            device: Device to use ('cuda' or 'cpu'). Auto-detected if None
        """
        self.checkpoint_path = checkpoint_path
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load the model
        self.model = self._load_model()

        print(f"SHubertProcessor initialized on device: {self.device}")

    def _load_model(self) -> SHubertModel:
        """Load the SHubert model from checkpoint."""
        # Initialize configuration
        cfg = SHubertConfig()

        # Initialize the model
        model = SHubertModel(cfg)

        # Load the checkpoint
        checkpoint = torch.load(self.checkpoint_path, map_location=self.device)

        # Extract state dict
        if 'model' in checkpoint:
            state_dict = checkpoint['model']
        else:
            state_dict = checkpoint

        # Load the state dictionary into the model
        model.load_state_dict(state_dict, strict=False)

        model.eval()
        model.to(self.device)
        return model

    def process_embeddings(self, face_embeddings: np.ndarray,
                           left_hand_embeddings: np.ndarray,
                           right_hand_embeddings: np.ndarray,
                           pose_embeddings: np.ndarray) -> np.ndarray:
        """
        Process multi-modal embeddings through the SHubert model.

        Args:
            face_embeddings: Face embeddings array of shape (num_frames, embedding_dim)
            left_hand_embeddings: Left hand embeddings array of shape (num_frames, embedding_dim)
            right_hand_embeddings: Right hand embeddings array of shape (num_frames, embedding_dim)
            pose_embeddings: Pose embeddings array of shape (num_frames, pose_dim)

        Returns:
            Numpy array of SHubert features with shape (num_layers, num_frames, feature_dim)
        """
        # Convert to tensors and move to device
        face = torch.from_numpy(face_embeddings).float().to(self.device)
        left_hand = torch.from_numpy(left_hand_embeddings).float().to(self.device)
        right_hand = torch.from_numpy(right_hand_embeddings).float().to(self.device)
        body_posture = torch.from_numpy(pose_embeddings).float().to(self.device)

        length = face.shape[0]

        # Prepare input in the format expected by SHubert
        source = [{
            "face": face,
            "left_hand": left_hand,
            "right_hand": right_hand,
            "body_posture": body_posture,
            # Add dummy labels to match the expected input format
            "label_face": torch.zeros((length, 1)).to(self.device),
            "label_left_hand": torch.zeros((length, 1)).to(self.device),
            "label_right_hand": torch.zeros((length, 1)).to(self.device),
            "label_body_posture": torch.zeros((length, 1)).to(self.device)
        }]

        # Extract features
        with torch.no_grad():
            result = self.model.extract_features(source, padding_mask=None, kmeans_labels=None, mask=False)

        # Extract layer outputs
        layer_outputs = []
        for layer in result['layer_results']:
            # layer_output has shape [T, B, D]
            # Since batch size B is 1, we can squeeze it
            layer_output = layer[-1]
            layer_output = layer_output.squeeze(1)  # Shape: [T, D]
            layer_outputs.append(layer_output.cpu().numpy())  # Convert to NumPy array

        # Stack the outputs from all layers to get an array of shape [L, T, D]
        features = np.stack(layer_outputs, axis=0)  # Shape: [L, T, D]
        return features

    def process_embeddings_from_files(self, face_path: str, left_hand_path: str,
                                      right_hand_path: str, pose_path: str) -> np.ndarray:
        """
        Process embeddings loaded from files.

        Args:
            face_path: Path to face embeddings .npy file
            left_hand_path: Path to left hand embeddings .npy file
            right_hand_path: Path to right hand embeddings .npy file
            pose_path: Path to pose embeddings .npy file

        Returns:
            Numpy array of SHubert features with shape (num_layers, num_frames, feature_dim)
        """
        # Load numpy arrays
        face_embeddings = np.load(face_path)
        left_hand_embeddings = np.load(left_hand_path)
        right_hand_embeddings = np.load(right_hand_path)
        pose_embeddings = np.load(pose_path)

        return self.process_embeddings(face_embeddings, left_hand_embeddings,
                                       right_hand_embeddings, pose_embeddings)

    def process_and_save_embeddings(self, face_embeddings: np.ndarray,
                                    left_hand_embeddings: np.ndarray,
                                    right_hand_embeddings: np.ndarray,
                                    pose_embeddings: np.ndarray,
                                    output_path: str) -> str:
        """
        Process embeddings and save to file.

        Args:
            face_embeddings: Face embeddings array
            left_hand_embeddings: Left hand embeddings array
            right_hand_embeddings: Right hand embeddings array
            pose_embeddings: Pose embeddings array
            output_path: Path to save the output file

        Returns:
            Path to the saved file
        """
        # Process embeddings
        features = self.process_embeddings(face_embeddings, left_hand_embeddings,
                                           right_hand_embeddings, pose_embeddings)

        # Create output directory if it doesn't exist
        output_dir = Path(output_path).parent
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save features
        np.save(output_path, features)

        return str(output_path)

    def process_from_files_and_save(self, face_path: str, left_hand_path: str,
                                    right_hand_path: str, pose_path: str,
                                    output_path: str) -> str:
        """
        Process embeddings from files and save results.

        Args:
            face_path: Path to face embeddings .npy file
            left_hand_path: Path to left hand embeddings .npy file
            right_hand_path: Path to right hand embeddings .npy file
            pose_path: Path to pose embeddings .npy file
            output_path: Path to save the output file

        Returns:
            Path to the saved file
        """
        # Process embeddings
        features = self.process_embeddings_from_files(face_path, left_hand_path,
                                                      right_hand_path, pose_path)

        # Create output directory if it doesn't exist
        output_dir = Path(output_path).parent
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save features
        np.save(output_path, features)

        return str(output_path)


class SHuBERTTextGenerator:
    """
    A class that combines SHuBERT feature extraction with ByT5 text generation.
    """

    def __init__(self, shubert_checkpoint: str, byt5_model_name: str = "google/byt5-base",
                 device: Optional[str] = None):
        """
        Initialize with SHuBERT and ByT5 models.

        Args:
            shubert_checkpoint: Path to SHuBERT model checkpoint
            byt5_model_name: Name of ByT5 model (default: "google/byt5-base")
            device: Device to use ('cuda' or 'cpu')
        """
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize SHuBERT processor
        self.shubert_processor = SHubertProcessor(shubert_checkpoint, self.device)

        # Initialize ByT5 model
        self.tokenizer = ByT5Tokenizer.from_pretrained(byt5_model_name)
        self.model = ByT5ForConditionalGeneration.from_pretrained(byt5_model_name).to(self.device)

    def generate_text(self, face_embeddings: np.ndarray,
                      left_hand_embeddings: np.ndarray,
                      right_hand_embeddings: np.ndarray,
                      pose_embeddings: np.ndarray,
                      max_length: int = 1024,
                      num_beams: int = 5) -> str:
        """
        Generate text from multi-modal embeddings.

        Args:
            face_embeddings: Face embeddings array
            left_hand_embeddings: Left hand embeddings array
            right_hand_embeddings: Right hand embeddings array
            pose_embeddings: Pose embeddings array
            max_length: Maximum length of generated text
            num_beams: Number of beams for beam search

        Returns:
            Generated text string
        """
        # Get SHuBERT features
        features = self.shubert_processor.process_embeddings(
            face_embeddings, left_hand_embeddings, right_hand_embeddings, pose_embeddings)

        # Select features from a specific layer (default: last layer)
        features = features[-1]  # Shape: [T, D]

        # Convert to tensor and add batch dimension
        features = torch.from_numpy(features).float().unsqueeze(0).to(self.device)

        # Generate text
        generated_ids = self.model.generate(
            inputs_embeds=features,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True
        )

        # Decode generated tokens to text
        return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)


def generate_text_from_features(face_embeddings: np.ndarray,
                                left_hand_embeddings: np.ndarray,
                                right_hand_embeddings: np.ndarray,
                                pose_embeddings: np.ndarray,
                                shubert_checkpoint: str,
                                byt5_model_name: str = "google/byt5-base",
                                max_length: int = 1024,
                                num_beams: int = 5) -> str:
    """
    Convenience function to generate text from features.
    """
    generator = SHuBERTTextGenerator(shubert_checkpoint, byt5_model_name)
    return generator.generate_text(
        face_embeddings, left_hand_embeddings, right_hand_embeddings, pose_embeddings,
        max_length=max_length, num_beams=num_beams
    )


# Convenience functions for backward compatibility
def process_shubert_embeddings(face_embeddings: np.ndarray,
                               left_hand_embeddings: np.ndarray,
                               right_hand_embeddings: np.ndarray,
                               pose_embeddings: np.ndarray,
                               checkpoint_path: str) -> np.ndarray:
    """
    Convenience function to process embeddings through SHubert.

    Args:
        face_embeddings: Face embeddings array
        left_hand_embeddings: Left hand embeddings array
        right_hand_embeddings: Right hand embeddings array
        pose_embeddings: Pose embeddings array
        checkpoint_path: Path to the SHubert model checkpoint

    Returns:
        Numpy array of SHubert features
    """
    processor = SHubertProcessor(checkpoint_path)
    return processor.process_embeddings(face_embeddings, left_hand_embeddings,
                                        right_hand_embeddings, pose_embeddings)


def process_sample(model: SHubertModel, face_path: str, left_hand_path: str,
                   right_hand_path: str, body_posture_path: str) -> np.ndarray:
    """
    Original function for backward compatibility with command-line usage.
    """
    # Load numpy arrays
    face_np = np.load(face_path)
    left_hand_np = np.load(left_hand_path)
    right_hand_np = np.load(right_hand_path)
    body_posture_np = np.load(body_posture_path)

    face = torch.from_numpy(face_np).float().cuda()
    left_hand = torch.from_numpy(left_hand_np).float().cuda()
    right_hand = torch.from_numpy(right_hand_np).float().cuda()
    body_posture = torch.from_numpy(body_posture_np).float().cuda()

    length = face.shape[0]

    # Prepare input
    source = [{
        "face": face,
        "left_hand": left_hand,
        "right_hand": right_hand,
        "body_posture": body_posture,
        # Add dummy labels to match the expected input format
        "label_face": torch.zeros((length, 1)).cuda(),
        "label_left_hand": torch.zeros((length, 1)).cuda(),
        "label_right_hand": torch.zeros((length, 1)).cuda(),
        "label_body_posture": torch.zeros((length, 1)).cuda()
    }]

    # Extract features
    with torch.no_grad():
        result = model.extract_features(source, padding_mask=None, kmeans_labels=None, mask=False)

    # Extract layer outputs
    layer_outputs = []
    for layer in result['layer_results']:
        # layer_output has shape [T, B, D]
        # Since batch size B is 1, we can squeeze it
        layer_output = layer[-1]
        layer_output = layer_output.squeeze(1)  # Shape: [T, D]
        layer_outputs.append(layer_output.cpu().numpy())  # Convert to NumPy array

    # Stack the outputs from all layers to get an array of shape [L, T, D]
    features = np.stack(layer_outputs, axis=0)  # Shape: [L, T, D]
    return features


def load_model(checkpoint_path: str) -> SHubertModel:
    """
    Original function for backward compatibility with command-line usage.
    """
    cfg = SHubertConfig()

    # Initialize the model
    model = SHubertModel(cfg)

    # Load the checkpoint
    checkpoint = torch.load(checkpoint_path)

    # If the checkpoint is saved with a 'model' key
    if 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint

    # Load the state dictionary into the model
    model.load_state_dict(state_dict, strict=False)

    model.eval()
    model.cuda()  # Move to GPU if available
    return model


def main(csv_list: List[List[str]], checkpoint_path: str, output_dir: str, index: int):
    """
    Original main function for backward compatibility with command-line usage.
    """
    model = load_model(checkpoint_path)

    os.makedirs(output_dir, exist_ok=True)

    for row in csv_list:
        cues_list = row[0].split('\t')
        face_path, left_hand_path, right_hand_path, body_posture_path = cues_list[0], cues_list[1], cues_list[2], cues_list[3]

        output_filename = f"{os.path.basename(face_path).rsplit('.', 1)[0].rsplit('_', 1)[0]}.npy"
        output_path = os.path.join(output_dir, output_filename)

        # Check if the output file already exists
        if os.path.exists(output_path):
            print(f"Skipping {output_path} as it already exists")
            continue

        # Process the sample
        features = process_sample(model, face_path, left_hand_path, right_hand_path, body_posture_path)

        np.save(output_path, features)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the sub_list to work with')
    parser.add_argument('--csv_path', type=str, required=True,
                        help='path to the CSV file')
    parser.add_argument('--checkpoint_path', type=str, required=True,
                        help='path to the checkpoint file')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='directory to save output files')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='batch size for processing')

    args = parser.parse_args()
    index = args.index
    csv_path = args.csv_path
    checkpoint_path = args.checkpoint_path
    output_dir = args.output_dir
    batch_size = int(args.batch_size)

    # Make the output dir
    os.makedirs(output_dir, exist_ok=True)

    # Load CSV data
    fixed_list = []
    with open(csv_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            fixed_list.append(row)

    # Process in batches
    video_batches = [fixed_list[i:i + batch_size] for i in range(0, len(fixed_list), batch_size)]

    csv_list = video_batches[index]
    main(csv_list, checkpoint_path, output_dir, index)