File size: 13,628 Bytes
ceeabec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
import cv2
import numpy as np
import os
import pickle
import gzip
from datetime import datetime
from pathlib import Path
import decord
import argparse
import json
import glob
import time
from typing import Dict, List, Optional, Tuple, Union, Any


class PoseProcessor:
    """
    Convert per-frame pose landmarks into fixed-width numpy feature rows.

    Each frame is reduced to the (x, y) coordinates of the landmarks listed in
    ``pose_indices`` and flattened to ``len(pose_indices) * 2`` values. Frames
    without a usable pose are represented by ``fill_missing_value``.

    Attributes:
        pose_indices: Landmark indices extracted from every frame.
        normalize_keypoints: Whether coordinates are mapped into signing space.
        fill_missing_value: Sentinel written for frames without a usable pose.
        coords_per_keypoint: Number of coordinates kept per keypoint (x, y).
        output_shape: ``(len(pose_indices), coords_per_keypoint)``.
    """

    def __init__(self, pose_indices: Optional[List[int]] = None,
                 normalize_keypoints: bool = True, fill_missing_value: float = -9999.0):
        """
        Initialize the PoseProcessor.

        Args:
            pose_indices: List of pose landmark indices to extract.
                         Default is [0,11,12,13,14,15,16] (nose, shoulders, elbows, wrists).
                         An empty list also selects the default.
            normalize_keypoints: Whether to normalize keypoints to signing space
            fill_missing_value: Value to use for missing keypoints
        """
        # Copy the caller's list so later mutation cannot change the row layout.
        self.pose_indices = list(pose_indices) if pose_indices else [0, 11, 12, 13, 14, 15, 16]
        self.normalize_keypoints = normalize_keypoints
        self.fill_missing_value = fill_missing_value

        # Number of coordinates per keypoint (x, y)
        self.coords_per_keypoint = 2
        self.output_shape = (len(self.pose_indices), self.coords_per_keypoint)

    def _missing_frame(self) -> np.ndarray:
        """Return a flattened row filled with ``fill_missing_value``."""
        return np.full(self.output_shape, self.fill_missing_value).flatten()

    def normalize_pose_keypoints(self, pose_landmarks: List[List[float]]) -> List[List[float]]:
        """
        Normalize pose keypoints to signing space.

        The signing space is a box 6 head units wide and 7 head units tall,
        where one head unit is half the shoulder-to-shoulder distance. The box
        is horizontally centred on the nose and starts half a head unit above
        the left eye. Coordinates are mapped so the box spans [-0.5, 0.5] on
        both axes.

        Args:
            pose_landmarks: List of pose landmarks from MediaPipe; must contain
                at least indices 0 (nose), 2 (left eye), 11 and 12 (shoulders).

        Returns:
            List of normalized [x, y] pairs, one per input landmark. If the
            shoulders coincide (head unit is zero or not a number) every pair
            is [nan, nan], because no meaningful scale exists.
        """
        # Extract relevant landmarks for normalization
        left_shoulder = np.array(pose_landmarks[11][:2])
        right_shoulder = np.array(pose_landmarks[12][:2])
        left_eye = np.array(pose_landmarks[2][:2])
        nose = np.array(pose_landmarks[0][:2])

        # Calculate head unit in normalized space
        head_unit = np.linalg.norm(right_shoulder - left_shoulder) / 2

        # Degenerate pose: dividing by a zero-sized signing space would yield
        # a mix of inf/nan, so report the frame as non-finite explicitly.
        if not head_unit > 0:
            return [[float("nan"), float("nan")] for _ in pose_landmarks]

        # Define signing space dimensions in normalized space
        signing_space_width = 6 * head_unit
        signing_space_height = 7 * head_unit

        # Top-left corner of the signing space bounding box
        signing_space_top = left_eye[1] - 0.5 * head_unit
        signing_space_left = nose[0] - signing_space_width / 2

        # Compose: move box origin to (0, 0), scale box to unit size,
        # then shift so the box centre sits at the origin.
        translation_matrix = np.array([[1, 0, -signing_space_left],
                                       [0, 1, -signing_space_top],
                                       [0, 0, 1]])
        scale_matrix = np.array([[1 / signing_space_width, 0, 0],
                                 [0, 1 / signing_space_height, 0],
                                 [0, 0, 1]])
        shift_matrix = np.array([[1, 0, -0.5],
                                 [0, 1, -0.5],
                                 [0, 0, 1]])
        transformation_matrix = shift_matrix @ scale_matrix @ translation_matrix

        # Apply the transformation to every keypoint in homogeneous coordinates
        return [
            (transformation_matrix @ np.array([landmark[0], landmark[1], 1]))[:2].tolist()
            for landmark in pose_landmarks
        ]

    def process_frame_landmarks(self, frame_landmarks: Optional[Dict[str, Any]]) -> np.ndarray:
        """
        Process landmarks for a single frame.

        Args:
            frame_landmarks: Dictionary containing pose landmarks for one frame
                under the key 'pose_landmarks' (a list of detections; only the
                first detection is used), or None.

        Returns:
            Flattened numpy array of length ``len(pose_indices) * 2``. The
            array is filled with ``fill_missing_value`` when the frame has no
            pose (None, missing/empty 'pose_landmarks') or when the resulting
            coordinates are not finite (e.g. a degenerate normalization).
        """
        if frame_landmarks is None:
            return self._missing_frame()

        detections = frame_landmarks.get('pose_landmarks')
        # An empty detection list means "no pose found", same as None.
        if detections is None or len(detections) == 0:
            return self._missing_frame()

        # Only the first detected pose is used
        pose_landmarks = detections[0]

        if self.normalize_keypoints:
            # Normalize every landmark so any index in pose_indices is available.
            landmarks = self.normalize_pose_keypoints(pose_landmarks)
        else:
            landmarks = pose_landmarks

        # Keep only (x, y) of the selected landmarks so every row has the same
        # width regardless of extra per-landmark values (z, visibility, ...).
        width = self.coords_per_keypoint
        selected_landmarks = [list(landmarks[i][:width]) for i in self.pose_indices]
        frame_keypoints = np.array(selected_landmarks, dtype=float).flatten()

        # Never leak NaN/inf into the features; mark the frame as missing instead.
        if not np.all(np.isfinite(frame_keypoints)):
            return self._missing_frame()

        return frame_keypoints

    def process_landmarks_sequence(self, landmarks_data: Dict[int, Any]) -> np.ndarray:
        """
        Process landmarks for an entire sequence (video).

        Frames are indexed 0..max(landmarks_data). A frame whose entry is
        absent or None repeats the most recent valid pose (or the missing
        value if none has been seen yet). A frame that is present but has no
        usable pose is filled with the missing value.

        Args:
            landmarks_data: Dictionary mapping frame index to frame landmarks

        Returns:
            Numpy array of shape (num_frames, num_keypoints * 2), or an empty
            array when ``landmarks_data`` is empty.
        """
        if not landmarks_data:
            return np.array([])

        num_frames = max(landmarks_data.keys()) + 1

        video_pose_landmarks = []
        prev_pose = None

        for i in range(num_frames):
            frame_landmarks = landmarks_data.get(i, None)

            if frame_landmarks is None:
                # Use previous pose if available, otherwise use missing values
                frame_keypoints = prev_pose if prev_pose is not None else self._missing_frame()
            else:
                frame_keypoints = self.process_frame_landmarks(frame_landmarks)
                # Only remember frames that actually contain a pose
                if not np.all(frame_keypoints == self.fill_missing_value):
                    prev_pose = frame_keypoints

            video_pose_landmarks.append(frame_keypoints)

        # Post-processing hook (_apply_post_processing) is intentionally not applied.
        return np.array(video_pose_landmarks)

    def _apply_post_processing(self, pose_array: np.ndarray) -> np.ndarray:
        """
        Apply post-processing to the pose array (currently a no-op).

        Earlier code overwrote the left and right wrist coordinates (landmarks
        15 and 16) with the missing value. That masking is disabled, so the
        input is returned unchanged.

        Args:
            pose_array: Input pose array

        Returns:
            The same pose array, unmodified
        """
        return pose_array

    def process_landmarks_from_file(self, pose_file_path: str) -> np.ndarray:
        """
        Process landmarks from a JSON file.

        Args:
            pose_file_path: Path to the pose landmarks JSON file; its top-level
                object maps frame index (as a string) to frame landmarks.

        Returns:
            Numpy array of processed pose keypoints, or an empty array if the
            file could not be read or processed (the error is printed).
        """
        # Best-effort boundary: any failure is reported and mapped to "no data".
        try:
            with open(pose_file_path, 'r', encoding='utf-8') as f:
                landmarks_data = json.load(f)

            # JSON object keys are strings; frame indices must be integers
            landmarks_data = {int(k): v for k, v in landmarks_data.items()}

            return self.process_landmarks_sequence(landmarks_data)

        except Exception as e:
            print(f"Error processing {pose_file_path}: {e}")
            return np.array([])

    def process_and_save_landmarks(self, landmarks_data: Dict[int, Any],
                                  output_path: str, filename: str) -> str:
        """
        Process landmarks and save to file.

        Args:
            landmarks_data: Dictionary containing landmarks for each frame
            output_path: Directory to save the processed landmarks
            filename: Name for the output file (without extension)

        Returns:
            Path to the saved ``.npy`` file
        """
        processed_landmarks = self.process_landmarks_sequence(landmarks_data)

        # Create output directory if it doesn't exist
        output_dir = Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

        save_path = output_dir / f"{filename}.npy"
        np.save(save_path, processed_landmarks)

        return str(save_path)


# Convenience functions for backward compatibility
def process_pose_landmarks(landmarks_data: Dict[int, Any], 
                          normalize: bool = True, 
                          pose_indices: Optional[List[int]] = None) -> np.ndarray:
    """
    Build a one-off PoseProcessor and run it over a landmark sequence.

    Args:
        landmarks_data: Per-frame landmarks keyed by frame index
        normalize: Forwarded as ``normalize_keypoints`` to the processor
        pose_indices: Landmark indices to extract (processor default if None)

    Returns:
        Whatever ``PoseProcessor.process_landmarks_sequence`` returns for
        ``landmarks_data``
    """
    return PoseProcessor(
        normalize_keypoints=normalize,
        pose_indices=pose_indices,
    ).process_landmarks_sequence(landmarks_data)


def keypoints_to_numpy(pose_file: str, pose_emb_path: str):
    """
    Convert one pose JSON file into a ``.npy`` file inside ``pose_emb_path``.

    The output is named after the stem of ``pose_file``. Nothing is written
    when processing yields an empty array. Any exception is printed rather
    than propagated. Kept for backward compatibility with command-line usage.
    """
    try:
        landmarks = PoseProcessor().process_landmarks_from_file(pose_file)

        # Empty result: nothing worth saving
        if landmarks.size == 0:
            return

        target = Path(pose_emb_path) / f"{Path(pose_file).stem}.npy"
        target.parent.mkdir(parents=True, exist_ok=True)
        np.save(target, landmarks)

    except Exception as e:
        print(f"Error processing {pose_file}: {e}")


# Utility functions for batch processing
def get_mp4_files(directory: str) -> List[str]:
    """
    Return absolute paths of the ``*.mp4`` files directly inside ``directory``.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    if os.path.exists(directory):
        pattern = os.path.join(directory, '*.mp4')
        return list(map(os.path.abspath, glob.glob(pattern)))

    raise FileNotFoundError(f'Directory not found: {directory}')


def load_file(filename: str):
    """
    Read a gzip-compressed pickle from ``filename`` and return the object.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files from a trusted source.
    """
    with gzip.open(filename, "rb") as stream:
        payload = pickle.load(stream)
    return payload


def is_string_in_file(file_path: str, target_string: str) -> bool:
    """
    Report whether any line of the text file contains ``target_string``.

    Scanning stops at the first matching line. If the file cannot be read
    (or any other exception occurs), the error is printed and False is
    returned.
    """
    try:
        with Path(file_path).open("r") as handle:
            return any(target_string in text_line for text_line in handle)
    except Exception as e:
        print(f"Error: {e}")
        return False


def main():
    """
    Command-line entry point: convert one batch of pose JSON files to .npy.

    The files list (a gzipped pickle) is split into consecutive batches of
    ``--batch_size`` entries and only batch number ``--index`` is processed.
    Files whose output already exists are skipped, and processing stops once
    ``--time_limit`` seconds have elapsed since start-up.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', type=int, required=True,
                        help='index of the batch (sub-list) to work with')
    parser.add_argument('--files_list', type=str, required=True,
                        help='path to the gzipped pickle holding the list of pose JSON files')
    parser.add_argument('--pose_features_path', type=str, required=True,
                        help='directory where the .npy pose features are written')
    parser.add_argument('--batch_size', type=int, required=True,
                        help='number of files per batch')
    parser.add_argument('--time_limit', type=int, required=True,
                        help='time limit in seconds')

    args = parser.parse_args()
    start_time = time.time()

    # A non-positive batch size cannot partition the list; fail with a usage error.
    if args.batch_size <= 0:
        parser.error('--batch_size must be a positive integer')

    # Load files list
    fixed_list = load_file(args.files_list)

    # Split the list into consecutive batches
    video_batches = [fixed_list[i:i + args.batch_size]
                     for i in range(0, len(fixed_list), args.batch_size)]

    # An index past the last batch simply has no work (common for array jobs).
    try:
        batch = video_batches[args.index]
    except IndexError:
        print(f"Index {args.index} is out of range for {len(video_batches)} batches. Nothing to do.")
        return

    for pose_file in batch:
        pose_file_path = Path(pose_file)
        output_path = Path(args.pose_features_path) / f"{pose_file_path.stem}.npy"

        if output_path.exists():
            print(f"Skipping {pose_file} - output already exists")
            continue

        if time.time() - start_time > args.time_limit:
            print("Time limit reached. Stopping execution.")
            break

        try:
            print(f"Processing {pose_file}")
            keypoints_to_numpy(pose_file, args.pose_features_path)
        except Exception as e:
            print(f"Error processing {pose_file}: {e}")
            continue

        # keypoints_to_numpy reports its own errors and may write nothing,
        # so only claim success when the output file is really there.
        if output_path.exists():
            print(f"Successfully processed {pose_file}")
        else:
            print(f"No output written for {pose_file}")


# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()