fix eva in data_utils
data_utils.py (+34 -18)
@@ -1,11 +1,29 @@
 import decord
-import numpy as np
-import torch
-from PIL import Image
 import random
+import numpy as np
+from PIL import Image
+
+import torch
+from torchvision.transforms import Normalize, Compose, InterpolationMode, ToTensor, Resize
+
+
+def _convert_to_rgb(image):
+    return image.convert('RGB')
+
+
+def image_transform(image_size: int):
+    mean = (0.48145466, 0.4578275, 0.40821073)
+    std = (0.26862954, 0.26130258, 0.27577711)
+
+    normalize = Normalize(mean=mean, std=std)
+    transforms = [
+        Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
+        _convert_to_rgb,
+        ToTensor(),
+        normalize,
+    ]
+    return Compose(transforms)
 
-from eva_clip.transform import image_transform
-image_processor = image_transform(image_size=448, is_train=False)
 
 def preprocess_multimodal(sources, num_segments):
     for source in sources:
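The inlined image_transform reproduces CLIP-style preprocessing (the mean/std values above are the standard OpenAI CLIP normalization constants), so the eva_clip dependency is no longer needed at inference time. As a minimal smoke test, assuming the diff above is applied (this snippet is illustrative, not part of the commit):

    # Illustrative check: the inlined transform yields a normalized 3x448x448 tensor.
    import numpy as np
    from PIL import Image

    processor = image_transform(image_size=448)
    frame = Image.fromarray(np.zeros((360, 640, 3), dtype=np.uint8))  # dummy frame
    tensor = processor(frame)
    print(tensor.shape)  # torch.Size([3, 448, 448])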
@@ -26,6 +44,7 @@ def preprocess_multimodal(sources, num_segments):
         sentence["content"] = sentence["content"].replace(X_token, replace_token)
     return sources
 
+
 def preprocess(
     sources,
     tokenizer,
@@ -60,9 +79,6 @@ def preprocess(
     else:
         index = random.choice(range(len(en_qa_templates)))
         system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""
-    chat_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>'
-    + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}
-    {% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}"""
     messages = []
     for source in sources:
         message = [{'role': 'system', 'content': system_prompt}]
@@ -70,14 +86,14 @@ def preprocess(
             message.append(sentence)
         messages.append(message)
 
-    #input_ids = tokenizer.apply_chat_template(messages, chat_template, add_generation_prompt=True, return_tensors='pt')
     input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
     return input_ids
-
+
+
 def get_index(fps, max_frame, num_segments):
     num_frames = max_frame
     if num_frames <= num_segments:
-        out_indices =
+        out_indices = np.array([(idx % num_frames) for idx in range(num_segments)])
        out_indices = np.sort(out_indices)
     else:
         out_indices = np.linspace(0, num_frames-1, num_segments)
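In the short-clip branch of get_index, the new code builds indices by repeating frames cyclically, so the function always returns exactly num_segments indices even when the clip has fewer frames than requested. For example, with assumed values num_frames=3 and num_segments=8 (not from the commit):

    # Illustration of the fixed branch with assumed values.
    import numpy as np
    num_frames, num_segments = 3, 8
    out_indices = np.sort(np.array([(idx % num_frames) for idx in range(num_segments)]))
    print(out_indices)  # [0 0 0 1 1 1 2 2] -- always num_segments entries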
@@ -85,22 +101,23 @@ def get_index(fps, max_frame, num_segments):
     durations = [idx.item() / fps for idx in out_indices]
     return out_indices.astype(np.int64), durations
 
+
 def read_video(video_path, num_segments):
+    image_processor = image_transform(image_size=448)
     vr = decord.VideoReader(video_path)
-    max_frame = len(vr) - 1
     fps = float(vr.get_avg_fps())
 
-
-    frame_indices, durations = get_index(fps, max_frame, num_segments)
+    frame_indices, durations = get_index(fps, len(vr) - 1, num_segments)
     video = []
     for frame_index in frame_indices:
         image = Image.fromarray(vr[frame_index].asnumpy())
         video.append(image_processor(image).unsqueeze(0))
     video = torch.concat(video)
-    return video, torch.Tensor(durations)
+    return video, torch.Tensor(durations)
+
 
 def get_input(video_path, num_segments, question, history, tokenizer, s_id):
-    video, durations
+    video, durations = read_video(video_path, num_segments)
     if history == None:
         conversations = []
         conversations.append({'role': 'user', 'content': f'<video>\n{question}'})
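For reference, the shapes read_video now returns, derivable from the code above (the path and segment count are placeholder values):

    # video, durations = read_video('clip.mp4', num_segments=16)
    # video.shape     -> torch.Size([16, 3, 448, 448])  (num_segments processed frames)
    # durations.shape -> torch.Size([16])               (frame timestamps in seconds)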
@@ -113,8 +130,7 @@ def get_input(video_path, num_segments, question, history, tokenizer, s_id):
 
     return video, durations, input_ids, conversations
 
+
 def add_pred_to_history(history, pred):
     history.append({'role': 'assistant', 'content': pred})
     return history
-
-
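With these fixes applied, the helpers are usable end to end. A hypothetical usage sketch follows; the checkpoint name, video path, question, and reply are placeholders, not from the commit, and the tokenizer must ship a built-in chat template for apply_chat_template to work:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('your/checkpoint')  # placeholder
    video, durations, input_ids, conversations = get_input(
        'clip.mp4', 16, 'Describe the video.',
        history=None, tokenizer=tokenizer, s_id=0,
    )
    # ...run the model on (video, input_ids), then keep its reply for turn two:
    history = add_pred_to_history(conversations, '<model reply>')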