jw2yang committed
Commit 25537d2
Parent(s): 4342ccf
Files changed (2):
  1. config.json +1 -6
  2. image_processing_magma.py +0 -76
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/home/jianwyan/projects/ProjectWillow/azureblobs/projects4jw_model/magma/checkpoints/finetune-none-bs8-ep5-bimsz512-ncrops4-anyrescrop-seqlen3072-1e-5-constant-0.0_openx_magma_trace_coin_howto100m_ego4d_sthv2_epic_seeclick_llava_sharegpt4v_vision2ui_-1_iseTrue_ihTrue_tseFalse_tsdTrue_rtptsTrue_qsz256-nnodes12-zero1/checkpoint-12000",
+  "_name_or_path": "Magma-8B",
   "architectures": [
     "MagmaForConditionalGeneration"
   ],
@@ -118,10 +118,6 @@
   "transformers_version": "4.44.1",
   "use_cache": false,
   "vision_config": {
-    "_name_or_path": "/mnt/model/llms/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45",
-    "architectures": [
-      "LlavaLlamaForCausalLM"
-    ],
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": 128000,
@@ -145,7 +141,6 @@
     "mm_vision_select_feature": "patch",
     "mm_vision_select_layer": -2,
     "mm_vision_tower": "segtokv9_xxlarge",
-    "model_type": "llava_llama",
     "num_attention_heads": 32,
     "num_hidden_layers": 32,
     "num_key_value_heads": 8,
image_processing_magma.py CHANGED
@@ -43,82 +43,6 @@ if is_vision_available():
     import torch
     import torchvision
 
-def padding_336(b):
-    width, height = b.size
-    tar = int(np.ceil(height / 336) * 336)
-    top_padding = int((tar - height)/2)
-    bottom_padding = tar - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
-
-    return b
-
-def calc_padded_size(width, height, padding_unit=336):
-    target_height = int(np.ceil(height / padding_unit) * padding_unit)
-    top_padding = int((target_height - height) / 2)
-    bottom_padding = target_height - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    padded_width = width + left_padding + right_padding
-    padded_height = height + top_padding + bottom_padding
-    return padded_width, padded_height
-
-def HD_transform(img, hd_num=4, base_img_size=768):
-    width, height = img.size
-    trans = False
-    if width < height:
-        img = img.transpose(Image.TRANSPOSE)
-        trans = True
-        width, height = img.size
-    ratio = (width / height)
-    scale = 1
-    while scale*np.ceil(scale/ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-    new_w = int(scale * base_img_size)
-    new_h = int(new_w / ratio)
-
-    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
-    img = padding_336(img)
-    width, height = img.size
-    if trans:
-        img = img.transpose(Image.TRANSPOSE)
-
-    return img
-
-def calc_hd_transform_size(width, height, hd_num=16):
-    transposed = False
-    if width < height:
-        width, height = height, width
-        transposed = True
-
-    ratio = width / height
-    scale = 1
-    while scale * np.ceil(scale / ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-
-    new_width = int(scale * 336)
-    new_height = int(new_width / ratio)
-
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
-
-    if transposed:
-        padded_width, padded_height = padded_height, padded_width
-
-    return padded_width, padded_height
-
-def pad_to_max_num_crops_tensor(images, max_crops=5):
-    """
-    images: B x 3 x H x W, B<=max_crops
-    """
-    B, _, H, W = images.shape
-    if B < max_crops:
-        pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
-        images = torch.cat([images, pad], dim=0)
-    return images
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
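The crop-count arithmetic in the removed HD_transform helper is easy to check by hand. A minimal sketch of just its scale search, restated from the loop above with an illustrative input (hypothetical helper name, same logic):

    import numpy as np

    def hd_scale(width, height, hd_num=4):
        # Assumes width >= height; HD_transform transposes portrait images first.
        ratio = width / height
        scale = 1
        while scale * np.ceil(scale / ratio) <= hd_num:
            scale += 1
        return scale - 1

    # A 1024x768 landscape image with hd_num=4 gives scale=2: the image is
    # resized to 2 * base_img_size = 1536 px wide (~1152 px tall), and
    # padding_336 then pads the height up to the next multiple of 336 (1344 px).
    print(hd_scale(1024, 768))  # 2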
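For context, select_best_resolution (kept by this commit) follows the usual LLaVA-style selection rule: pick the candidate resolution that preserves the most effective image area after downscaling while wasting the least padding. An illustrative re-implementation of that rule, not the file's exact code (the (width, height) argument order is an assumption):

    import math

    def select_best_resolution_sketch(original_size, possible_resolutions):
        # original_size and candidates are (width, height) pairs in this sketch.
        original_width, original_height = original_size
        best_fit = None
        max_effective = 0
        min_wasted = math.inf
        for width, height in possible_resolutions:
            scale = min(width / original_width, height / original_height)
            downscaled_w = int(original_width * scale)
            downscaled_h = int(original_height * scale)
            # Effective area: how much of the original image survives the downscale.
            effective = min(downscaled_w * downscaled_h, original_width * original_height)
            # Wasted area: padding needed to fill the candidate canvas.
            wasted = width * height - effective
            if effective > max_effective or (effective == max_effective and wasted < min_wasted):
                max_effective = effective
                min_wasted = wasted
                best_fit = (width, height)
        return best_fit

    # e.g. a 1000x700 image choosing among three candidate canvases:
    print(select_best_resolution_sketch((1000, 700), [(768, 768), (1536, 768), (768, 1536)]))
    # -> (1536, 768): it keeps the full image area with the least padding.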