update

- config.json: +1 -6
- image_processing_magma.py: +0 -76

config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "Magma-8B",
   "architectures": [
     "MagmaForConditionalGeneration"
   ],
@@ -118,10 +118,6 @@
   "transformers_version": "4.44.1",
   "use_cache": false,
   "vision_config": {
-    "_name_or_path": "/mnt/model/llms/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45",
-    "architectures": [
-      "LlavaLlamaForCausalLM"
-    ],
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": 128000,
@@ -145,7 +141,6 @@
     "mm_vision_select_feature": "patch",
     "mm_vision_select_layer": -2,
     "mm_vision_tower": "segtokv9_xxlarge",
-    "model_type": "llava_llama",
     "num_attention_heads": 32,
     "num_hidden_layers": 32,
     "num_key_value_heads": 8,
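The config change fills in a top-level "_name_or_path" and drops stale LLaVA/Llama fields that had leaked into "vision_config" ("_name_or_path" pointing at a local Meta-Llama-3-8B-Instruct snapshot, "architectures": ["LlavaLlamaForCausalLM"], and "model_type": "llava_llama"). A minimal sketch for sanity-checking that the edited config still parses, assuming a local checkout of this repo at the placeholder path "./Magma-8B" and that the custom Magma classes need trust_remote_code:

# Sketch only: "./Magma-8B" is a placeholder path, not the actual repo id.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("./Magma-8B", trust_remote_code=True)

# vision_config may be stored as a plain dict or as a nested config object,
# depending on how the custom Magma configuration class handles it.
vision_cfg = config.vision_config
vision_dict = vision_cfg if isinstance(vision_cfg, dict) else vision_cfg.to_dict()
print(vision_dict.get("mm_vision_tower"))  # "segtokv9_xxlarge" per the diff above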
image_processing_magma.py
CHANGED
@@ -43,82 +43,6 @@ if is_vision_available():
     import torch
     import torchvision
 
-def padding_336(b):
-    width, height = b.size
-    tar = int(np.ceil(height / 336) * 336)
-    top_padding = int((tar - height)/2)
-    bottom_padding = tar - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
-
-    return b
-
-def calc_padded_size(width, height, padding_unit=336):
-    target_height = int(np.ceil(height / padding_unit) * padding_unit)
-    top_padding = int((target_height - height) / 2)
-    bottom_padding = target_height - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    padded_width = width + left_padding + right_padding
-    padded_height = height + top_padding + bottom_padding
-    return padded_width, padded_height
-
-def HD_transform(img, hd_num=4, base_img_size=768):
-    width, height = img.size
-    trans = False
-    if width < height:
-        img = img.transpose(Image.TRANSPOSE)
-        trans = True
-        width, height = img.size
-    ratio = (width / height)
-    scale = 1
-    while scale*np.ceil(scale/ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-    new_w = int(scale * base_img_size)
-    new_h = int(new_w / ratio)
-
-    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
-    img = padding_336(img)
-    width, height = img.size
-    if trans:
-        img = img.transpose(Image.TRANSPOSE)
-
-    return img
-
-def calc_hd_transform_size(width, height, hd_num=16):
-    transposed = False
-    if width < height:
-        width, height = height, width
-        transposed = True
-
-    ratio = width / height
-    scale = 1
-    while scale * np.ceil(scale / ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-
-    new_width = int(scale * 336)
-    new_height = int(new_width / ratio)
-
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
-
-    if transposed:
-        padded_width, padded_height = padded_height, padded_width
-
-    return padded_width, padded_height
-
-def pad_to_max_num_crops_tensor(images, max_crops=5):
-    """
-    images: B x 3 x H x W, B<=max_crops
-    """
-    B, _, H, W = images.shape
-    if B < max_crops:
-        pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
-        images = torch.cat([images, pad], dim=0)
-    return images
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
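For reference, the deleted helpers implemented the "HD" cropping geometry: HD_transform resized an image so its long side became a multiple of the tile size, padding_336 / calc_padded_size padded the short side up to the next multiple of 336, and calc_hd_transform_size predicted the resulting canvas without touching pixels. A standalone sketch of that arithmetic (the name removed_hd_transform_size is hypothetical; hd_num=16 and the 336-pixel tile match the deleted calc_hd_transform_size):

import numpy as np

def removed_hd_transform_size(width, height, hd_num=16, tile=336):
    # Work in landscape orientation, as the deleted code did via Image.TRANSPOSE.
    transposed = width < height
    if transposed:
        width, height = height, width
    ratio = width / height
    # Largest scale whose implied tile grid stays within hd_num tiles.
    scale = 1
    while scale * np.ceil(scale / ratio) <= hd_num:
        scale += 1
    scale -= 1
    new_width = int(scale * tile)
    new_height = int(new_width / ratio)
    # Pad the short side up to the next multiple of the tile size.
    padded_height = int(np.ceil(new_height / tile) * tile)
    return (padded_height, new_width) if transposed else (new_width, padded_height)

print(removed_hd_transform_size(1000, 700))  # -> (1344, 1008)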
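select_best_resolution itself is kept and only appears here as context. Its body is outside this diff, so the following is a sketch of the LLaVA-style selection rule its docstring describes (maximize effective resolution after aspect-preserving downscaling, then minimize wasted area), under the assumption that the retained implementation follows the upstream transformers version; sizes are (height, width) pairs:

def select_best_resolution_sketch(original_size, possible_resolutions):
    original_height, original_width = original_size
    best_fit = None
    max_effective = 0
    min_wasted = float("inf")
    for height, width in possible_resolutions:
        # Largest downscale factor that fits the original inside this candidate.
        scale = min(width / original_width, height / original_height)
        down_w, down_h = int(original_width * scale), int(original_height * scale)
        effective = min(down_w * down_h, original_width * original_height)
        wasted = width * height - effective
        if effective > max_effective or (effective == max_effective and wasted < min_wasted):
            max_effective, min_wasted, best_fit = effective, wasted, (height, width)
    return best_fit

# Example: pick a grid for a 600x800 (H x W) image from three candidate canvases.
print(select_best_resolution_sketch((600, 800), [(336, 672), (672, 672), (672, 1008)]))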