rinkhanh000 committed
Commit 35ccbb5 · 1 Parent(s): 3f1b507
Files changed (2)
  1. app.py +75 -54
  2. requirements.txt +5 -6
app.py CHANGED
@@ -1,75 +1,96 @@
  import gradio as gr
- from transformers import AutoProcessor, AutoTokenizer
- from qwen_vl_utils import process_vision_info
- from transformers import Qwen2_5_VLForConditionalGeneration
+ from transformers import AutoModel, AutoProcessor, AutoTokenizer
  import torch
  from PIL import Image
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode

- # ImageNet constants (not used in this code, kept for reference)
+ # ImageNet constants
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
  IMAGENET_STD = (0.229, 0.224, 0.225)

  # Load model and processor
- model_name = "rinkhanh000/Qwen2.5VL-7B_ViMemeCap"
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+ model_name = 'rinkhanh000/Vintern-ViMemeCap'
+ model = AutoModel.from_pretrained(
      model_name,
      torch_dtype=torch.float32,  # Use float32 for CPU
+     low_cpu_mem_usage=True,
      trust_remote_code=True
- ).eval()  # No device_map or cuda
+ ).eval()  # No .cuda()
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)

- processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+ def build_transform(input_size):
+     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+     transform = T.Compose([
+         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+         T.ToTensor(),
+         T.Normalize(mean=MEAN, std=STD)
+     ])
+     return transform
+
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+     best_ratio_diff = float('inf')
+     best_ratio = (1, 1)
+     area = width * height
+     for ratio in target_ratios:
+         target_aspect_ratio = ratio[0] / ratio[1]
+         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+         if ratio_diff < best_ratio_diff:
+             best_ratio_diff = ratio_diff
+             best_ratio = ratio
+         elif ratio_diff == best_ratio_diff:
+             if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                 best_ratio = ratio
+     return best_ratio
+
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+     target_ratios = set(
+         (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
+         if i * j <= max_num and i * j >= min_num)
+     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+     target_aspect_ratio = find_closest_aspect_ratio(
+         aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+     target_width = image_size * target_aspect_ratio[0]
+     target_height = image_size * target_aspect_ratio[1]
+     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+     resized_img = image.resize((target_width, target_height))
+     processed_images = []
+     for i in range(blocks):
+         box = (
+             (i % (target_width // image_size)) * image_size,
+             (i // (target_width // image_size)) * image_size,
+             ((i % (target_width // image_size)) + 1) * image_size,
+             ((i // (target_width // image_size)) + 1) * image_size
+         )
+         split_img = resized_img.crop(box)
+         processed_images.append(split_img)
+     assert len(processed_images) == blocks
+     if use_thumbnail and len(processed_images) != 1:
+         thumbnail_img = image.resize((image_size, image_size))
+         processed_images.append(thumbnail_img)
+     return processed_images
+
+ def load_image(image_file, input_size=448, max_num=12):
+     image = Image.open(image_file).convert('RGB')
+     transform = build_transform(input_size=input_size)
+     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+     pixel_values = [transform(image) for image in images]
+     pixel_values = torch.stack(pixel_values)
+     return pixel_values

  # Prediction function
  def predict_from_prompt_and_image(prompt, image):
      if not prompt or not image:
          return {"Error": "Please provide both a prompt and an image"}
      try:
-         messages = [
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "image",
-                         "image": image  # PIL image from Gradio
-                     },
-                     {
-                         "type": "text",
-                         "text": prompt  # User's text input
-                     }
-                 ]
-             }
-         ]
-
-         # Prepare inputs for inference
-         text = processor.apply_chat_template(
-             messages, tokenize=False, add_generation_prompt=True
-         )
-         image_inputs, video_inputs = process_vision_info(messages)
-         inputs = processor(
-             text=[text],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt"
-         )
-         # No .to("cuda") - keep on CPU
-
-         # Generate response
-         generation_config = {
-             "max_new_tokens": 512,
-             "do_sample": False,  # Beam search, no sampling
-             "num_beams": 3,  # 3 beams
-             "repetition_penalty": 3.5
-         }
-         generated_ids = model.generate(**inputs, **generation_config)
-         generated_ids_trimmed = [
-             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-         ]
-         response = processor.batch_decode(
-             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-         )[0]
-         return response
-
+         generation_config = dict(max_new_tokens=512, do_sample=False, num_beams=3, repetition_penalty=3.5)
+         question = prompt.strip()
+         pixel_values = load_image(image, max_num=6).to(torch.float32)  # Use float32 for CPU
+         response = model.chat(tokenizer, pixel_values, question, generation_config)
+         return response
      except Exception as e:
          return {"Error": f"Failed to process: {str(e)}"}

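The hunk ends at the prediction function, so the Gradio UI that calls predict_from_prompt_and_image sits outside the diff context. A minimal sketch of how that wiring might look, assuming the image input is passed as a file path (load_image() calls Image.open() on its argument) and a JSON output so the error dicts render cleanly; the component choices and labels are illustrative, not taken from the repo:

demo = gr.Interface(
    fn=predict_from_prompt_and_image,               # defined in app.py above
    inputs=[
        gr.Textbox(label="Prompt"),                 # free-form question about the meme
        gr.Image(type="filepath", label="Meme image"),  # filepath so load_image() can open it
    ],
    outputs=gr.JSON(label="Caption / error"),
)

if __name__ == "__main__":
    demo.launch()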
 
requirements.txt CHANGED
@@ -1,6 +1,5 @@
- gradio
- transformers
- torch
- pillow
- torchvision
- qwen-vl-utils
+ gradio==4.44.0
+ transformers==4.44.2
+ torch==2.4.1
+ Pillow==10.4.0
+ torchvision==0.19.1