rinkhanh000 committed
Commit 3f1b507
Parent: 69f57e9

Add application file

Files changed (2)
  1. app.py +89 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ import gradio as gr
+ from transformers import AutoProcessor, AutoTokenizer
+ from qwen_vl_utils import process_vision_info
+ from transformers import Qwen2_5_VLForConditionalGeneration
+ import torch
+ from PIL import Image
+
+ # ImageNet constants (not used in this code, kept for reference)
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)
+
+ # Load model and processor
+ model_name = "rinkhanh000/Qwen2.5VL-7B_ViMemeCap"
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     model_name,
+     torch_dtype=torch.float32,  # use float32 for CPU inference
+     trust_remote_code=True
+ ).eval()  # no device_map or .cuda(): the model stays on CPU
+
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+ # Prediction function
+ def predict_from_prompt_and_image(prompt, image):
+     if not prompt or image is None:
+         return "Error: please provide both a prompt and an image."
+     try:
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image",
+                         "image": image  # PIL image from Gradio
+                     },
+                     {
+                         "type": "text",
+                         "text": prompt  # user's text input
+                     }
+                 ]
+             }
+         ]
+
+         # Prepare inputs for inference
+         text = processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt"
+         )
+         # No .to("cuda"): keep all tensors on CPU
+
+         # Generate the response with beam search
+         generation_config = {
+             "max_new_tokens": 512,
+             "do_sample": False,  # disable sampling so num_beams drives beam search
+             "num_beams": 3,
+             "repetition_penalty": 3.5
+         }
+         generated_ids = model.generate(**inputs, **generation_config)
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         response = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )[0]
+         return response
+
+     except Exception as e:
+         return f"Error: failed to process: {e}"
+
+ # Gradio interface
+ demo = gr.Interface(
+     fn=predict_from_prompt_and_image,
+     inputs=[
+         gr.Textbox(label="Enter Prompt"),
+         gr.Image(label="Upload Image", type="pil")
+     ],
+     outputs=gr.Textbox(label="Generated Caption"),
+     title="ViMemeCap",
+     allow_flagging="never"
+ )
+
+ # Launch the interface
+ demo.launch()
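
For reference, here is a minimal sketch (not part of this commit) of calling the running demo programmatically. It assumes the default local Gradio URL, a gradio_client version that provides handle_file, and a hypothetical image path:

from gradio_client import Client, handle_file

# Connect to the locally launched app (default Gradio address; an assumption)
client = Client("http://127.0.0.1:7860/")
caption = client.predict(
    "Describe this meme.",      # prompt textbox input
    handle_file("meme.jpg"),    # hypothetical local image file
    api_name="/predict"         # default endpoint name for a gr.Interface
)
print(caption)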
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ transformers
+ torch
+ pillow
+ torchvision
+ qwen-vl-utils
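
As a quick sanity check before launching app.py, one could confirm that the six pinned packages import cleanly; a minimal sketch, assuming the listed distributions map to these module names:

import PIL
import gradio
import qwen_vl_utils  # provides process_vision_info used in app.py
import torch
import torchvision
import transformers

# Print a few versions to aid debugging (illustrative, not part of the commit)
print("gradio", gradio.__version__)
print("transformers", transformers.__version__)
print("torch", torch.__version__)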