tingwei.wang committed
Commit db580a6 · 1 Parent(s): 07a5d4d

Add application file

Files changed (2):
  1. app.py +87 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,87 @@
+ import gradio as gr
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer
+ from transformers.image_utils import load_image
+ from threading import Thread
+ import time
+ import torch
+ import spaces
+
+ MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+ ).to("cpu").eval()
+
+ @spaces.GPU
+ def model_inference(input_dict, history):
+     text = input_dict["text"]
+     files = input_dict["files"]
+
+     # Load images if provided
+     if len(files) > 1:
+         images = [load_image(image) for image in files]
+     elif len(files) == 1:
+         images = [load_image(files[0])]
+     else:
+         images = []
+
+     # Validate input: a text query is required, images are optional
+     if text == "" and not images:
+         raise gr.Error("Please input a query and optionally image(s).")
+     if text == "" and images:
+         raise gr.Error("Please input a text query along with the image(s).")
+
+     # Prepare messages for the model
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 *[{"type": "image", "image": image} for image in images],
+                 {"type": "text", "text": text},
+             ],
+         }
+     ]
+
+     # Apply chat template and process inputs
+     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt],
+         images=images if images else None,
+         return_tensors="pt",
+         padding=True,
+     ).to("cpu")
+
+     # Set up streamer for real-time output
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+
+     # Start generation in a separate thread so tokens can be consumed as they arrive
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     # Stream the output, yielding the accumulated response so far
+     buffer = ""
+     yield "Thinking..."
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer
+
+
+ # Example inputs
+ examples = [
+ ]
+
+ demo = gr.ChatInterface(
+     fn=model_inference,
+     description="# **Qwen2-VL-2B-Instruct**",
+     examples=examples,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+     stop_btn="Stop Generation",
+     multimodal=True,
+     cache_examples=False,
+ )
+
+ demo.launch(debug=True)
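
For a quick local check (not part of this commit), the streaming generator in app.py can be driven without the Gradio UI. A minimal sketch, assuming the definitions from app.py are already in scope (importing app as a module would call demo.launch, so either paste this below the definitions or guard the launch with an if __name__ == "__main__" block first); "cat.jpg" is a placeholder path for any local image:

# Hypothetical smoke test: drive the model_inference generator directly.
# Assumes processor, model, and model_inference from app.py are defined;
# "cat.jpg" stands in for any local image file.
request = {"text": "Describe this image.", "files": ["cat.jpg"]}

for partial in model_inference(request, history=[]):
    print(partial)  # first yield is "Thinking...", then the growing response buffer

Yielding the accumulated buffer rather than per-token deltas matches what gr.ChatInterface expects: each yield replaces the message shown so far.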
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio_client==1.3.0
+ qwen-vl-utils==0.0.2
+ transformers-stream-generator==0.0.4
+ torch==2.4.0
+ torchvision==0.19.0
+ git+https://github.com/huggingface/transformers.git
+ accelerate
+ av