DoctorSlimm committed on
Commit
9cb2953
·
verified ·
1 Parent(s): a539d3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -88
app.py CHANGED
@@ -30,16 +30,45 @@ model = AutoModelForCausalLM.from_pretrained(
30
  ).to(DEVICE).eval()
31
 
32
 
 
 
 
 
 
 
33
  @spaces.GPU
34
  def generate_caption(image, prompt):
 
35
 
36
  # Process the image and the prompt
37
- text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
38
- # inputs = processor(texts=[prompt], images=[image], return_tensors="pt").to('cuda') # move inputs to cuda
39
-
40
-
41
-
42
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  ## make predictions via api ##
@@ -53,85 +82,4 @@ demo = gr.Interface(
53
  )
54
 
55
  # Launch the interface
56
- demo.launch(share=True)
57
-
58
-
59
-
60
- ####### ML CODE #######
61
- import torch
62
- from PIL import Image
63
- from transformers import AutoModelForCausalLM, AutoTokenizer
64
-
65
- MODEL_PATH = "THUDM/cogvlm2-llama3-chat-19B"
66
- DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
67
- TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
68
-
69
- tokenizer = AutoTokenizer.from_pretrained(
70
- MODEL_PATH,
71
- trust_remote_code=True
72
- )
73
- model = AutoModelForCausalLM.from_pretrained(
74
- MODEL_PATH,
75
- torch_dtype=TORCH_TYPE,
76
- trust_remote_code=True,
77
- ).to(DEVICE).eval()
78
-
79
- text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
80
-
81
- while True:
82
- image_path = input("image path >>>>> ")
83
- if image_path == '':
84
- print('You did not enter image path, the following will be a plain text conversation.')
85
- image = None
86
- text_only_first_query = True
87
- else:
88
- image = Image.open(image_path).convert('RGB')
89
-
90
- history = []
91
-
92
- while True:
93
- query = input("Human:")
94
- if query == "clear":
95
- break
96
-
97
- if image is None:
98
- if text_only_first_query:
99
- query = text_only_template.format(query)
100
- text_only_first_query = False
101
- else:
102
- old_prompt = ''
103
- for _, (old_query, response) in enumerate(history):
104
- old_prompt += old_query + " " + response + "\n"
105
- query = old_prompt + "USER: {} ASSISTANT:".format(query)
106
- if image is None:
107
- input_by_model = model.build_conversation_input_ids(
108
- tokenizer,
109
- query=query,
110
- history=history,
111
- template_version='chat'
112
- )
113
- else:
114
- input_by_model = model.build_conversation_input_ids(
115
- tokenizer,
116
- query=query,
117
- history=history,
118
- images=[image],
119
- template_version='chat'
120
- )
121
- inputs = {
122
- 'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
123
- 'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
124
- 'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
125
- 'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if image is not None else None,
126
- }
127
- gen_kwargs = {
128
- "max_new_tokens": 2048,
129
- "pad_token_id": 128002,
130
- }
131
- with torch.no_grad():
132
- outputs = model.generate(**inputs, **gen_kwargs)
133
- outputs = outputs[:, inputs['input_ids'].shape[1]:]
134
- response = tokenizer.decode(outputs[0])
135
- response = response.split("<|end_of_text|>")[0]
136
- print("\nCogVLM2:", response)
137
- history.append((query, response))
 
30
  ).to(DEVICE).eval()
31
 
32
 
33
+
34
+ text_only_template = """A chat between a curious user and an artificial intelligence assistant. \
35
+ The assistant gives helpful, detailed, and polite answers to the user's questions. \
36
+ USER: {} ASSISTANT:"""
37
+
38
+
39
  @spaces.GPU
40
  def generate_caption(image, prompt):
41
+ print(DEVICE)
42
 
43
  # Process the image and the prompt
44
+
45
+ # image = Image.open(image_path).convert('RGB')
46
+ image = image.convert('RGB')
47
+ query = text_only_template.format(query)
48
+ input_by_model = model.build_conversation_input_ids(
49
+ tokenizer,
50
+ query=query,
51
+ history=[],
52
+ images=[image],
53
+ template_version='chat'
54
+ )
55
+ inputs = {
56
+ 'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
57
+ 'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
58
+ 'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
59
+ 'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if image is not None else None,
60
+ }
61
+ gen_kwargs = {
62
+ "max_new_tokens": 2048,
63
+ "pad_token_id": 128002,
64
+ }
65
+ with torch.no_grad():
66
+ outputs = model.generate(**inputs, **gen_kwargs)
67
+ outputs = outputs[:, inputs['input_ids'].shape[1]:]
68
+ response = tokenizer.decode(outputs[0])
69
+ response = response.split("<|end_of_text|>")[0]
70
+ print("\nCogVLM2:", response)
71
+ return response
72
 
73
 
74
  ## make predictions via api ##
 
82
  )
83
 
84
  # Launch the interface
85
+ demo.launch(share=True)