Upload folder using huggingface_hub
Browse files- .github/workflows/update_space.yml +28 -0
- .gradio/certificate.pem +31 -0
- README.md +2 -8
- main.py +214 -0
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Deploys this repository to a Hugging Face Space on every push to `main`.
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # NOTE(review): README.md pins sdk_version 5.34.0; confirm that the
          # Gradio release pip resolves for this interpreter is the one intended.
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        # Hand the secret over through the environment instead of expanding it
        # into the script text, so a token containing quotes or shell
        # metacharacters cannot break or alter the command.
        env:
          HF_TOKEN: ${{ secrets.hf_token }}
        run: python -c 'import os, huggingface_hub; huggingface_hub.login(token=os.environ["HF_TOKEN"])'

      - name: Deploy to Spaces
        run: gradio deploy
|
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.34.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: test
|
3 |
+
app_file: main.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 5.34.0
|
|
|
|
|
6 |
---
|
|
|
|
main.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
sys.path.append('.')
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import gradio as gr
|
6 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
7 |
+
import argparse
|
8 |
+
import os
|
9 |
+
|
10 |
+
class SimpleVideoLLaMA3Interface:
    """Gradio chat front end around a VideoLLaMA3 checkpoint.

    The class loads a causal LM and its processor, converts Gradio
    "messages"-style chat history into the conversation structure handed to
    the processor, runs generation, and builds the Blocks UI.
    """

    def __init__(self, model_path):
        """Load the model and processor from ``model_path``.

        Args:
            model_path: Identifier or path passed to ``from_pretrained`` for
                both the model and the processor.
        """
        print(f"Loading model from {model_path}...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
        )
        self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        print("Model loaded successfully!")

        # Suffix tuples used with ``str.endswith`` to classify uploaded files.
        self.image_formats = ("png", "jpg", "jpeg", "bmp", "gif", "webp")
        self.video_formats = ("mp4", "avi", "mov", "mkv", "webm", "m4v", "3gp", "flv")

    def _file_to_content(self, file_path, fps, max_frames):
        """Turn one uploaded file path into a single conversation content item.

        Missing files and unsupported suffixes become text items describing
        the problem, so the turn is still sent to the model.
        """
        if not os.path.exists(file_path):
            print(f"ERROR: File does not exist: {file_path}")
            return {"type": "text", "text": f"Error: Could not find file {file_path}"}

        lowered = file_path.lower()
        if lowered.endswith(self.video_formats):
            print(f"✅ DETECTED VIDEO: Adding video with fps={fps}, max_frames={max_frames}")
            return {
                "type": "video",
                "video": {"video_path": file_path, "fps": fps, "max_frames": max_frames},
            }
        if lowered.endswith(self.image_formats):
            print(f"✅ DETECTED IMAGE: Adding image: {file_path}")
            return {"type": "image", "image": {"image_path": file_path}}

        print(f"❌ UNKNOWN FILE TYPE: {file_path}")
        return {"type": "text", "text": f"Unsupported file type: {file_path}"}

    def _user_content_item(self, content, fps, max_frames):
        """Convert one user message's ``content`` into a content item.

        Returns ``None`` when the content has a shape this class does not
        recognise, in which case the message contributes nothing to the turn.
        """
        if isinstance(content, str):
            print(f"DEBUG: Adding text: {content}")
            return {"type": "text", "text": content}
        # File uploads arrive as a tuple whose first element is the path; a
        # list is accepted as well in case the history was round-tripped
        # through JSON.
        if isinstance(content, (tuple, list)) and len(content) > 0:
            print(f"Processing file from tuple: {content[0]}")
            return self._file_to_content(content[0], fps, max_frames)
        # Backup shape: a dict carrying the path under "path".
        if isinstance(content, dict) and "path" in content:
            print(f"Processing file from dict: {content['path']}")
            return self._file_to_content(content["path"], fps, max_frames)
        return None

    def _build_conversation(self, messages, fps, max_frames):
        """Group chat history into conversation turns for the processor.

        Consecutive user messages are merged into one user turn; assistant
        messages are passed through unchanged. Messages with any other role
        are skipped (previously they stalled the loop forever because the
        index was never advanced for them).
        """
        conversation = []
        index = 0
        total = len(messages)
        while index < total:
            role = messages[index]["role"]
            if role == "user":
                user_content = []
                while index < total and messages[index]["role"] == "user":
                    msg = messages[index]
                    print(f"DEBUG: Processing user message {index}: {msg}")
                    print(f"DEBUG: Content type: {type(msg['content'])}")
                    print(f"DEBUG: Content value: {msg['content']}")
                    item = self._user_content_item(msg["content"], fps, max_frames)
                    if item is not None:
                        user_content.append(item)
                    index += 1
                if user_content:
                    conversation.append({"role": "user", "content": user_content})
                    kinds = [item.get("type", "unknown") for item in user_content]
                    print(f"Added user turn with {len(user_content)} items: {kinds}")
            elif role == "assistant":
                content = messages[index]["content"]
                conversation.append({"role": "assistant", "content": content})
                # str() keeps the log line safe if the content is not a string.
                print(f"🤖 Added assistant turn: {str(content)[:50]}...")
                index += 1
            else:
                print(f"Skipping message {index} with unsupported role: {role!r}")
                index += 1
        return conversation

    @staticmethod
    def _generation_kwargs(do_sample, temperature, top_p, max_new_tokens):
        """Build the sampling keyword arguments passed to ``generate``.

        Sampling with a non-positive temperature is rejected by the generation
        code, and the UI slider allows 0.0, so that case falls back to greedy
        decoding. Sampling-only knobs are omitted when sampling is off.
        """
        kwargs = {"max_new_tokens": int(max_new_tokens)}
        if do_sample and temperature > 0:
            kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            kwargs["do_sample"] = False
        return kwargs

    @staticmethod
    def _extract_response(decoded):
        """Strip an echoed prompt and leading punctuation from decoded text.

        NOTE(review): this keeps the original heuristic of cutting at the last
        occurrence of the word "assistant"; a reply that legitimately contains
        that word is truncated. Confirm whether the model echoes the prompt at
        all before tightening or removing this.
        """
        response = decoded.strip()
        for indicator in ("assistant", "Assistant", "ASSISTANT"):
            if indicator in response:
                response = response.split(indicator)[-1].strip()
                break
        return response.lstrip(":").lstrip()

    @torch.inference_mode()
    def predict(self, messages, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=4096, fps=10, max_frames=256):
        """Generate an assistant reply for the chat history.

        Args:
            messages: Chat history as a list of ``{"role", "content"}`` dicts.
                The list is extended in place with the assistant reply.
            do_sample: Whether to sample instead of decoding greedily.
            temperature: Sampling temperature; values <= 0 disable sampling.
            top_p: Nucleus sampling threshold.
            max_new_tokens: Upper bound on generated tokens.
            fps: Frame rate attached to video content items.
            max_frames: Frame cap attached to video content items.

        Returns:
            ``messages`` itself, with either the reply or an ``"Error: ..."``
            assistant message appended. Returned unchanged when it is empty or
            yields no usable conversation turns.
        """
        if not messages:
            return messages

        conversation = self._build_conversation(messages, fps, max_frames)
        if not conversation:
            return messages

        try:
            print(f"Conversation structure: {len(conversation)} turns")
            for turn_no, turn in enumerate(conversation):
                role = turn["role"]
                if role == "user":
                    content_types = [
                        item.get("type", "unknown")
                        for item in turn["content"]
                        if isinstance(item, dict)
                    ]
                    print(f"Turn {turn_no}: {role} - {content_types}")
                else:
                    print(f"Turn {turn_no}: {role} - text response")

            inputs = self.processor(
                conversation=conversation,
                add_system_prompt=True,
                add_generation_prompt=True,
                return_tensors="pt",
            )
            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
            if "pixel_values" in inputs:
                # Match the dtype the model was loaded with.
                inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

            output_ids = self.model.generate(
                **inputs,
                **self._generation_kwargs(do_sample, temperature, top_p, max_new_tokens),
            )
            decoded = self.processor.batch_decode(output_ids, skip_special_tokens=True)[0]
            response = self._extract_response(decoded)

            messages.append({"role": "assistant", "content": response})
            return messages

        except Exception as e:
            # UI boundary: surface the failure in the chat instead of crashing
            # the event handler.
            error_msg = f"Error: {str(e)}"
            print(f"Error in prediction: {error_msg}")
            messages.append({"role": "assistant", "content": error_msg})
            return messages

    def create_interface(self):
        """Build and return the Gradio Blocks UI (not yet launched)."""
        with gr.Blocks(title="VideoLLaMA3 AI Curator") as interface:
            gr.Markdown("# 🎬 VideoLLaMA3 AI Curator\nUpload images or videos and ask questions!")

            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(type="messages", height=600)

                with gr.Column(scale=1):
                    with gr.Tab("Input"):
                        video_input = gr.Video(sources=["upload"], label="Upload Video")
                        image_input = gr.Image(sources=["upload"], type="filepath", label="Upload Image")
                        text_input = gr.Textbox(label="Your Message", placeholder="Ask about the image/video or chat...")
                        submit_btn = gr.Button("Send", variant="primary")

                    with gr.Tab("Settings"):
                        do_sample = gr.Checkbox(value=True, label="Do Sample")
                        temperature = gr.Slider(0.0, 1.0, value=0.7, label="Temperature")
                        top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
                        max_tokens = gr.Slider(256, 8192, value=4096, step=64, label="Max Tokens")
                        fps = gr.Slider(0.5, 15.0, value=10.0, label="Video FPS")
                        max_frames = gr.Slider(32, 512, value=256, step=8, label="Max Frames")

            def add_file(history, file):
                # Queue the upload as a user message and clear the upload widget.
                if file:
                    print(f"DEBUG: Gradio file input: {file}")
                    print(f"DEBUG: File type: {type(file)}")
                    history.append({"role": "user", "content": {"path": file}})
                return history, None

            def add_text(history, text):
                # Ignore empty or whitespace-only submissions; always clear the box.
                if text and text.strip():
                    history.append({"role": "user", "content": text})
                return history, ""

            def respond(history, do_sample, temperature, top_p, max_tokens, fps, max_frames):
                # Only predict if the last message is from the user, i.e. it
                # has not been answered yet.
                if history and history[-1]["role"] == "user":
                    return self.predict(history, do_sample, temperature, top_p, max_tokens, fps, max_frames)
                return history

            settings = [chatbot, do_sample, temperature, top_p, max_tokens, fps, max_frames]

            video_input.change(add_file, [chatbot, video_input], [chatbot, video_input])
            image_input.change(add_file, [chatbot, image_input], [chatbot, image_input])
            text_input.submit(add_text, [chatbot, text_input], [chatbot, text_input]).then(
                respond, settings, [chatbot]
            )
            submit_btn.click(add_text, [chatbot, text_input], [chatbot, text_input]).then(
                respond, settings, [chatbot]
            )

        return interface
|
204 |
+
|
205 |
+
if __name__ == "__main__":
    # Command-line options: which checkpoint to load, which port to serve on,
    # and whether to request a public share link.
    cli = argparse.ArgumentParser()
    option_specs = (
        ("--model-path", {"type": str, "default": "DAMO-NLP-SG/VideoLLaMA3-7B"}),
        ("--port", {"type": int, "default": 7860}),
        ("--share", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    # Load the model, build the UI, then serve it on all network interfaces.
    app = SimpleVideoLLaMA3Interface(opts.model_path)
    interface = app.create_interface()
    interface.launch(
        server_port=opts.port,
        share=opts.share,
        server_name="0.0.0.0",
    )
|