OneEyeDJ committed on
Commit
bdaed29
·
verified ·
1 Parent(s): 4b9031c

Upload folder using huggingface_hub

Browse files
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deploys this repository to Hugging Face Spaces with `gradio deploy`
# whenever a commit is pushed to the main branch.
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        # Hand the secret over through the environment instead of expanding it
        # into the command line, so it never becomes part of the shell script text.
        env:
          HF_TOKEN: ${{ secrets.hf_token }}
        run: python -c 'import os, huggingface_hub; huggingface_hub.login(token=os.environ["HF_TOKEN"])'

      - name: Deploy to Spaces
        run: gradio deploy
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Test
3
- emoji: πŸ†
4
- colorFrom: indigo
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.34.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: test
3
+ app_file: main.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.34.0
 
 
6
  ---
 
 
main.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ sys.path.append('.')
3
+
4
+ import torch
5
+ import gradio as gr
6
+ from transformers import AutoModelForCausalLM, AutoProcessor
7
+ import argparse
8
+ import os
9
+
10
class SimpleVideoLLaMA3Interface:
    """Gradio chat front-end around a causal LM loaded with ``transformers``.

    ``__init__`` loads the model and its processor, ``predict`` converts a
    Gradio "messages" history into the conversation structure handed to the
    processor and appends the generated reply, and ``create_interface``
    builds the Blocks UI wired to ``predict``.
    """

    def __init__(self, model_path):
        """Load the model and processor.

        Args:
            model_path: Passed unchanged to both ``from_pretrained`` calls.
        """
        print(f"Loading model from {model_path}...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
        )
        self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        print("Model loaded successfully!")

        # Accepted file extensions, lower-case and without the leading dot.
        self.image_formats = ("png", "jpg", "jpeg", "bmp", "gif", "webp")
        self.video_formats = ("mp4", "avi", "mov", "mkv", "webm", "m4v", "3gp", "flv")

    def _file_to_content(self, file_path, fps, max_frames):
        """Turn one uploaded file path into a single conversation content item.

        Returns a ``video`` or ``image`` item when the file exists and its
        extension is listed in ``video_formats`` / ``image_formats``;
        otherwise returns a ``text`` item describing the problem.
        """
        if not os.path.exists(file_path):
            print(f"ERROR: File does not exist: {file_path}")
            return {"type": "text", "text": f"Error: Could not find file {file_path}"}

        # Compare the real suffix so a name that merely ends in e.g. "mp4"
        # without a dot is not mistaken for a video.
        extension = os.path.splitext(file_path)[1].lower().lstrip(".")
        if extension in self.video_formats:
            print(f"βœ… DETECTED VIDEO: Adding video with fps={fps}, max_frames={max_frames}")
            return {"type": "video", "video": {"video_path": file_path, "fps": fps, "max_frames": max_frames}}
        if extension in self.image_formats:
            print(f"βœ… DETECTED IMAGE: Adding image: {file_path}")
            return {"type": "image", "image": {"image_path": file_path}}

        print(f"❌ UNKNOWN FILE TYPE: {file_path}")
        return {"type": "text", "text": f"Unsupported file type: {file_path}"}

    def _build_conversation(self, messages, fps, max_frames):
        """Group a Gradio message history into conversation turns.

        Consecutive ``user`` messages are merged into one turn whose content
        is a list of text/image/video items. ``assistant`` messages are
        copied as-is. Messages with any other role are skipped.
        """
        conversation = []
        i = 0
        while i < len(messages):
            role = messages[i]["role"]
            if role == "user":
                # Collect all consecutive user messages into one turn.
                user_content = []
                while i < len(messages) and messages[i]["role"] == "user":
                    msg = messages[i]
                    content = msg["content"]
                    print(f"DEBUG: Processing user message {i}: {msg}")
                    print(f"DEBUG: Content type: {type(msg['content'])}")
                    print(f"DEBUG: Content value: {msg['content']}")

                    if isinstance(content, str):
                        print(f"DEBUG: Adding text: {msg['content']}")
                        user_content.append({"type": "text", "text": content})
                    elif isinstance(content, tuple) and len(content) > 0:
                        # File uploads from Gradio (comes as tuple).
                        file_path = content[0]
                        print(f"Processing file from tuple: {file_path}")
                        user_content.append(self._file_to_content(file_path, fps, max_frames))
                    elif isinstance(content, dict) and "path" in content:
                        # File uploads with path dict (backup method).
                        file_path = content["path"]
                        print(f"Processing file from dict: {file_path}")
                        user_content.append(self._file_to_content(file_path, fps, max_frames))
                    i += 1

                if user_content:
                    conversation.append({"role": "user", "content": user_content})
                    print(f"πŸ“ Added user turn with {len(user_content)} items: {[item.get('type', 'unknown') for item in user_content]}")
            elif role == "assistant":
                conversation.append({"role": "assistant", "content": messages[i]["content"]})
                # str() keeps the debug preview from failing on non-string content.
                print(f"πŸ€– Added assistant turn: {str(messages[i]['content'])[:50]}...")
                i += 1
            else:
                # Any other role must still advance the index; otherwise this
                # loop would never terminate.
                i += 1
        return conversation

    @staticmethod
    def _extract_reply(response):
        """Strip everything up to the last assistant marker from decoded text.

        NOTE(review): if the decoded text is already only the reply, a reply
        that itself contains the word "assistant" gets truncated here —
        confirm what ``generate`` returns for this model.
        """
        for indicator in ["assistant", "Assistant", "ASSISTANT"]:
            if indicator in response:
                response = response.split(indicator)[-1].strip()
                break

        # Clean up common formatting artifacts.
        return response.lstrip(":").lstrip()

    @torch.inference_mode()
    def predict(self, messages, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=4096, fps=10, max_frames=256):
        """Generate a reply for the chat history and append it to ``messages``.

        Args:
            messages: Gradio "messages" history (list of ``role``/``content``
                dicts). The list is modified in place.
            do_sample, temperature, top_p, max_new_tokens: Forwarded to
                ``self.model.generate``.
            fps, max_frames: Stored on every video content item.

        Returns:
            ``messages`` itself. It is unchanged when it is empty or yields
            no conversation turns; otherwise an assistant message is
            appended, holding either the reply or ``"Error: ..."`` text if
            processing or generation raised.
        """
        if not messages:
            return messages

        conversation = self._build_conversation(messages, fps, max_frames)
        if not conversation:
            return messages

        try:
            # Debug: print conversation structure.
            print(f"Conversation structure: {len(conversation)} turns")
            for turn_index, turn in enumerate(conversation):
                role = turn["role"]
                if role == "user":
                    content_types = [item.get("type", "unknown") for item in turn["content"] if isinstance(item, dict)]
                    print(f"Turn {turn_index}: {role} - {content_types}")
                else:
                    print(f"Turn {turn_index}: {role} - text response")

            inputs = self.processor(
                conversation=conversation,
                add_system_prompt=True,
                add_generation_prompt=True,
                return_tensors="pt",
            )
            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
            if "pixel_values" in inputs:
                # Match the dtype the model was loaded with.
                inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

            output_ids = self.model.generate(
                **inputs,
                do_sample=do_sample,
                temperature=temperature,
                top_p=top_p,
                max_new_tokens=max_new_tokens,
            )
            decoded = self.processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            messages.append({"role": "assistant", "content": self._extract_reply(decoded)})
            return messages

        except Exception as e:
            # UI boundary: surface the failure in the chat instead of crashing.
            error_msg = f"Error: {str(e)}"
            print(f"Error in prediction: {error_msg}")
            messages.append({"role": "assistant", "content": error_msg})
            return messages

    def create_interface(self):
        """Build and return the Gradio Blocks UI.

        The layout has a chatbot on the left, and on the right an "Input" tab
        (video, image, text box, send button) and a "Settings" tab with the
        generation and video-sampling controls passed to ``predict``.
        """
        with gr.Blocks(title="VideoLLaMA3 AI Curator") as interface:
            gr.Markdown("# 🎬 VideoLLaMA3 AI Curator\nUpload images or videos and ask questions!")

            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(type="messages", height=600)

                with gr.Column(scale=1):
                    with gr.Tab("Input"):
                        video_input = gr.Video(sources=["upload"], label="Upload Video")
                        image_input = gr.Image(sources=["upload"], type="filepath", label="Upload Image")
                        text_input = gr.Textbox(label="Your Message", placeholder="Ask about the image/video or chat...")
                        submit_btn = gr.Button("Send", variant="primary")

                    with gr.Tab("Settings"):
                        do_sample = gr.Checkbox(value=True, label="Do Sample")
                        temperature = gr.Slider(0.0, 1.0, value=0.7, label="Temperature")
                        top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
                        max_tokens = gr.Slider(256, 8192, value=4096, step=64, label="Max Tokens")
                        fps = gr.Slider(0.5, 15.0, value=10.0, label="Video FPS")
                        max_frames = gr.Slider(32, 512, value=256, step=8, label="Max Frames")

            def add_file(history, file):
                # Queue the upload as a user message and clear the upload widget.
                if file:
                    print(f"DEBUG: Gradio file input: {file}")
                    print(f"DEBUG: File type: {type(file)}")
                    history.append({"role": "user", "content": {"path": file}})
                return history, None

            def add_text(history, text):
                # Ignore empty or whitespace-only input; always clear the text box.
                if text and text.strip():
                    history.append({"role": "user", "content": text})
                return history, ""

            def respond(history, do_sample, temperature, top_p, max_tokens, fps, max_frames):
                # Only predict if the last message is from the user and has
                # not been answered yet.
                if history and history[-1]["role"] == "user":
                    return self.predict(history, do_sample, temperature, top_p, max_tokens, fps, max_frames)
                return history

            respond_inputs = [chatbot, do_sample, temperature, top_p, max_tokens, fps, max_frames]

            video_input.change(add_file, [chatbot, video_input], [chatbot, video_input])
            image_input.change(add_file, [chatbot, image_input], [chatbot, image_input])
            text_input.submit(add_text, [chatbot, text_input], [chatbot, text_input]).then(
                respond, respond_inputs, [chatbot]
            )
            submit_btn.click(add_text, [chatbot, text_input], [chatbot, text_input]).then(
                respond, respond_inputs, [chatbot]
            )

        return interface
204
+
205
if __name__ == "__main__":
    # Script entry point: read the CLI options, load the model, and serve the
    # Gradio UI on all network interfaces.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", type=str, default="DAMO-NLP-SG/VideoLLaMA3-7B")
    cli.add_argument("--port", type=int, default=7860)
    cli.add_argument("--share", action="store_true")
    options = cli.parse_args()

    demo = SimpleVideoLLaMA3Interface(options.model_path).create_interface()
    demo.launch(
        server_port=options.port,
        share=options.share,
        server_name="0.0.0.0",
    )