Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -2,15 +2,14 @@ import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 
-#
+# Model setting
 model_path = "OpenGVLab/InternVideo2_5_Chat_8B"
 
 # Load the tokenizer and model with remote code enabled.
-# .half() converts the model to FP16 and .cuda() moves it to GPU (if available).
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
 
-# Get the image processor from the vision tower
+# Get the image processor from the vision tower.
 image_processor = model.get_vision_tower().image_processor
 
 # Evaluation settings
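Note on the unchanged loading line: `.half().cuda()` raises at import time on CPU-only Space hardware, which is one plausible source of the "Runtime error" status above. Below is a minimal device-aware sketch, not part of this commit; the `torch_dtype` keyword and `torch.cuda.is_available()` are standard Transformers/PyTorch API, and everything else mirrors the code in the hunk.

# Sketch only (not in this commit): load on GPU when available, CPU otherwise.
import torch
from transformers import AutoModel, AutoTokenizer

model_path = "OpenGVLab/InternVideo2_5_Chat_8B"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path, trust_remote_code=True, torch_dtype=dtype
).to(device)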
@@ -23,23 +22,37 @@ generation_config = {
     "num_beams": 1,
 }
 
-
+video_path = "your_video.mp4"  # (For testing locally, update as needed)
+
+# Single-turn conversation example:
+def single_turn_chat(video_path, user_prompt):
+    output, chat_history = model.chat(
+        video_path=video_path,
+        tokenizer=tokenizer,
+        user_prompt=user_prompt,
+        return_history=True,
+        max_num_frames=max_num_frames,
+        generation_config=generation_config
+    )
+    return output
+
+# Multi-turn conversation example:
+def multi_turn_chat(video_path, user_prompt, chat_history):
+    output, chat_history = model.chat(
+        video_path=video_path,
+        tokenizer=tokenizer,
+        user_prompt=user_prompt,
+        chat_history=chat_history,
+        return_history=True,
+        max_num_frames=max_num_frames,
+        generation_config=generation_config
+    )
+    return output, chat_history
+
+# For the Gradio interface, we'll combine these into a chat function.
 def chat_interface(video_path, user_prompt, chat_history):
-    """
-    Performs a chat turn with the model. If no chat_history is provided,
-    it starts a new conversation.
-
-    Parameters:
-        video_path (str): The filepath of the uploaded video.
-        user_prompt (str): The user's question.
-        chat_history (list): The conversation history (empty list for a new conversation).
-
-    Returns:
-        A tuple containing the model's output (str) and the updated chat history (list).
-    """
     if chat_history is None:
         chat_history = []
-    # The model.chat() method returns output and updated history.
     output, new_history = model.chat(
         video_path=video_path,
         tokenizer=tokenizer,
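For reference, the two helpers introduced in this hunk can be exercised directly outside Gradio. A sketch, assuming a local `your_video.mp4` and the module-level `max_num_frames` and `generation_config` that the new code references:

# Sketch only (not in this commit): calling the new helpers from a Python shell.
answer = single_turn_chat("your_video.mp4", "Describe the video in detail.")
print(answer)

history = []
reply, history = multi_turn_chat("your_video.mp4", "What happens first?", history)
reply, history = multi_turn_chat("your_video.mp4", "And what happens next?", history)
print(reply)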
@@ -57,21 +70,15 @@ with gr.Blocks() as demo:
     with gr.Row():
         video_input = gr.Video(label="Upload Video", type="filepath")
         question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
-
-    chat_state = gr.State([])
+    chat_state = gr.State([])  # To maintain conversation history
     output_text = gr.Textbox(label="Model Response")
 
-    def process_chat(video, question, history):
-        response, new_history = chat_interface(video, question, history)
-        return response, new_history
-
     send_btn = gr.Button("Send")
     send_btn.click(
-
+        chat_interface,
         inputs=[video_input, question_input, chat_state],
         outputs=[output_text, chat_state]
     )
 
-    # Launch the app.
 if __name__ == "__main__":
     demo.launch()
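The functional change in the last hunk is easy to miss: the old `send_btn.click(` received no function argument, so the button had no callback to run; the commit passes `chat_interface` positionally. The positional call is equivalent to Gradio's keyword form, shown here as a sketch for clarity (`fn`, `inputs`, and `outputs` are the standard parameter names of Gradio event listeners):

# Sketch only: the same wiring with Gradio's explicit fn= keyword.
send_btn.click(
    fn=chat_interface,
    inputs=[video_input, question_input, chat_state],
    outputs=[output_text, chat_state],
)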