Update app.py
app.py CHANGED
@@ -4,14 +4,11 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import cv2
 import numpy as np
-import io
 
 
 # # Ensure GPU usage if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
 # Initialize the model and tokenizer
 model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                              torch_dtype=torch.float16,
@@ -20,59 +17,54 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 
 
-
-# """Extracts frames from the video, 1 per second."""
-# video = cv2.VideoCapture(io.BytesIO(video_bytes))
-# fps = video.get(cv2.CAP_PROP_FPS)
-# frames = []
-# success, frame = video.read()
-# while success:
-#     frames.append(frame)
-#     for _ in range(int(fps)): # Skip fps frames
-#         success, frame = video.read()
-# video.release()
-# return frames[:4] # Return the first 4 frames
-
-def video_to_frames(video_path):
+def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
-    # List to hold frames encoded as PNG
     frames_png = []
+    cap = cv2.VideoCapture(video)
 
-    # Open the video file
-    cap = cv2.VideoCapture(video_path)
-
-    # Check if video opened successfully
     if not cap.isOpened():
         print("Error opening video file")
         return frames_png
 
-
+    frame_count = 0
+    frame_interval = int(cap.get(cv2.CAP_PROP_FPS)) // fps # Calculate frame interval
+
     while cap.isOpened():
-        # Capture frame-by-frame
        ret, frame = cap.read()
-
-        # If frame is read correctly ret is True
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break
 
-
-
-
-
+        if frame_count % frame_interval == 0:
+            is_success, buffer = cv2.imencode(".png", frame)
+            if is_success:
+                frames_png.append(np.array(buffer).tobytes())
+
+        frame_count += 1
 
-    # When everything done, release the video capture object
     cap.release()
-
     return frames_png
 
+def extract_frames(frame):
+
+    # Convert binary data to a numpy array
+    frame_np = np.frombuffer(frame, dtype=np.uint8)
+
+    # Decode the PNG image
+    image_rgb = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR) # Assuming it's in RGB format
+
+    # Convert RGB to BGR
+    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+
+    return image_bgr
+
 def predict_answer(image, video, question, max_tokens=100):
 
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
 
 
-    if image:
+    if image is not None:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
@@ -86,13 +78,13 @@ def predict_answer(image, video, question, max_tokens=100):
 
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 
-    elif video:
+    elif video is not None:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
         for frame in frames:
-
-            image_tensor = model.image_preprocess(
+            image = extract_frames(frame)
+            image_tensor = model.image_preprocess(image)
 
             # Generate the answer
             output_ids = model.generate(
@@ -114,21 +106,17 @@
 def gradio_predict(image, video, question, max_tokens):
     answer = predict_answer(image, video, question, max_tokens)
     return answer
-
-
 
-# Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
-    inputs=[
-
-
-
+    inputs=[
+        gr.Image(type="pil", label="Upload or Drag an Image"),
+        gr.Video(label="Upload your video here"),
+    ],
     outputs=gr.TextArea(label="Answer"),
-    #
-    title="
-
+    # outputs=gr.Image(label="Output"),
+    title="Video/Image Viewer",
+    description="Upload an image or video to view it or extract frames from the video.",
 )
 
-
-iface.queue().launch(debug=True)
+iface.launch(debug=True)
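For context on the change: the new video_to_frames keeps every frame whose index is a multiple of frame_interval = int(video_fps) // fps (roughly one frame per second with the default fps=1), encodes each kept frame to PNG with cv2.imencode, and stores the raw bytes; extract_frames later decodes those bytes back into an OpenCV array that is handed to model.image_preprocess. A standalone round-trip sketch of that encode/decode step, not part of the commit and assuming only OpenCV and NumPy, using a synthetic frame:

import cv2
import numpy as np

# Synthetic BGR frame standing in for one frame read from cv2.VideoCapture
frame = np.zeros((48, 64, 3), dtype=np.uint8)
frame[:, :, 2] = 255  # solid red in BGR channel order

# What video_to_frames stores for each sampled frame
ok, buffer = cv2.imencode(".png", frame)
png_bytes = np.array(buffer).tobytes()

# What extract_frames does with those bytes
frame_np = np.frombuffer(png_bytes, dtype=np.uint8)
decoded = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)

assert decoded.shape == frame.shape
assert np.array_equal(decoded, frame)  # PNG is lossless, so the round trip is exact

Note that cv2.imdecode returns the image in BGR order, so the extra cv2.cvtColor(..., cv2.COLOR_RGB2BGR) in extract_frames swaps the red and blue channels rather than leaving a BGR image unchanged; whether that swap is wanted depends on what model.image_preprocess expects.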
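Similarly, a small sketch of the sampling arithmetic used by the new loop; the max(..., 1) guard is an assumption added here, not part of the commit, to cover files whose FPS metadata reads as 0:

def sampling_indices(video_fps, total_frames, fps=1):
    # Frame indices that video_to_frames would keep: every frame_interval-th frame
    frame_interval = max(int(video_fps) // fps, 1)  # guard against 0-fps metadata (assumption)
    return [i for i in range(total_frames) if i % frame_interval == 0]

print(sampling_indices(30, 120))  # [0, 30, 60, 90] -> about one frame per second of a 30 fps clip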