Build

Paused

App Files Files Community

ManishThota committed on Mar 7, 2024

Commit

9db455c

verified ·

1 Parent(s): 0f80bf1

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -15

app.py CHANGED Viewed

@@ -2,6 +2,11 @@ import gradio as gr
 from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 # # Ensure GPU usage if available
@@ -16,22 +21,94 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                              trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
-def predict_answer(image, question, max_tokens=100):
-    #Set inputs
-    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
-    image = image.convert("RGB")
-    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
-    image_tensor = model.image_preprocess(image)
-    #Generate the answer
-    output_ids = model.generate(
-        input_ids,
-        max_new_tokens=max_tokens,
-        images=image_tensor,
-        use_cache=True)[0]
-    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 def gradio_predict(image, question, max_tokens):
     answer = predict_answer(image, question, max_tokens)
@@ -50,9 +127,10 @@ def gradio_predict(image, question, max_tokens):
 # Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
-    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
             gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-            gr.Slider(2, 500, value=100, label="Token Count", info="Choose between 2 and 500")],
     outputs=gr.TextArea(label="Answer"),
     # examples=examples,
     title="Super Rapid Annotator - Multimodal vision tool to annotate videos with LLaVA framework",

 from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import magic
+import mimetypes
+import cv2
+import numpy as np
+import io
 # # Ensure GPU usage if available
                                              trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
+def get_file_type_from_bytes(file_bytes):
+    """Classify raw file content as 'image', 'video' or 'unknown'.
+
+    The MIME type is sniffed from ``file_bytes`` with python-magic and
+    matched by prefix against the media kinds this app handles.
+
+    Args:
+        file_bytes: Raw content of the uploaded file.
+
+    Returns:
+        'image' or 'video' when the detected MIME type starts with that
+        word, otherwise 'unknown'.
+    """
+    detected = magic.Magic(mime=True).from_buffer(file_bytes)
+    # Checked in the same order as before: image first, then video.
+    for kind in ('image', 'video'):
+        if detected.startswith(kind):
+            return kind
+    return 'unknown'
+def process_video(video_bytes, max_frames=4):
+    """Sample up to ``max_frames`` frames from a video, roughly one per second.
+
+    Args:
+        video_bytes: Raw bytes of the encoded video file.
+        max_frames: Maximum number of frames to return (default 4, the
+            number the original implementation kept).
+
+    Returns:
+        A list of frames as returned by ``cv2.VideoCapture.read`` (NumPy
+        arrays; OpenCV decodes colour frames in BGR channel order). The
+        list is empty when the video cannot be opened or decoded.
+    """
+    import os
+    import tempfile
+
+    # cv2.VideoCapture opens a filename or device index, not an in-memory
+    # buffer, so the bytes are spooled to a temporary file first.
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+        tmp.write(video_bytes)
+        tmp_path = tmp.name
+
+    frames = []
+    try:
+        video = cv2.VideoCapture(tmp_path)
+        try:
+            fps = video.get(cv2.CAP_PROP_FPS)
+            # Some files report an FPS of 0 (or NaN). Stepping by zero frames
+            # would never advance the reader and loop forever, so fall back
+            # to consecutive frames in that case.
+            step = int(fps) if fps >= 1 else 1
+            success, frame = video.read()
+            # Stop as soon as enough frames are collected instead of
+            # decoding the whole video and slicing afterwards.
+            while success and len(frames) < max_frames:
+                frames.append(frame)
+                for _ in range(step):  # advance about one second
+                    success, frame = video.read()
+                    if not success:
+                        break
+        finally:
+            video.release()
+    finally:
+        os.remove(tmp_path)
+    return frames
+def _read_file_bytes(file):
+    """Return the raw content of an upload given as bytes, a path or a file object."""
+    import os
+
+    if isinstance(file, (bytes, bytearray)):
+        return bytes(file)
+    # NOTE(review): depending on the Gradio version and the component's
+    # ``type``, gr.File may pass a path string or a temp-file wrapper whose
+    # ``.name`` is the path rather than bytes - confirm against the
+    # installed version.
+    path = file if isinstance(file, (str, os.PathLike)) else file.name
+    with open(path, "rb") as handle:
+        return handle.read()
+
+
+def _answer_for_frame(frame, question, max_tokens):
+    """Ask the model ``question`` about one RGB PIL image and return its answer."""
+    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
+    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
+    image_tensor = model.image_preprocess(frame)
+    output_ids = model.generate(
+        input_ids,
+        max_new_tokens=max_tokens,
+        images=image_tensor,
+        use_cache=True)[0]
+    # Skip the first input_ids.shape[1] tokens (the prompt) so only the
+    # newly generated answer is decoded.
+    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+
+
+def predict_answer(file, question, max_tokens=100):
+    """Answer ``question`` about an uploaded image or video.
+
+    Args:
+        file: The upload, as raw bytes, a filesystem path, or a file-like
+            object exposing its path via ``.name``.
+        question: The user's question; it is inserted into the chat prompt.
+        max_tokens: Upper bound on newly generated tokens per answer.
+
+    Returns:
+        For an image, the model's answer. For a video, one answer per
+        sampled frame joined by newlines. Otherwise a message explaining
+        that the input is unsupported or unreadable.
+    """
+    if file is None:
+        # Nothing was uploaded; there is no content to classify.
+        return "Unsupported file type. Please upload an image or video."
+
+    data = _read_file_bytes(file)
+    file_type = get_file_type_from_bytes(data)
+
+    if file_type == 'image':
+        image = Image.open(io.BytesIO(data)).convert("RGB")
+        return _answer_for_frame(image, question, max_tokens)
+
+    if file_type == 'video':
+        frames = process_video(data)
+        if not frames:
+            return "Could not read any frames from the video."
+        answers = []
+        for frame in frames:
+            # Frames arrive as OpenCV arrays (BGR order), not files:
+            # convert to RGB and wrap rather than calling Image.open.
+            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            answers.append(_answer_for_frame(Image.fromarray(rgb), question, max_tokens))
+        return "\n".join(answers)
+
+    return "Unsupported file type. Please upload an image or video."
+# def predict_answer(image, question, max_tokens=100):
+#     #Set inputs
+#     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
+#     image = image.convert("RGB")
+#     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
+#     image_tensor = model.image_preprocess(image)
+#     #Generate the answer
+#     output_ids = model.generate(
+#         input_ids,
+#         max_new_tokens=max_tokens,
+#         images=image_tensor,
+#         use_cache=True)[0]
+#     return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 def gradio_predict(image, question, max_tokens):
     answer = predict_answer(image, question, max_tokens)
 # Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
+    inputs=[gr.File(label="Upload an Image or Video"),
+    # gr.Image(type="pil", label="Upload or Drag an Image"),
             gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
+            gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
     outputs=gr.TextArea(label="Answer"),
     # examples=examples,
     title="Super Rapid Annotator - Multimodal vision tool to annotate videos with LLaVA framework",