Build

Paused

App Files Community

ManishThota committed on Mar 7, 2024

Commit

edff486

verified ·

1 Parent(s): 5799880

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -54

app.py CHANGED Viewed

@@ -2,8 +2,6 @@ import gradio as gr
 from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import magic
-import mimetypes
 import cv2
 import numpy as np
 import io
@@ -22,16 +20,6 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
-def get_file_type_from_bytes(file_bytes):
-    """Determine whether a file is an image or a video based on its MIME type from bytes."""
-    mime = magic.Magic(mime=True)
-    mimetype = mime.from_buffer(file_bytes)
-    if mimetype.startswith('image'):
-        return 'image'
-    elif mimetype.startswith('video'):
-        return 'video'
-    return 'unknown'
 def process_video(video_bytes):
     """Extracts frames from the video, 1 per second."""
     video = cv2.VideoCapture(io.BytesIO(video_bytes))
@@ -46,15 +34,12 @@ def process_video(video_bytes):
     return frames[:4]  # Return the first 4 frames
-def predict_answer(file, question, max_tokens=100):
-    file_type = get_file_type_from_bytes(file)
-    if file_type == 'image':
         # Process as an image
-        image = Image.open(io.BytesIO(file))
-        frame = image.convert("RGB")
-        input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
         image_tensor = model.image_preprocess(frame)
         #Generate the answer
@@ -66,13 +51,13 @@ def predict_answer(file, question, max_tokens=100):
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    elif file_type == 'video':
         # Process as a video
-        frames = process_video(file)
         answers = []
         for frame in frames:
             frame = Image.open(frame).convert("RGB")
-            input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
             image_tensor = model.image_preprocess(frame)
             # Generate the answer
@@ -90,45 +75,19 @@ def predict_answer(file, question, max_tokens=100):
         return "Unsupported file type. Please upload an image or video."
-# def predict_answer(image, question, max_tokens=100):
-#     #Set inputs
-#     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
-#     image = image.convert("RGB")
-#     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
-#     image_tensor = model.image_preprocess(image)
-#     #Generate the answer
-#     output_ids = model.generate(
-#         input_ids,
-#         max_new_tokens=max_tokens,
-#         images=image_tensor,
-#         use_cache=True)[0]
-#     return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-def gradio_predict(image, question, max_tokens):
-    answer = predict_answer(image, question, max_tokens)
     return answer
-# examples = [["data/week_01_page_024.png", 'Can you explain the slide?',100],
-#             ["data/week_03_page_091.png", 'Can you explain the slide?',100],
-#             ["data/week_01_page_062.png", 'Are the training images labeled?',100],
-#             ["data/week_05_page_027.png", 'What is meant by eigenvalue multiplicity?',100],
-#             ["data/week_05_page_030.png", 'What does K represent?',100],
-#             ["data/week_15_page_046.png", 'How are individual heterogeneous models trained?',100],
-#             ["data/week_15_page_021.png", 'How does Bagging affect error?',100],
-#             ["data/week_15_page_037.png", "What does the '+' and '-' represent?",100]]
 # Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
-    inputs=[gr.File(label="Upload an Image or Video"),
-    # gr.Image(type="pil", label="Upload or Drag an Image"),
             gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
             gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
     outputs=gr.TextArea(label="Answer"),

 from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import cv2
 import numpy as np
 import io
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 def process_video(video_bytes):
     """Extracts frames from the video, 1 per second."""
     video = cv2.VideoCapture(io.BytesIO(video_bytes))
     return frames[:4]  # Return the first 4 frames
+def predict_answer(image, video, question, max_tokens=100):
+    if image:
         # Process as an image
+        image = image.convert("RGB")
+        input_ids = tokenizer(question, return_tensors='pt').input_ids.to(device)
         image_tensor = model.image_preprocess(frame)
         #Generate the answer
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    elif video:
         # Process as a video
+        frames = process_video(video)
         answers = []
         for frame in frames:
             frame = Image.open(frame).convert("RGB")
+            input_ids = tokenizer(question, return_tensors='pt').input_ids.to(device)
             image_tensor = model.image_preprocess(frame)
             # Generate the answer
         return "Unsupported file type. Please upload an image or video."
+def gradio_predict(image, video, question, max_tokens):
+    answer = predict_answer(image, video, question, max_tokens)
     return answer
 # Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
+    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
+            gr.Video(label="upload your video here"),
             gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
             gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
     outputs=gr.TextArea(label="Answer"),