Build

Paused

App Files Files Community

ManishThota committed on Mar 9, 2024

Commit

5c72980

verified ·

1 Parent(s): a333293

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -51

app.py CHANGED Viewed

@@ -18,6 +18,8 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
     frames_png = []
@@ -59,31 +61,13 @@ def extract_frames(frame):
     return image_bgr
-def predict_answer(image, video, question, max_tokens=100):
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
-    # frames = video_to_frames(video)
-    # answers = []
-    # for i in range(len(frames)):
-    #     image = extract_frames(frames[i])
-    #     image_tensor = model.image_preprocess([image])
-    #     # Generate the answer
-    #     output_ids = model.generate(
-    #             input_ids,
-    #             max_new_tokens=max_tokens,
-    #             images=image_tensor,
-    #             use_cache=True)[0]
-    #     answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    #     answers.append(answer)
-    #     return answers
-    if image:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
@@ -91,30 +75,30 @@ def predict_answer(image, video, question, max_tokens=100):
         #Generate the answer
         output_ids = model.generate(
             input_ids,
-            max_new_tokens=max_tokens,
             images=image_tensor,
             use_cache=True)[0]
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    elif video:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
-        for i in range(len(frames)):
-            image = extract_frames(frames[i])
             image_tensor = model.image_preprocess([image])
             # Generate the answer
             output_ids = model.generate(
                 input_ids,
-                max_new_tokens=max_tokens,
                 images=image_tensor,
                 use_cache=True)[0]
             answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
             answers.append(answer)
-        return ast.literal_eval(answers[0])
     else:
         return "Unsupported file type. Please upload an image or video."
@@ -122,39 +106,47 @@ def predict_answer(image, video, question, max_tokens=100):
-def gradio_predict(image, video, question, max_tokens):
-    answer = predict_answer(image, video, question, max_tokens)
     return answer
-# iface = gr.Interface(
-#     fn=gradio_predict,
-#     inputs=[
-#         gr.Image(type="pil", label="Upload or Drag an Image"),
-#         gr.Video(label="Upload your video here"),
-#         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-#         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
-#     outputs=gr.TextArea(label="Answer"),
-#     # outputs=gr.Image(label="Output"),
-#     title="Video/Image Viewer",
-#     description="Upload an image or video to view it or extract frames from the video.",
-# )
-# iface.launch(debug=True)
-with gr.Blocks() as app:
-    gr.Markdown("### Upload an Image or Video")
     with gr.Row():
-        image = gr.Image(type="pil", label="Upload or Drag an Image")
         video = gr.Video(label="Upload your video here")
     with gr.Row():
         with gr.Column():
-            question = gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", lines=4)
-            tokens = gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")
         with gr.Column():
             answer = gr.TextArea(label="Answer")
-    btn = gr.Button("Predict")
-    btn.click(gradio_predict, inputs=[image, video, question, tokens], outputs=answer)
 app.launch(debug=True)

 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
     frames_png = []
     return image_bgr
+def predict_answer(image, video, question):
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
+    if image is not None:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
         #Generate the answer
         output_ids = model.generate(
             input_ids,
+            max_new_tokens=25,
             images=image_tensor,
             use_cache=True)[0]
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    elif video is not None:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
+        for frame in frames:
+            image = extract_frames(frame)
             image_tensor = model.image_preprocess([image])
             # Generate the answer
             output_ids = model.generate(
                 input_ids,
+                max_new_tokens=25,
                 images=image_tensor,
                 use_cache=True)[0]
             answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
             answers.append(answer)
+        return "\n".join(answers)
     else:
         return "Unsupported file type. Please upload an image or video."
+def gradio_predict(image, video, question):
+    answer = predict_answer(image, video, question)
     return answer
+css = """
+#container{
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    width: 50%;
+}
+#intro{
+    max-width: 100%;
+    margin: 0 auto;
+    text-align: center;
+}
+"""
+with gr.Blocks(css = css) as app:
+    with gr.Row(elem_id="container"):
+        gr.Markdown("""<div style='text-align: center;'><img src="https://github-production-user-asset-6210df.s3.amazonaws.com/37763863/311454340-af72f848-9735-4d49-830b-885ffbb81091.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240309%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240309T165700Z&X-Amz-Expires=300&X-Amz-Signature=51aeb4811afff72e70c083594aaffcca1f4a2b95ddd4adf23ee5e736e4fbfefe&X-Amz-SignedHeaders=host&actor_id=37763863&key_id=0&repo_id=769602947" width="1000" height="500" /></div>""")
+    gr.Markdown("""
+    ## This Gradio app serves as four folds:
+    ### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
+    ### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
+    ### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
+    ### 4. Ability to integrate a Large Language Model and Vision Encoder
+                 """)
     with gr.Row():
         video = gr.Video(label="Upload your video here")
+        image = gr.Image(type="pil", label="Upload or Drag an Image")
     with gr.Row():
         with gr.Column():
+            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4.3)
+            btn = gr.Button("Annotate")
         with gr.Column():
             answer = gr.TextArea(label="Answer")
+    btn.click(gradio_predict, inputs=[image, video, question], outputs=answer)
 app.launch(debug=True)