88hours committed on
Commit ad022d3 · verified · 1 Parent(s): f7c72f7

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,36 +1,36 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- *.mp4 filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,12 +1,16 @@
1
- myenv
2
- __pycache__
3
- .gradio
4
- .venv
5
- .env
6
- .env.*
7
- !.env.example
8
- .github
9
- # LanceDB files
10
- shared_data/.lancedb/
11
- shared_data/.lancedb/**/*
12
- shared_data/videos/yt_video/blackholes101nationalgeographic/audio.mp3
 
 
 
 
 
1
+ myenv
2
+ __pycache__
3
+ .gradio
4
+ .venv
5
+ .env
6
+ .env.*
7
+ !.env.example
8
+ .github
9
+ # LanceDB files
10
+ shared_data/.lancedb/
11
+ shared_data/.lancedb/**/*
12
+ shared_data/videos/yt_video/blackholes101nationalgeographic/audio.mp3
13
+ mm_rag/embeddings/__pycache__/
14
+ mm_rag/embeddings/__pycache__/**
15
+ .DS_Store
16
+ .devcontainer/
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM mcr.microsoft.com/devcontainers/python:3.11
2
+
3
+ # Install system dependencies (ffmpeg for audio extraction, libgl1 for OpenCV)
4
+ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
5
+ && apt-get -y install --no-install-recommends \
6
+ ffmpeg libgl1-mesa-glx \
7
+ && apt-get clean -y && rm -rf /var/lib/apt/lists/*
8
+
9
+ WORKDIR /app
10
+
11
+ # Install Python dependencies (requirements.txt must be copied in before pip can use it)
12
+ COPY requirements.txt .
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+ COPY . .
15
+
16
+ # Run the application
17
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,182 +1,182 @@
1
- ---
2
- title: multimodel-rag-chat-with-videos
3
- app_file: app.py
4
- sdk: gradio
5
- sdk_version: 5.17.1
6
- ---
7
-
8
- # Demo
9
- ## Sample Video
10
- - https://www.youtube.com/watch?v=kOEDG3j1bjs
11
- - https://www.youtube.com/watch?v=7Hcg-rLYwdM
12
- ## Questions
13
- - Event Horizon
14
- - show me a group of astronauts, AStronaut name
15
- # ReArchitecture Multimodal RAG System Pipeline Journey
16
- I ported it locally and isolated each concept into a step as Python runnable
17
- It is simplified, refactored and bug-fixed now.
18
- I migrated from Prediction Guard to HuggingFace.
19
-
20
- [**Interactive Video Chat Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
21
-
22
- ### A multimodal AI system should be able to understand both text and video content.
23
-
24
- ## Setup
25
- ```bash
26
- python -m venv venv
27
- source venv/bin/activate
28
- ```
29
- For Fish
30
- ```bash
31
- source venv/bin/activate.fish
32
- ```
33
-
34
- ## Step 1 - Learn Gradio (UI) (30 mins)
35
-
36
- Gradio is a powerful Python library for quickly building browser-based UIs. It supports hot reloading for fast development.
37
-
38
- ### Key Concepts:
39
- - **fn**: The function wrapped by the UI.
40
- - **inputs**: The Gradio components used for input (should match function arguments).
41
- - **outputs**: The Gradio components used for output (should match return values).
42
-
43
- 📖 [**Gradio Documentation**](https://www.gradio.app/docs/gradio/introduction)
44
-
45
- Gradio includes **30+ built-in components**.
46
-
47
- 💡 **Tip**: For `inputs` and `outputs`, you can pass either:
48
- - The **component name** as a string (e.g., `"textbox"`)
49
- - An **instance of the component class** (e.g., `gr.Textbox()`)
50
-
51
- ### Sharing Your Demo
52
- ```python
53
- demo.launch(share=True) # Share your demo with just one extra parameter.
54
- ```
55
-
56
- ## Gradio Advanced Features
57
-
58
- ### **Gradio.Blocks**
59
- Gradio provides `gr.Blocks`, a flexible way to design web apps with **custom layouts and complex interactions**:
60
- - Arrange components freely on the page.
61
- - Handle multiple data flows.
62
- - Use outputs as inputs for other components.
63
- - Dynamically update components based on user interaction.
64
-
65
- ### **Gradio.ChatInterface**
66
- - Always set `type="messages"` in `gr.ChatInterface`.
67
- - The default (`type="tuples"`) is **deprecated** and will be removed in future versions.
68
- - For more UI flexibility, use `gr.ChatBot`.
69
- - `gr.ChatInterface` supports **Markdown** (not tested yet).
70
-
71
- ---
72
-
73
- ## Step 2 - Learn Bridge Tower Embedding Model (Multimodal Learning) (15 mins)
74
-
75
- Developed in collaboration with Intel, this model maps image-caption pairs into **512-dimensional vectors**.
76
-
77
- ### Measuring Similarity
78
- - **Cosine Similarity** → Measures how close images are in vector space (**efficient & commonly used**).
79
- - **Euclidean Distance** → Uses `cv2.NORM_L2` to compute similarity between two images.
80
-
81
- ### Converting to 2D for Visualization
82
- - **UMAP** reduces 512D embeddings to **2D for display purposes**.
83
-
84
- ## Preprocessing Videos for Multimodal RAG
85
-
86
- ### **Case 1: WEBVTT → Extracting Text Segments from Video**
87
- - Converts video + text into structured metadata.
88
- - Splits content into multiple segments.
89
-
90
- ### **Case 2: Whisper (Small) → Video Only**
91
- - Extracts **audio** → `model.transcribe()`.
92
- - Applies `getSubs()` helper function to retrieve **WEBVTT** subtitles.
93
- - Uses **Case 1** processing.
94
-
95
- ### **Case 3: LvLM → Video + Silent/Music Extraction**
96
- - Uses **Llava (LvLM model)** for **frame-based captioning**.
97
- - Encodes each frame as a **Base64 image**.
98
- - Extracts context and captions from video frames.
99
- - Uses **Case 1** processing.
100
-
101
- # Step 4 - What is LLaVA?
102
- LLaVA (Large Language-and-Vision Assistant) is a large multimodal model that connects a vision encoder to a language model. It doesn't just see images: it understands them, reads the text embedded in them, and reasons about their context.
103
-
104
- # Step 5 - what is a vector Store?
105
- A vector store is a specialized database designed to:
106
-
107
- - Store and manage high-dimensional vector data efficiently
108
- - Perform similarity-based searches where K=1 returns the most similar result
109
-
110
- - In LanceDB specifically, store multiple data types:
111
- . Text content (captions)
112
- . Image file paths
113
- . Metadata
114
- . Vector embeddings
115
-
116
- ```python
117
- _ = MultimodalLanceDB.from_text_image_pairs(
118
- texts=updated_vid1_trans+vid2_trans,
119
- image_paths=vid1_img_path+vid2_img_path,
120
- embedding=BridgeTowerEmbeddings(),
121
- metadatas=vid1_metadata+vid2_metadata,
122
- connection=db,
123
- table_name=TBL_NAME,
124
- mode="overwrite",
125
- )
126
- ```
127
- # Gotchas and Solutions
128
- Image Processing: When working with base64 encoded images, convert them to PIL.Image format before processing with BridgeTower
129
- Model Selection: Using BridgeTowerForContrastiveLearning instead of PredictionGuard due to API access limitations
130
- Model Size: BridgeTower model requires ~3.5GB download
131
- Image Downloads: Some Flickr images may be unavailable; implement robust error handling
132
- Token Decoding: BridgeTower contrastive learning model works with embeddings, not token predictions
133
- Install from git+https://github.com/openai/whisper.git
134
-
135
- # Install ffmpeg using brew
136
- ```bash
137
- brew install ffmpeg
138
- brew link ffmpeg
139
- ```
140
-
141
-
142
- # Learning and Skills
143
-
144
- ## Technical Skills:
145
-
146
- Basic Machine learning and deep learning
147
- Vector embeddings and similarity search
148
- Multimodal data processing
149
-
150
- ## Framework & Library Expertise:
151
-
152
- Hugging Face Transformers
153
- Gradio UI development
154
- LangChain integration (Basic)
155
- PyTorch basics
156
- LanceDB vector storage
157
-
158
- ## AI/ML Concepts:
159
-
160
- Multimodal RAG system architecture
161
- Vector embeddings and similarity search
162
- Large Language Models (LLaVA)
163
- Image-text pair processing
164
- Dimensionality reduction techniques
165
-
166
-
167
- ## Multimedia Processing:
168
-
169
- Video frame extraction
170
- Audio transcription (Whisper)
171
- Image processing (PIL)
172
- Base64 encoding/decoding
173
- WebVTT handling
174
-
175
- ## System Design:
176
-
177
- Client-server architecture
178
- API endpoint design
179
- Data pipeline construction
180
- Vector store implementation
181
- Multimodal system integration
182
-
 
1
+ ---
2
+ title: multimodel-rag-chat-with-videos
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.17.1
6
+ ---
7
+
8
+ # Demo
9
+ ## Sample Video
10
+ - https://www.youtube.com/watch?v=kOEDG3j1bjs
11
+ - https://www.youtube.com/watch?v=7Hcg-rLYwdM
12
+ ## Questions
13
+ - Event Horizon
14
+ - Show me a group of astronauts; astronaut name
15
+ # Re-Architecting the Multimodal RAG System: Pipeline Journey
16
+ I ported the course project locally and isolated each concept into its own runnable Python step.
17
+ The code is now simplified, refactored, and bug-fixed.
18
+ I migrated from Prediction Guard to Hugging Face.
19
+
20
+ [**Interactive Video Chat Demo and Multimodal RAG System Architecture**](https://learn.deeplearning.ai/courses/multimodal-rag-chat-with-videos/lesson/2/interactive-demo-and-multimodal-rag-system-architecture)
21
+
22
+ ### A multimodal AI system should be able to understand both text and video content.
23
+
24
+ ## Setup
25
+ ```bash
26
+ python -m venv venv
27
+ source venv/bin/activate
28
+ ```
29
+ For the Fish shell:
30
+ ```bash
31
+ source venv/bin/activate.fish
32
+ ```
33
+
34
+ ## Step 1 - Learn Gradio (UI) (30 mins)
35
+
36
+ Gradio is a powerful Python library for quickly building browser-based UIs. It supports hot reloading for fast development.
37
+
38
+ ### Key Concepts:
39
+ - **fn**: The function wrapped by the UI.
40
+ - **inputs**: The Gradio components used for input (should match function arguments).
41
+ - **outputs**: The Gradio components used for output (should match return values).
42
+
43
+ 📖 [**Gradio Documentation**](https://www.gradio.app/docs/gradio/introduction)
44
+
45
+ Gradio includes **30+ built-in components**.
46
+
47
+ 💡 **Tip**: For `inputs` and `outputs`, you can pass either:
48
+ - The **component name** as a string (e.g., `"textbox"`)
49
+ - An **instance of the component class** (e.g., `gr.Textbox()`)
50
+
51
+ ### Sharing Your Demo
52
+ ```python
53
+ demo.launch(share=True) # Share your demo with just one extra parameter.
54
+ ```
55
+
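+ As a minimal sketch of the `fn` / `inputs` / `outputs` concepts above (the greeting function and component choices are illustrative, not part of this project):
+
+ ```python
+ import gradio as gr
+
+ def greet(name: str) -> str:
+     # The wrapped function: one argument per input component, one return value per output component.
+     return f"Hello, {name}!"
+
+ demo = gr.Interface(
+     fn=greet,              # function wrapped by the UI
+     inputs="textbox",      # component name as a string...
+     outputs=gr.Textbox(),  # ...or an instance of the component class
+ )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)  # share=True exposes a temporary public link
+ ```
+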
56
+ ## Gradio Advanced Features
57
+
58
+ ### **Gradio.Blocks**
59
+ Gradio provides `gr.Blocks`, a flexible way to design web apps with **custom layouts and complex interactions**:
60
+ - Arrange components freely on the page.
61
+ - Handle multiple data flows.
62
+ - Use outputs as inputs for other components.
63
+ - Dynamically update components based on user interaction.
64
+
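+ A small `gr.Blocks` sketch wiring one component's output into another (the components and lambda functions are illustrative):
+
+ ```python
+ import gradio as gr
+
+ with gr.Blocks() as demo:
+     with gr.Row():                       # arrange components freely
+         inp = gr.Textbox(label="Query")
+         out = gr.Textbox(label="Echo")
+     btn = gr.Button("Run")
+     length = gr.Number(label="Length")
+     # First flow: the button click sends inp to a function and writes the result to out.
+     btn.click(fn=lambda q: q.upper(), inputs=inp, outputs=out)
+     # Second flow: out itself is the input that drives another component's update.
+     out.change(fn=lambda s: len(s), inputs=out, outputs=length)
+
+ demo.launch()
+ ```
+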
65
+ ### **Gradio.ChatInterface**
66
+ - Always set `type="messages"` in `gr.ChatInterface`.
67
+ - The default (`type="tuples"`) is **deprecated** and will be removed in future versions.
68
+ - For more UI flexibility, use `gr.ChatBot`.
69
+ - `gr.ChatInterface` supports **Markdown** (not tested yet).
70
+
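+ A minimal `gr.ChatInterface` sketch using the `messages` format (the echo-style bot is a placeholder):
+
+ ```python
+ import gradio as gr
+
+ def respond(message, history):
+     # With type="messages", history is a list of {"role": ..., "content": ...} dicts.
+     return f"You said: {message}"
+
+ demo = gr.ChatInterface(fn=respond, type="messages")
+ demo.launch()
+ ```
+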
71
+ ---
72
+
73
+ ## Step 2 - Learn Bridge Tower Embedding Model (Multimodal Learning) (15 mins)
74
+
75
+ Developed in collaboration with Intel, this model maps image-caption pairs into **512-dimensional vectors**.
76
+
77
+ ### Measuring Similarity
78
+ - **Cosine Similarity** → Measures how close images are in vector space (**efficient & commonly used**).
79
+ - **Euclidean Distance** → Uses `cv2.NORM_L2` to compute similarity between two images.
80
+
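+ A sketch of embedding an image-caption pair and comparing two embeddings with cosine similarity. It assumes the `BridgeTower/bridgetower-large-itm-mlm-itc` checkpoint and the `transformers` contrastive-learning head; the frame paths and captions are placeholders, and the project wraps this logic inside `BridgeTowerEmbeddings`.
+
+ ```python
+ import torch
+ from PIL import Image
+ from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
+
+ CKPT = "BridgeTower/bridgetower-large-itm-mlm-itc"  # assumed checkpoint
+ processor = BridgeTowerProcessor.from_pretrained(CKPT)
+ model = BridgeTowerForContrastiveLearning.from_pretrained(CKPT)
+
+ def embed_pair(image_path: str, caption: str) -> torch.Tensor:
+     inputs = processor(images=Image.open(image_path), text=caption, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs)
+     # cross_embeds is the pooled joint image-text embedding from the contrastive head.
+     return outputs.cross_embeds.squeeze(0)
+
+ e1 = embed_pair("frame_001.jpg", "an astronaut on a spacewalk")
+ e2 = embed_pair("frame_002.jpg", "a view of the space station")
+ print(torch.nn.functional.cosine_similarity(e1, e2, dim=0).item())
+ ```
+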
81
+ ### Converting to 2D for Visualization
82
+ - **UMAP** reduces 512D embeddings to **2D for display purposes**.
83
+
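+ A sketch of the projection step, assuming the `umap-learn` package and a matrix of embeddings stacked into a NumPy array:
+
+ ```python
+ import numpy as np
+ import umap  # pip install umap-learn
+
+ embeddings = np.random.rand(100, 512).astype("float32")   # stand-in for real BridgeTower embeddings
+ reducer = umap.UMAP(n_components=2, metric="cosine", random_state=42)
+ embeddings_2d = reducer.fit_transform(embeddings)          # shape (100, 2), ready for a scatter plot
+ ```
+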
84
+ ## Preprocessing Videos for Multimodal RAG
85
+
86
+ ### **Case 1: WEBVTT → Extracting Text Segments from Video**
87
+ - Converts video + text into structured metadata.
88
+ - Splits content inhttps://www.youtube.com/watch?v=kOEDG3j1bjsto multiple segments.
89
+
90
+ ### **Case 2: Whisper (Small) → Video Only**
91
+ - Extracts **audio** → `model.transcribe()`.
92
+ - Applies `getSubs()` helper function to retrieve **WEBVTT** subtitles.
93
+ - Uses **Case 1** processing.
94
+
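+ A sketch of the Whisper step; the audio path is illustrative, and `getSubs()` is this project's own helper for converting segments into WEBVTT:
+
+ ```python
+ import whisper  # installed from git+https://github.com/openai/whisper.git
+
+ model = whisper.load_model("small")
+ result = model.transcribe("shared_data/videos/yt_video/example/audio.mp3")
+ # result["segments"] carries start/end timestamps plus text for each chunk;
+ # the project's getSubs() helper turns these segments into a WEBVTT subtitle file.
+ print(result["text"][:200])
+ ```
+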
95
+ ### **Case 3: LvLM → Video + Silent/Music Extraction**
96
+ - Uses **Llava (LvLM model)** for **frame-based captioning**.
97
+ - Encodes each frame as a **Base64 image**.
98
+ - Extracts context and captions from video frames.
99
+ - Uses **Case 1** processing.
100
+
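+ A sketch of frame captioning with a local LLaVA model via Ollama, following the commented-out `ollama.generate` call in `app.py` (the frame path and prompt are illustrative):
+
+ ```python
+ import ollama
+
+ frame_path = "shared_data/videos/yt_video/example/extracted_frame/frame_0001.jpg"
+ result = ollama.generate(
+     model="llava",
+     prompt="Describe this video frame in one sentence.",
+     images=[frame_path],   # the frame can also be passed as a Base64-encoded string
+ )
+ print(result["response"])
+ ```
+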
101
+ # Step 4 - What is LLaVA?
102
+ LLaVA (Large Language-and-Vision Assistant) is a large multimodal model that connects a vision encoder to a language model. It doesn't just see images: it understands them, reads the text embedded in them, and reasons about their context.
103
+
104
+ # Step 5 - What is a Vector Store?
105
+ A vector store is a specialized database designed to:
106
+
107
+ - Store and manage high-dimensional vector data efficiently
108
+ - Perform similarity-based searches where K=1 returns the most similar result
109
+
110
+ - In LanceDB specifically, store multiple data types:
111
+   - Text content (captions)
112
+   - Image file paths
113
+   - Metadata
114
+   - Vector embeddings
115
+
116
+ ```python
117
+ _ = MultimodalLanceDB.from_text_image_pairs(
118
+ texts=updated_vid1_trans+vid2_trans,
119
+ image_paths=vid1_img_path+vid2_img_path,
120
+ embedding=BridgeTowerEmbeddings(),
121
+ metadatas=vid1_metadata+vid2_metadata,
122
+ connection=db,
123
+ table_name=TBL_NAME,
124
+ mode="overwrite",
125
+ )
126
+ ```
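+
+ A sketch of querying the store afterwards, mirroring the retriever usage in `app.py` (the query string is illustrative):
+
+ ```python
+ from mm_rag.embeddings.bridgetower_embeddings import BridgeTowerEmbeddings
+ from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
+
+ vectorstore = MultimodalLanceDB(
+     uri="./shared_data/.lancedb",
+     embedding=BridgeTowerEmbeddings(),
+     table_name=TBL_NAME,
+ )
+ retriever = vectorstore.as_retriever(
+     search_type="similarity",
+     search_kwargs={"k": 1},   # K=1 returns only the single most similar caption/frame pair
+ )
+ results = retriever.invoke("an astronaut's spacewalk")
+ print(results[0].page_content)                        # retrieved caption text
+ print(results[0].metadata["extracted_frame_path"])    # path to the matching frame
+ ```
+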
127
+ # Gotchas and Solutions
128
+ - **Image Processing**: When working with base64-encoded images, convert them to `PIL.Image` format before processing with BridgeTower.
129
+ - **Model Selection**: Using `BridgeTowerForContrastiveLearning` instead of Prediction Guard due to API access limitations.
130
+ - **Model Size**: The BridgeTower model requires a ~3.5 GB download.
131
+ - **Image Downloads**: Some Flickr images may be unavailable; implement robust error handling.
132
+ - **Token Decoding**: The BridgeTower contrastive-learning model works with embeddings, not token predictions.
133
+ - **Whisper**: Install from `git+https://github.com/openai/whisper.git`.
134
+
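+ A sketch of the base64 → `PIL.Image` conversion mentioned above (the variable holding the encoded frame is illustrative):
+
+ ```python
+ import base64
+ import io
+ from PIL import Image
+
+ def b64_to_pil(b64_frame: str) -> Image.Image:
+     # Decode the base64 string back into raw bytes, then open it as a PIL image.
+     return Image.open(io.BytesIO(base64.b64decode(b64_frame))).convert("RGB")
+ ```
+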
135
+ # Install ffmpeg using brew
136
+ ```bash
137
+ brew install ffmpeg
138
+ brew link ffmpeg
139
+ ```
140
+
141
+
142
+ # Learning and Skills
143
+
144
+ ## Technical Skills:
145
+
146
+ Basic Machine learning and deep learning
147
+ Vector embeddings and similarity search
148
+ Multimodal data processing
149
+
150
+ ## Framework & Library Expertise:
151
+
152
+ Hugging Face Transformers
153
+ Gradio UI development
154
+ LangChain integration (Basic)
155
+ PyTorch basics
156
+ LanceDB vector storage
157
+
158
+ ## AI/ML Concepts:
159
+
160
+ Multimodal RAG system architecture
161
+ Vector embeddings and similarity search
162
+ Large Language Models (LLaVA)
163
+ Image-text pair processing
164
+ Dimensionality reduction techniques
165
+
166
+
167
+ ## Multimedia Processing:
168
+
169
+ Video frame extraction
170
+ Audio transcription (Whisper)
171
+ Image processing (PIL)
172
+ Base64 encoding/decoding
173
+ WebVTT handling
174
+
175
+ ## System Design:
176
+
177
+ Client-server architecture
178
+ API endpoint design
179
+ Data pipeline construction
180
+ Vector store implementation
181
+ Multimodal system integration
182
+
app.py CHANGED
@@ -1,376 +1,385 @@
1
- from pathlib import Path
2
- import gradio as gr
3
- import os
4
- from PIL import Image
5
- import ollama
6
- from utility import download_video, get_transcript_vtt, extract_meta_data, lvlm_inference_with_phi, lvlm_inference_with_tiny_model, lvlm_inference_with_tiny_model
7
- from mm_rag.embeddings.bridgetower_embeddings import (
8
- BridgeTowerEmbeddings
9
- )
10
- from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
11
- import lancedb
12
- import json
13
- import os
14
- from PIL import Image
15
- from utility import load_json_file, display_retrieved_results
16
- import pyarrow as pa
17
-
18
- # declare host file
19
- LANCEDB_HOST_FILE = "./shared_data/.lancedb"
20
- # declare table name
21
- # initialize vectorstore
22
- db = lancedb.connect(LANCEDB_HOST_FILE)
23
- # initialize an BridgeTower embedder
24
- embedder = BridgeTowerEmbeddings()
25
-
26
- base_dir = "./shared_data/videos/yt_video"
27
- Path(base_dir).mkdir(parents=True, exist_ok=True)
28
-
29
-
30
- def open_table(table_name):
31
- # open a connection to table TBL_NAME
32
- tbl = db.open_table(table_name)
33
-
34
- print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
35
- # display the first 3 rows of the table
36
- tbl.to_pandas()[['text', 'image_path']].head(3)
37
-
38
- def check_if_table_exists(table_name):
39
- return table_name in db.table_names()
40
-
41
- def store_in_rag(vid_table_name, vid_metadata_path):
42
-
43
- # load metadata files
44
-
45
- vid_metadata = load_json_file(vid_metadata_path)
46
-
47
-
48
- vid_subs = [vid['transcript'] for vid in vid_metadata]
49
- vid_img_path = [vid['extracted_frame_path'] for vid in vid_metadata]
50
-
51
-
52
- # for video1, we pick n = 7
53
- n = 7
54
- updated_vid_subs = [
55
- ' '.join(vid_subs[i-int(n/2) : i+int(n/2)]) if i-int(n/2) >= 0 else
56
- ' '.join(vid_subs[0 : i + int(n/2)]) for i in range(len(vid_subs))
57
- ]
58
-
59
- # also need to update the updated transcripts in metadata
60
- for i in range(len(updated_vid_subs)):
61
- vid_metadata[i]['transcript'] = updated_vid_subs[i]
62
-
63
- # you can pass in mode="append"
64
- # to add more entries to the vector store
65
- # in case you want to start with a fresh vector store,
66
- # you can pass in mode="overwrite" instead
67
-
68
-
69
- print("Creating vid_table_name ", vid_table_name)
70
- _ = MultimodalLanceDB.from_text_image_pairs(
71
- texts=updated_vid_subs,
72
- image_paths=vid_img_path,
73
- embedding=embedder,
74
- metadatas=vid_metadata,
75
- connection=db,
76
- table_name=vid_table_name,
77
- mode="overwrite",
78
- )
79
- open_table(vid_table_name)
80
-
81
- return vid_table_name
82
-
83
- def get_metadata_of_yt_video_with_captions(vid_url, from_gen=False):
84
- vid_filepath, vid_folder_path, is_downloaded = download_video(vid_url, base_dir)
85
- if is_downloaded:
86
- print("Video downloaded at ", vid_filepath)
87
- if from_gen:
88
- # Delete existing caption and metadata files if they exist
89
- caption_file = f"{vid_folder_path}/captions.vtt"
90
- metadata_file = f"{vid_folder_path}/metadatas.json"
91
- if os.path.exists(caption_file):
92
- os.remove(caption_file)
93
- print(f"Deleted existing caption file: {caption_file}")
94
- if os.path.exists(metadata_file):
95
- os.remove(metadata_file)
96
- print(f"Deleted existing metadata file: {metadata_file}")
97
-
98
- print("checking transcript")
99
- vid_transcript_filepath = get_transcript_vtt(vid_folder_path, vid_url, vid_filepath, from_gen)
100
- vid_metadata_path = f"{vid_folder_path}/metadatas.json"
101
- print("checking metadatas at", vid_metadata_path)
102
- if os.path.exists(vid_metadata_path):
103
- print('Metadatas already exists')
104
- else:
105
- print("Downloading metadatas for the video ", vid_filepath)
106
- extract_meta_data(vid_folder_path, vid_filepath, vid_transcript_filepath) #should return lowercase file name without spaces
107
-
108
- parent_dir_name = os.path.basename(os.path.dirname(vid_metadata_path))
109
- vid_table_name = f"{parent_dir_name}_table"
110
- print("Checking db and Table name ", vid_table_name)
111
- if not check_if_table_exists(vid_table_name):
112
- print("Table does not exists Storing in RAG")
113
- else:
114
- print("Table exists")
115
- def delete_table(table_name):
116
- db.drop_table(table_name)
117
- print(f"Deleted table {table_name}")
118
- delete_table(vid_table_name)
119
-
120
- store_in_rag(vid_table_name, vid_metadata_path)
121
- return vid_filepath, vid_table_name
122
-
123
- """
124
- def chat_response_llvm(instruction):
125
- #file_path = the_metadatas[0]
126
- file_path = 'shared_data/videos/yt_video/extracted_frame/'
127
- result = ollama.generate(
128
- model='llava',
129
- prompt=instruction,
130
- images=[file_path],
131
- stream=True
132
- )['response']
133
- return result
134
- """
135
-
136
- def return_top_k_most_similar_docs(vid_table_name, query, use_llm=False):
137
- # Initialize results variable outside the if condition
138
- max_docs = 2
139
- print("Querying ", vid_table_name)
140
- vectorstore = MultimodalLanceDB(
141
- uri=LANCEDB_HOST_FILE,
142
- embedding=embedder,
143
- table_name=vid_table_name
144
- )
145
-
146
- retriever = vectorstore.as_retriever(
147
- search_type='similarity',
148
- search_kwargs={"k": max_docs}
149
- )
150
-
151
- # Get results first
152
- results = retriever.invoke(query)
153
-
154
- if use_llm:
155
- # Read captions.vtt file
156
- def read_vtt_file(file_path):
157
- with open(file_path, 'r', encoding='utf-8') as f:
158
- return f.read()
159
-
160
- vid_table_name = vid_table_name.split('_table')[0]
161
- caption_file = 'shared_data/videos/yt_video/' + vid_table_name + '/captions.vtt'
162
- print("Caption file path ", caption_file)
163
- captions = read_vtt_file(caption_file)
164
- prompt = "Answer this query : " + query + " from the content " + captions
165
- print("Prompt ", prompt)
166
- all_page_content = lvlm_inference_with_phi(prompt)
167
- else:
168
- all_page_content = "\n\n".join([result.page_content for result in results])
169
-
170
- page_content = gr.Textbox(all_page_content, label="Response", elem_id='chat-response', visible=True, interactive=False)
171
- image1 = Image.open(results[0].metadata['extracted_frame_path'])
172
- image2_path = results[1].metadata['extracted_frame_path']
173
-
174
- if results[0].metadata['extracted_frame_path'] == image2_path:
175
- image2 = gr.update(visible=False)
176
- else:
177
- image2 = Image.open(image2_path)
178
- image2 = gr.update(value=image2, visible=True)
179
-
180
- return page_content, image1, image2
181
-
182
-
183
- def process_url_and_init(youtube_url, from_gen=False):
184
- url_input = gr.update(visible=False)
185
- submit_btn = gr.update(visible=True)
186
- chatbox = gr.update(visible=True)
187
- submit_btn2 = gr.update(visible=True)
188
- frame1 = gr.update(visible=True)
189
- frame2 = gr.update(visible=False)
190
- chatbox_llm, submit_btn_chat = gr.update(visible=True), gr.update(visible=True)
191
- vid_filepath, vid_table_name = get_metadata_of_yt_video_with_captions(youtube_url, from_gen)
192
- video = gr.Video(vid_filepath,render=True)
193
- return url_input, submit_btn, video, vid_table_name, chatbox,submit_btn2, frame1, frame2, chatbox_llm, submit_btn_chat
194
-
195
- def test_btn():
196
- text = "hi"
197
- res = lvlm_inference_with_phi(text)
198
- response = gr.Textbox(res, visible=True,interactive=False)
199
- return response
200
-
201
- def init_ui():
202
- with gr.Blocks() as demo:
203
-
204
- gr.Markdown("Welcome to video chat demo - Initial processing can take up to 2 minutes, and responses may be slow. Please be patient and avoid clicking repeatedly.")
205
- url_input = gr.Textbox(label="Enter YouTube URL", visible=False, elem_id='url-inp',value="https://www.youtube.com/watch?v=kOEDG3j1bjs", interactive=True)
206
- vid_table_name = gr.Textbox(label="Enter Table Name", visible=False, interactive=False)
207
- video = gr.Video()
208
- with gr.Row():
209
- submit_btn = gr.Button("Process Video By Download Subtitles")
210
- submit_btn_gen = gr.Button("Process Video By Generating Subtitles")
211
-
212
- with gr.Row():
213
- chatbox = gr.Textbox(label="Enter the keyword/s and AI will get related captions and images", visible=False, value="event horizan", scale=4)
214
- submit_btn_whisper = gr.Button("Submit", elem_id='chat-submit', visible=False, scale=1)
215
- with gr.Row():
216
- chatbox_llm = gr.Textbox(label="Ask a Question", visible=False, value="what this video is about?", scale=4)
217
- submit_btn_chat = gr.Button("Ask", visible=False, scale=1)
218
-
219
- response = gr.Textbox(label="Response", elem_id='chat-response', visible=False,interactive=False)
220
-
221
- with gr.Row():
222
- frame1 = gr.Image(visible=False, interactive=False, scale=2)
223
- frame2 = gr.Image(visible=False, interactive=False, scale=2)
224
- submit_btn.click(fn=process_url_and_init, inputs=[url_input], outputs=[url_input, submit_btn, video, vid_table_name, chatbox,submit_btn_whisper, frame1, frame2, chatbox_llm, submit_btn_chat])
225
- submit_btn_gen.click(fn=lambda x: process_url_and_init(x, from_gen=True), inputs=[url_input], outputs=[url_input, submit_btn, video, vid_table_name, chatbox,submit_btn_whisper, frame1, frame2,chatbox_llm, submit_btn_chat])
226
- submit_btn_whisper.click(fn=return_top_k_most_similar_docs, inputs=[vid_table_name, chatbox], outputs=[response, frame1, frame2])
227
-
228
- submit_btn_chat.click(
229
- fn=lambda table_name, query: return_top_k_most_similar_docs(
230
- vid_table_name=table_name,
231
- query=query,
232
- use_llm=True
233
- ),
234
- inputs=[vid_table_name, chatbox_llm],
235
- outputs=[response, frame1, frame2]
236
- )
237
- reset_btn = gr.Button("Reload Page")
238
- reset_btn.click(None, js="() => { location.reload(); }")
239
-
240
- test_llama = gr.Button("Test Llama")
241
- test_llama.click(test_btn, None, outputs=[response])
242
- return demo
243
-
244
- def init_improved_ui():
245
-
246
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
247
- # Header Section with Introduction
248
- with gr.Group():
249
- gr.Markdown("""
250
- # 🎬 Video Analysis Assistant
251
-
252
- ## How it Works:
253
- 1. 📥 Provide a YouTube URL.
254
- 2. 🔄 Choose a processing method:
255
- - Download the video and its captions/subtitles from YouTube.
256
- - Download the video and generate captions using Whisper AI.
257
- The system will load the video in video player for preview and process the video and extract frames from it.
258
- It will then pass the captions and images to the RAG model to store them in the database.
259
- The RAG (Lance DB) uses a pre-trained BridgeTower model to generate embeddings that provide pairs of captions and related images.
260
- 3. 🤖 Analyze video content through:
261
- - Keyword Search - Use this functionality to search for keywords in the video. Our RAG model will return the most relevant captions and images.
262
- - AI-powered Q&A - Use this functionality to ask questions about the video content. Our system will use the Meta/LLaMA model to analyze the captions and images and provide detailed answers.
263
- 4. 📊 Results will be displayed in the response section with related images.
264
-
265
- > **Note**: Initial processing takes several minutes. Please be patient and monitor the logs for progress updates.
266
- """)
267
-
268
- # Video Input Section
269
- with gr.Group():
270
- url_input = gr.Textbox(
271
- label="YouTube URL",
272
- value="https://www.youtube.com/watch?v=kOEDG3j1bjs",
273
- visible=True,
274
- elem_id='url-inp',
275
- interactive=False
276
- )
277
- vid_table_name = gr.Textbox(label="Table Name", visible=False)
278
- video = gr.Video(label="Video Preview")
279
-
280
- with gr.Row():
281
- submit_btn = gr.Button("📥 Process with Existing Subtitles", variant="primary")
282
- submit_btn_gen = gr.Button("🎯 Generate New Subtitles", variant="secondary")
283
-
284
- # Analysis Tools Section
285
- with gr.Group():
286
- gr.Markdown("### 🔍 Analysis Tools")
287
-
288
- with gr.Tab("Keyword Search"):
289
- with gr.Row():
290
- chatbox = gr.Textbox(
291
- label="Search Keywords",
292
- value="event horizon",
293
- visible=False,
294
- scale=4
295
- )
296
- submit_btn_whisper = gr.Button(
297
- "🔎 Search",
298
- elem_id='chat-submit',
299
- visible=False,
300
- scale=1
301
- )
302
-
303
- with gr.Tab("AI Q&A"):
304
- with gr.Row():
305
- chatbox_llm = gr.Textbox(
306
- label="Ask AI about the video",
307
- value="What is this video about?",
308
- visible=False,
309
- scale=4
310
- )
311
- submit_btn_chat = gr.Button(
312
- "🤖 Ask",
313
- visible=False,
314
- scale=1
315
- )
316
-
317
- # Results Display Section
318
- with gr.Group():
319
- gr.Markdown("### 📊 Results")
320
- response = gr.Textbox(
321
- label="AI Response",
322
- elem_id='chat-response',
323
- visible=False,
324
- interactive=False
325
- )
326
-
327
- with gr.Row():
328
- frame1 = gr.Image(visible=False, label="Related Frame 1", scale=2)
329
- frame2 = gr.Image(visible=False, label="Related Frame 2", scale=2)
330
-
331
- # Control Buttons
332
- with gr.Row():
333
- reset_btn = gr.Button("🔄 Start Over", variant="secondary")
334
- test_llama = gr.Button("🧪 Say Hi to Llama", variant="secondary")
335
-
336
- # Event Handlers
337
- submit_btn.click(
338
- fn=process_url_and_init,
339
- inputs=[url_input],
340
- outputs=[url_input, submit_btn, video, vid_table_name,
341
- chatbox, submit_btn_whisper, frame1, frame2,
342
- chatbox_llm, submit_btn_chat]
343
- )
344
-
345
- submit_btn_gen.click(
346
- fn=lambda x: process_url_and_init(x, from_gen=True),
347
- inputs=[url_input],
348
- outputs=[url_input, submit_btn, video, vid_table_name,
349
- chatbox, submit_btn_whisper, frame1, frame2,
350
- chatbox_llm, submit_btn_chat]
351
- )
352
-
353
- submit_btn_whisper.click(
354
- fn=return_top_k_most_similar_docs,
355
- inputs=[vid_table_name, chatbox],
356
- outputs=[response, frame1, frame2]
357
- )
358
-
359
- submit_btn_chat.click(
360
- fn=lambda table_name, query: return_top_k_most_similar_docs(
361
- vid_table_name=table_name,
362
- query=query,
363
- use_llm=True
364
- ),
365
- inputs=[vid_table_name, chatbox_llm],
366
- outputs=[response, frame1, frame2]
367
- )
368
-
369
- reset_btn.click(None, js="() => { location.reload(); }")
370
- test_llama.click(test_btn, None, outputs=[response])
371
-
372
- return demo
373
-
374
- if __name__ == '__main__':
375
- demo = init_improved_ui() # Updated function name here
376
- demo.launch(share=True, debug=True)
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import gradio as gr
3
+ import os
4
+ from PIL import Image
5
+ import ollama
6
+ from utility import download_video, get_transcript_vtt, extract_meta_data, lvlm_inference_with_phi, lvlm_inference_with_tiny_model
7
+ from mm_rag.embeddings.bridgetower_embeddings import (
8
+ BridgeTowerEmbeddings
9
+ )
10
+ from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
11
+ import lancedb
12
+ import json
13
+ import os
14
+ from PIL import Image
15
+ from utility import load_json_file, display_retrieved_results
16
+ import pyarrow as pa
17
+
18
+ # declare host file
19
+ LANCEDB_HOST_FILE = "./shared_data/.lancedb"
20
+ # declare table name
21
+ # initialize vectorstore
22
+ db = lancedb.connect(LANCEDB_HOST_FILE)
23
+ # initialize an BridgeTower embedder
24
+ embedder = BridgeTowerEmbeddings()
25
+ video_processed = False
26
+ base_dir = "./shared_data/videos/yt_video"
27
+ Path(base_dir).mkdir(parents=True, exist_ok=True)
28
+
29
+
30
+ def open_table(table_name):
31
+ # open a connection to table TBL_NAME
32
+ tbl = db.open_table(table_name)
33
+
34
+ print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
35
+ # display the first 3 rows of the table
36
+ tbl.to_pandas()[['text', 'image_path']].head(3)
37
+
38
+
39
+ def check_if_table_exists(table_name):
40
+ return table_name in db.table_names()
41
+
42
+
43
+ def store_in_rag(vid_table_name, vid_metadata_path):
44
+
45
+ # load metadata files
46
+
47
+ vid_metadata = load_json_file(vid_metadata_path)
48
+
49
+ vid_subs = [vid['transcript'] for vid in vid_metadata]
50
+ vid_img_path = [vid['extracted_frame_path'] for vid in vid_metadata]
51
+
52
+ # for video1, we pick n = 7
53
+ n = 7
54
+ updated_vid_subs = [
55
+ ' '.join(vid_subs[i-int(n/2): i+int(n/2)]) if i-int(n/2) >= 0 else
56
+ ' '.join(vid_subs[0: i + int(n/2)]) for i in range(len(vid_subs))
57
+ ]
58
+
59
+ # also need to update the updated transcripts in metadata
60
+ for i in range(len(updated_vid_subs)):
61
+ vid_metadata[i]['transcript'] = updated_vid_subs[i]
62
+
63
+ # you can pass in mode="append"
64
+ # to add more entries to the vector store
65
+ # in case you want to start with a fresh vector store,
66
+ # you can pass in mode="overwrite" instead
67
+
68
+ print("Creating vid_table_name ", vid_table_name)
69
+ _ = MultimodalLanceDB.from_text_image_pairs(
70
+ texts=updated_vid_subs,
71
+ image_paths=vid_img_path,
72
+ embedding=embedder,
73
+ metadatas=vid_metadata,
74
+ connection=db,
75
+ table_name=vid_table_name,
76
+ mode="overwrite",
77
+ )
78
+ open_table(vid_table_name)
79
+
80
+ return vid_table_name
81
+
82
+
83
+ def get_metadata_of_yt_video_with_captions(vid_url, from_gen=False):
84
+ vid_filepath, vid_folder_path, is_downloaded = download_video(
85
+ vid_url, base_dir)
86
+ if is_downloaded:
87
+ print("Video downloaded at ", vid_filepath)
88
+ if from_gen:
89
+ # Delete existing caption and metadata files if they exist
90
+ caption_file = f"{vid_folder_path}/captions.vtt"
91
+ metadata_file = f"{vid_folder_path}/metadatas.json"
92
+ if os.path.exists(caption_file):
93
+ os.remove(caption_file)
94
+ print(f"Deleted existing caption file: {caption_file}")
95
+ if os.path.exists(metadata_file):
96
+ os.remove(metadata_file)
97
+ print(f"Deleted existing metadata file: {metadata_file}")
98
+
99
+ print("checking transcript")
100
+ vid_transcript_filepath = get_transcript_vtt(
101
+ vid_folder_path, vid_url, vid_filepath, from_gen)
102
+ vid_metadata_path = f"{vid_folder_path}/metadatas.json"
103
+ print("checking metadatas at", vid_metadata_path)
104
+ if os.path.exists(vid_metadata_path):
105
+ print('Metadatas already exists')
106
+ else:
107
+ print("Downloading metadatas for the video ", vid_filepath)
108
+ # should return lowercase file name without spaces
109
+ extract_meta_data(vid_folder_path, vid_filepath,
110
+ vid_transcript_filepath)
111
+
112
+ parent_dir_name = os.path.basename(os.path.dirname(vid_metadata_path))
113
+ vid_table_name = f"{parent_dir_name}_table"
114
+ print("Checking db and Table name ", vid_table_name)
115
+ if not check_if_table_exists(vid_table_name):
116
+ print("Table does not exists Storing in RAG")
117
+ else:
118
+ print("Table exists")
119
+
120
+ def delete_table(table_name):
121
+ db.drop_table(table_name)
122
+ print(f"Deleted table {table_name}")
123
+ delete_table(vid_table_name)
124
+
125
+ store_in_rag(vid_table_name, vid_metadata_path)
126
+ return vid_filepath, vid_table_name
127
+
128
+
129
+ def return_top_k_most_similar_docs(vid_table_name, query, use_llm=False):
130
+ if not video_processed:
131
+ raise gr.Error("Please process the video first in Step 1")
132
+ # Initialize results variable outside the if condition
133
+ max_docs = 2
134
+ print("Querying ", vid_table_name)
135
+ vectorstore = MultimodalLanceDB(
136
+ uri=LANCEDB_HOST_FILE,
137
+ embedding=embedder,
138
+ table_name=vid_table_name
139
+ )
140
+
141
+ retriever = vectorstore.as_retriever(
142
+ search_type='similarity',
143
+ search_kwargs={"k": max_docs}
144
+ )
145
+
146
+ # Get results first
147
+ results = retriever.invoke(query)
148
+
149
+ if use_llm:
150
+ # Read captions.vtt file
151
+ def read_vtt_file(file_path):
152
+ with open(file_path, 'r', encoding='utf-8') as f:
153
+ return f.read()
154
+
155
+ vid_table_name = vid_table_name.split('_table')[0]
156
+ caption_file = 'shared_data/videos/yt_video/' + vid_table_name + '/captions.vtt'
157
+ print("Caption file path ", caption_file)
158
+ captions = read_vtt_file(caption_file)
159
+ prompt = "Answer this query : " + query + " from the content " + captions
160
+ print("Prompt ", prompt)
161
+ all_page_content = lvlm_inference_with_phi(prompt)
162
+ else:
163
+ all_page_content = "\n\n".join(
164
+ [result.page_content for result in results])
165
+
166
+ page_content = gr.Textbox(all_page_content, label="Response",
167
+ elem_id='chat-response', visible=True, interactive=False)
168
+ image1 = Image.open(results[0].metadata['extracted_frame_path'])
169
+ image2_path = results[1].metadata['extracted_frame_path']
170
+
171
+ if results[0].metadata['extracted_frame_path'] == image2_path:
172
+ image2 = gr.update(visible=False)
173
+ else:
174
+ image2 = Image.open(image2_path)
175
+ image2 = gr.update(value=image2, visible=True)
176
+
177
+ return page_content, image1, image2
178
+
179
+
180
+ def process_url_and_init(youtube_url, from_gen=False):
181
+ global video_processed
+ video_processed = True
182
+ url_input = gr.update(visible=False)
183
+ submit_btn = gr.update(visible=True)
184
+ chatbox = gr.update(visible=True)
185
+ submit_btn2 = gr.update(visible=True)
186
+ frame1 = gr.update(visible=True)
187
+ frame2 = gr.update(visible=False)
188
+ chatbox_llm, submit_btn_chat = gr.update(
189
+ visible=True), gr.update(visible=True)
190
+ vid_filepath, vid_table_name = get_metadata_of_yt_video_with_captions(
191
+ youtube_url, from_gen)
192
+ video = gr.Video(vid_filepath, render=True)
193
+ return url_input, submit_btn, video, vid_table_name, chatbox, submit_btn2, frame1, frame2, chatbox_llm, submit_btn_chat
194
+
195
+
196
+ def test_btn():
197
+ text = "hi"
198
+ res = lvlm_inference_with_phi(text)
199
+ response = gr.Textbox(res, visible=True, interactive=False)
200
+ return response
201
+
202
+
203
+ def init_ui():
204
+ with gr.Blocks() as demo:
205
+
206
+ gr.Markdown("Welcome to video chat demo - Initial processing can take up to 2 minutes, and responses may be slow. Please be patient and avoid clicking repeatedly.")
207
+ url_input = gr.Textbox(label="Enter YouTube URL", visible=False, elem_id='url-inp',
208
+ value="https://www.youtube.com/watch?v=kOEDG3j1bjs", interactive=True)
209
+ vid_table_name = gr.Textbox(
210
+ label="Enter Table Name", visible=False, interactive=False)
211
+ video = gr.Video()
212
+ with gr.Row():
213
+ submit_btn = gr.Button("Process Video By Download Subtitles")
214
+ submit_btn_gen = gr.Button("Process Video By Generating Subtitles")
215
+
216
+ with gr.Row():
217
+ chatbox = gr.Textbox(label="Enter the keyword/s and AI will get related captions and images",
218
+ visible=False, value="event horizon", scale=4)
219
+ submit_btn_whisper = gr.Button(
220
+ "Submit", elem_id='chat-submit', visible=False, scale=1)
221
+ with gr.Row():
222
+ chatbox_llm = gr.Textbox(
223
+ label="Ask a Question", visible=False, value="what this video is about?", scale=4)
224
+ submit_btn_chat = gr.Button("Ask", visible=False, scale=1)
225
+
226
+ response = gr.Textbox(
227
+ label="Response", elem_id='chat-response', visible=False, interactive=False)
228
+
229
+ with gr.Row():
230
+ frame1 = gr.Image(visible=False, interactive=False, scale=2)
231
+ frame2 = gr.Image(visible=False, interactive=False, scale=2)
232
+ submit_btn.click(fn=process_url_and_init, inputs=[url_input], outputs=[
233
+ url_input, submit_btn, video, vid_table_name, chatbox, submit_btn_whisper, frame1, frame2, chatbox_llm, submit_btn_chat])
234
+ submit_btn_gen.click(fn=lambda x: process_url_and_init(x, from_gen=True), inputs=[url_input], outputs=[
235
+ url_input, submit_btn, video, vid_table_name, chatbox, submit_btn_whisper, frame1, frame2, chatbox_llm, submit_btn_chat])
236
+ submit_btn_whisper.click(fn=return_top_k_most_similar_docs, inputs=[
237
+ vid_table_name, chatbox], outputs=[response, frame1, frame2])
238
+
239
+ submit_btn_chat.click(
240
+ fn=lambda table_name, query: return_top_k_most_similar_docs(
241
+ vid_table_name=table_name,
242
+ query=query,
243
+ use_llm=True
244
+ ),
245
+ inputs=[vid_table_name, chatbox_llm],
246
+ outputs=[response, frame1, frame2]
247
+ )
248
+ reset_btn = gr.Button("Reload Page")
249
+ reset_btn.click(None, js="() => { location.reload(); }")
250
+
251
+ test_llama = gr.Button("Test Llama")
252
+ test_llama.click(test_btn, None, outputs=[response])
253
+ return demo
254
+
255
+
256
+ def init_improved_ui():
257
+
258
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
259
+ # Header Section with Introduction
260
+ with gr.Accordion(label=" # 🎬 Video Analysis Assistant", open=True):
261
+ gr.Markdown("""
262
+ ## How it Works:
263
+ 1. 📥 Provide a YouTube URL.
264
+ 2. 🔄 Choose a processing method:
265
+ - Download the video and its captions/subtitles from YouTube.
266
+ - Download the video and generate captions using Whisper AI.
267
+ The system will load the video in video player for preview and process the video and extract frames from it.
268
+ It will then pass the captions and images to the RAG model to store them in the database.
269
+ The RAG (Lance DB) uses a pre-trained BridgeTower model to generate embeddings that provide pairs of captions and related images.
270
+ 3. 🤖 Analyze video content through:
271
+ - Keyword Search - Use this functionality to search for keywords in the video. Our RAG model will return the most relevant captions and images.
272
+ - AI-powered Q&A - Use this functionality to ask questions about the video content. Our system will use the Meta/LLaMA model to analyze the captions and images and provide detailed answers.
273
+ 4. 📊 Results will be displayed in the response section with related images.
274
+
275
+ > **Note**: Initial processing takes several minutes. Please be patient and monitor the logs for progress updates.
276
+ """)
277
+
278
+ # Video Input Section
279
+ with gr.Group():
280
+ url_input = gr.Textbox(
281
+ label="YouTube URL",
282
+ value="https://www.youtube.com/watch?v=kOEDG3j1bjs",
283
+ visible=True,
284
+ interactive=False
285
+ )
286
+ vid_table_name = gr.Textbox(label="Table Name", visible=False)
287
+ video = gr.Video(label="Video Preview")
288
+
289
+ with gr.Row():
290
+ submit_btn = gr.Button(
291
+ "📥 Step 1: Process with Existing Subtitles", variant="primary", size='md')
292
+ submit_btn_gen = gr.Button(
293
+ "🎯 Generate New Subtitles", variant="secondary", visible=False)
294
+
295
+ # Analysis Tools Section
296
+ with gr.Group():
297
+ gr.Markdown("### 🔍 Step 2: Chat AI about the video")
298
+
299
+ with gr.Row():
300
+ chatbox = gr.Textbox(
301
+ label="Step 2: Search Keywords",
302
+ value="event horizon, black holes, space",
303
+ visible=False
304
+ )
305
+ submit_btn_whisper = gr.Button(
306
+ "🔎 Search",
307
+ visible=False,
308
+ variant="primary"
309
+ )
310
+
311
+ with gr.Row():
312
+ chatbox_llm = gr.Textbox(
313
+ label="",
314
+ value="What is this video about?",
315
+ visible=True
316
+ )
317
+ submit_btn_chat = gr.Button(
318
+ "🤖 Ask",
319
+ visible=True,
320
+ scale=1
321
+ )
322
+
323
+ # Results Display Section
324
+ with gr.Group():
325
+ gr.Markdown("### 📊 AI Response")
326
+ response = gr.Textbox(
327
+ label="AI Response",
328
+ visible=True,
329
+ interactive=False
330
+ )
331
+
332
+ with gr.Row():
333
+ frame1 = gr.Image(
334
+ visible=False, label="Related Frame 1", scale=1)
335
+ frame2 = gr.Image(
336
+ visible=False, label="Related Frame 2", scale=2)
337
+
338
+ # Control Buttons
339
+ with gr.Row():
340
+ reset_btn = gr.Button("🔄 Start Over", variant="secondary")
341
+ test_llama = gr.Button("🧪 Say Hi to Llama",
342
+ visible=False, variant="secondary")
343
+
344
+ # Event Handlers
345
+ submit_btn.click(
346
+ fn=process_url_and_init,
347
+ inputs=[url_input],
348
+ outputs=[url_input, submit_btn, video, vid_table_name,
349
+ chatbox, submit_btn_whisper, frame1, frame2,
350
+ chatbox_llm, submit_btn_chat]
351
+ )
352
+
353
+ submit_btn_gen.click(
354
+ fn=lambda x: process_url_and_init(x, from_gen=True),
355
+ inputs=[url_input],
356
+ outputs=[url_input, submit_btn, video, vid_table_name,
357
+ chatbox, submit_btn_whisper, frame1, frame2,
358
+ chatbox_llm, submit_btn_chat]
359
+ )
360
+
361
+ submit_btn_whisper.click(
362
+ fn=return_top_k_most_similar_docs,
363
+ inputs=[vid_table_name, chatbox],
364
+ outputs=[response, frame1, frame2]
365
+ )
366
+
367
+ submit_btn_chat.click(
368
+ fn=lambda table_name, query: return_top_k_most_similar_docs(
369
+ vid_table_name=table_name,
370
+ query=query,
371
+ use_llm=True
372
+ ),
373
+ inputs=[vid_table_name, chatbox_llm],
374
+ outputs=[response, frame1, frame2]
375
+ )
376
+
377
+ reset_btn.click(None, js="() => { location.reload(); }")
378
+ test_llama.click(test_btn, None, outputs=[response])
379
+
380
+ return demo
381
+
382
+
383
+ if __name__ == '__main__':
384
+ demo = init_improved_ui() # Updated function name here
385
+ demo.launch(share=True, debug=True)
gradio_utils.py CHANGED
@@ -1,483 +1,483 @@
1
- import gradio as gr
2
- import io
3
- import sys
4
- import time
5
- import dataclasses
6
- from pathlib import Path
7
- import os
8
- from enum import auto, Enum
9
- from typing import List, Tuple, Any
10
- from utility import prediction_guard_llava_conv
11
- import lancedb
12
- from utility import load_json_file
13
- from mm_rag.embeddings.bridgetower_embeddings import BridgeTowerEmbeddings
14
- from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
15
- from mm_rag.MLM.client import PredictionGuardClient
16
- from mm_rag.MLM.lvlm import LVLM
17
- from PIL import Image
18
- from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
19
- from moviepy.video.io.VideoFileClip import VideoFileClip
20
- from utility import prediction_guard_llava_conv, encode_image, Conversation, lvlm_inference_with_conversation
21
-
22
- server_error_msg="**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
23
-
24
- # function to split video at a timestamp
25
- def split_video(video_path, timestamp_in_ms, output_video_path: str = "./shared_data/splitted_videos", output_video_name: str="video_tmp.mp4", play_before_sec: int=3, play_after_sec: int=3):
26
- timestamp_in_sec = int(timestamp_in_ms / 1000)
27
- # create output_video_name folder if not exist:
28
- Path(output_video_path).mkdir(parents=True, exist_ok=True)
29
- output_video = os.path.join(output_video_path, output_video_name)
30
- with VideoFileClip(video_path) as video:
31
- duration = video.duration
32
- start_time = max(timestamp_in_sec - play_before_sec, 0)
33
- end_time = min(timestamp_in_sec + play_after_sec, duration)
34
- new = video.subclip(start_time, end_time)
35
- new.write_videofile(output_video, audio_codec='aac')
36
- return output_video
37
-
38
-
39
- prompt_template = """The transcript associated with the image is '{transcript}'. {user_query}"""
40
-
41
- # define default rag_chain
42
- def get_default_rag_chain():
43
- # declare host file
44
- LANCEDB_HOST_FILE = "./shared_data/.lancedb"
45
- # declare table name
46
- TBL_NAME = "demo_tbl"
47
-
48
- # initialize vectorstore
49
- db = lancedb.connect(LANCEDB_HOST_FILE)
50
-
51
- # initialize an BridgeTower embedder
52
- embedder = BridgeTowerEmbeddings()
53
-
54
- ## Creating a LanceDB vector store
55
- vectorstore = MultimodalLanceDB(uri=LANCEDB_HOST_FILE, embedding=embedder, table_name=TBL_NAME)
56
- ### creating a retriever for the vector store
57
- retriever_module = vectorstore.as_retriever(search_type='similarity', search_kwargs={"k": 1})
58
-
59
- # initialize a client as PredictionGuardClient
60
- client = PredictionGuardClient()
61
- # initialize LVLM with the given client
62
- lvlm_inference_module = LVLM(client=client)
63
-
64
- def prompt_processing(input):
65
- # get the retrieved results and user's query
66
- retrieved_results, user_query = input['retrieved_results'], input['user_query']
67
- # get the first retrieved result by default
68
- retrieved_result = retrieved_results[0]
69
- # prompt_template = """The transcript associated with the image is '{transcript}'. {user_query}"""
70
-
71
- # get all metadata of the retrieved video segment
72
- metadata_retrieved_video_segment = retrieved_result.metadata['metadata']
73
-
74
- # get the frame and the corresponding transcript, path to extracted frame, path to whole video, and time stamp of the retrieved video segment.
75
- transcript = metadata_retrieved_video_segment['transcript']
76
- frame_path = metadata_retrieved_video_segment['extracted_frame_path']
77
- return {
78
- 'prompt': prompt_template.format(transcript=transcript, user_query=user_query),
79
- 'image' : frame_path,
80
- 'metadata' : metadata_retrieved_video_segment,
81
- }
82
- # initialize prompt processing module as a Langchain RunnableLambda of function prompt_processing
83
- prompt_processing_module = RunnableLambda(prompt_processing)
84
-
85
- # the output of this new chain will be a dictionary
86
- mm_rag_chain_with_retrieved_image = (
87
- RunnableParallel({"retrieved_results": retriever_module ,
88
- "user_query": RunnablePassthrough()})
89
- | prompt_processing_module
90
- | RunnableParallel({'final_text_output': lvlm_inference_module,
91
- 'input_to_lvlm' : RunnablePassthrough()})
92
- )
93
- return mm_rag_chain_with_retrieved_image
94
-
95
- class SeparatorStyle(Enum):
96
- """Different separator style."""
97
- SINGLE = auto()
98
-
99
- @dataclasses.dataclass
100
- class GradioInstance:
101
- """A class that keeps all conversation history."""
102
- system: str
103
- roles: List[str]
104
- messages: List[List[str]]
105
- offset: int
106
- sep_style: SeparatorStyle = SeparatorStyle.SINGLE
107
- sep: str = "\n"
108
- sep2: str = None
109
- version: str = "Unknown"
110
- path_to_img: str = None
111
- video_title: str = None
112
- path_to_video: str = None
113
- caption: str = None
114
- mm_rag_chain: Any = None
115
-
116
- skip_next: bool = False
117
-
118
- def _template_caption(self):
119
- out = ""
120
- if self.caption is not None:
121
- out = f"The caption associated with the image is '{self.caption}'. "
122
- return out
123
-
124
- def get_prompt_for_rag(self):
125
- messages = self.messages
126
- assert len(messages) == 2, "length of current conversation should be 2"
127
- assert messages[1][1] is None, "the first response message of current conversation should be None"
128
- ret = messages[0][1]
129
- return ret
130
-
131
- def get_conversation_for_lvlm(self):
132
- pg_conv = prediction_guard_llava_conv.copy()
133
- image_path = self.path_to_img
134
- b64_img = encode_image(image_path)
135
- for i, (role, msg) in enumerate(self.messages[self.offset:]):
136
- if msg is None:
137
- break
138
- if i == 0:
139
- pg_conv.append_message(prediction_guard_llava_conv.roles[0], [msg, b64_img])
140
- elif i == len(self.messages[self.offset:]) - 2:
141
- pg_conv.append_message(role, [prompt_template.format(transcript=self.caption, user_query=msg)])
142
- else:
143
- pg_conv.append_message(role, [msg])
144
- return pg_conv
145
-
146
- def append_message(self, role, message):
147
- self.messages.append([role, message])
148
-
149
- def get_images(self, return_pil=False):
150
- images = []
151
- if self.path_to_img is not None:
152
- path_to_image = self.path_to_img
153
- images.append(path_to_image)
154
- return images
155
-
156
- def to_gradio_chatbot(self):
157
- ret = []
158
- for i, (role, msg) in enumerate(self.messages[self.offset:]):
159
- if i % 2 == 0:
160
- if type(msg) is tuple:
161
- import base64
162
- from io import BytesIO
163
- msg, image, image_process_mode = msg
164
- max_hw, min_hw = max(image.size), min(image.size)
165
- aspect_ratio = max_hw / min_hw
166
- max_len, min_len = 800, 400
167
- shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
168
- longest_edge = int(shortest_edge * aspect_ratio)
169
- W, H = image.size
170
- if H > W:
171
- H, W = longest_edge, shortest_edge
172
- else:
173
- H, W = shortest_edge, longest_edge
174
- image = image.resize((W, H))
175
- buffered = BytesIO()
176
- image.save(buffered, format="JPEG")
177
- img_b64_str = base64.b64encode(buffered.getvalue()).decode()
178
- img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
179
- msg = img_str + msg.replace('<image>', '').strip()
180
- ret.append([msg, None])
181
- else:
182
- ret.append([msg, None])
183
- else:
184
- ret[-1][-1] = msg
185
- return ret
186
-
187
- def copy(self):
188
- return GradioInstance(
189
- system=self.system,
190
- roles=self.roles,
191
- messages=[[x, y] for x, y in self.messages],
192
- offset=self.offset,
193
- sep_style=self.sep_style,
194
- sep=self.sep,
195
- sep2=self.sep2,
196
- version=self.version,
197
- mm_rag_chain=self.mm_rag_chain,
198
- )
199
-
200
- def dict(self):
201
- return {
202
- "system": self.system,
203
- "roles": self.roles,
204
- "messages": self.messages,
205
- "offset": self.offset,
206
- "sep": self.sep,
207
- "sep2": self.sep2,
208
- "path_to_img": self.path_to_img,
209
- "video_title" : self.video_title,
210
- "path_to_video": self.path_to_video,
211
- "caption" : self.caption,
212
- }
213
- def get_path_to_subvideos(self):
214
- if self.video_title is not None and self.path_to_img is not None:
215
- info = video_helper_map[self.video_title]
216
- path = info['path']
217
- prefix = info['prefix']
218
- vid_index = self.path_to_img.split('/')[-1]
219
- vid_index = vid_index.split('_')[-1]
220
- vid_index = vid_index.replace('.jpg', '')
221
- ret = f"{prefix}{vid_index}.mp4"
222
- ret = os.path.join(path, ret)
223
- return ret
224
- elif self.path_to_video is not None:
225
- return self.path_to_video
226
- return None
227
-
228
- def get_gradio_instance(mm_rag_chain=None):
229
- if mm_rag_chain is None:
230
- mm_rag_chain = get_default_rag_chain()
231
-
232
- instance = GradioInstance(
233
- system="",
234
- roles=prediction_guard_llava_conv.roles,
235
- messages=[],
236
- offset=0,
237
- sep_style=SeparatorStyle.SINGLE,
238
- sep="\n",
239
- path_to_img=None,
240
- video_title=None,
241
- caption=None,
242
- mm_rag_chain=mm_rag_chain,
243
- )
244
- return instance
245
-
246
- gr.set_static_paths(paths=["./assets/"])
247
- theme = gr.themes.Base(
248
- primary_hue=gr.themes.Color(
249
- c100="#dbeafe", c200="#bfdbfe", c300="#93c5fd", c400="#60a5fa", c50="#eff6ff", c500="#0054ae", c600="#00377c", c700="#00377c", c800="#1e40af", c900="#1e3a8a", c950="#0a0c2b"),
250
- secondary_hue=gr.themes.Color(
251
- c100="#dbeafe", c200="#bfdbfe", c300="#93c5fd", c400="#60a5fa", c50="#eff6ff", c500="#0054ae", c600="#0054ae", c700="#0054ae", c800="#1e40af", c900="#1e3a8a", c950="#1d3660"),
252
- ).set(
253
- body_background_fill_dark='*primary_950',
254
- body_text_color_dark='*neutral_300',
255
- border_color_accent='*primary_700',
256
- border_color_accent_dark='*neutral_800',
257
- block_background_fill_dark='*primary_950',
258
- block_border_width='2px',
259
- block_border_width_dark='2px',
260
- button_primary_background_fill_dark='*primary_500',
261
- button_primary_border_color_dark='*primary_500'
262
- )
263
-
264
- css='''
265
- @font-face {
266
- font-family: IntelOne;
267
- src: url("/file=./assets/intelone-bodytext-font-family-regular.ttf");
268
- }
269
- .gradio-container {background-color: #0a0c2b}
270
- table {
271
- border-collapse: collapse;
272
- border: none;
273
- }
274
- '''
275
-
276
- ## <td style="border-bottom:0"><img src="file/assets/DCAI_logo.png" height="300" width="300"></td>
277
-
278
- # html_title = '''
279
- # <table style="bordercolor=#0a0c2b; border=0">
280
- # <tr style="height:150px; border:0">
281
- # <td style="border:0"><img src="/file=../assets/intel-labs.png" height="100" width="100"></td>
282
- # <td style="vertical-align:bottom; border:0">
283
- # <p style="font-size:xx-large;font-family:IntelOne, Georgia, sans-serif;color: white;">
284
- # Multimodal RAG:
285
- # <br>
286
- # Chat with Videos
287
- # </p>
288
- # </td>
289
- # <td style="border:0"><img src="/file=../assets/gaudi.png" width="100" height="100"></td>
290
-
291
- # <td style="border:0"><img src="/file=../assets/IDC7.png" width="300" height="350"></td>
292
- # <td style="border:0"><img src="/file=../assets/prediction_guard3.png" width="120" height="120"></td>
293
- # </tr>
294
- # </table>
295
-
296
- # '''
297
-
298
- html_title = '''
299
- <table style="bordercolor=#0a0c2b; border=0">
300
- <tr style="height:150px; border:0">
301
- <td style="border:0"><img src="/file=./assets/header.png"></td>
302
- </tr>
303
- </table>
304
-
305
- '''
306
-
307
- #<td style="border:0"><img src="/file=../assets/xeon.png" width="100" height="100"></td>
308
- dropdown_list = [
309
- "What is the name of one of the astronauts?",
310
- "An astronaut's spacewalk",
311
- "What does the astronaut say?",
312
-
313
- ]
314
-
315
- no_change_btn = gr.Button()
316
- enable_btn = gr.Button(interactive=True)
317
- disable_btn = gr.Button(interactive=False)
318
-
319
- def clear_history(state, request: gr.Request):
320
- state = get_gradio_instance(state.mm_rag_chain)
321
- return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 1
322
-
323
- def add_text(state, text, request: gr.Request):
324
- if len(text) <= 0 :
325
- state.skip_next = True
326
- return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 1
327
-
328
- text = text[:1536] # Hard cut-off
329
-
330
- state.append_message(state.roles[0], text)
331
- state.append_message(state.roles[1], None)
332
- state.skip_next = False
333
- return (state, state.to_gradio_chatbot(), "") + (disable_btn,) * 1
334
-
335
- def http_bot(
336
- state, request: gr.Request
337
- ):
338
- start_tstamp = time.time()
339
-
340
- if state.skip_next:
341
- # This generate call is skipped due to invalid inputs
342
- path_to_sub_videos = state.get_path_to_subvideos()
343
- yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (no_change_btn,) * 1
344
- return
345
-
346
- if len(state.messages) == state.offset + 2:
347
- # First round of conversation
348
- new_state = get_gradio_instance(state.mm_rag_chain)
349
- new_state.append_message(new_state.roles[0], state.messages[-2][1])
350
- new_state.append_message(new_state.roles[1], None)
351
- state = new_state
352
-
353
- all_images = state.get_images(return_pil=False)
354
-
355
- # Make requests
356
- is_very_first_query = True
357
- if len(all_images) == 0:
358
- # first query need to do RAG
359
- # Construct prompt
360
- prompt_or_conversation = state.get_prompt_for_rag()
361
- else:
362
- # subsequence queries, no need to do Retrieval
363
- is_very_first_query = False
364
- prompt_or_conversation = state.get_conversation_for_lvlm()
365
-
366
- if is_very_first_query:
367
- executor = state.mm_rag_chain
368
- else:
369
- executor = lvlm_inference_with_conversation
370
-
371
- state.messages[-1][-1] = "▌"
372
- path_to_sub_videos = state.get_path_to_subvideos()
373
- yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (disable_btn,) * 1
374
-
375
- try:
376
- if is_very_first_query:
377
- # get response by invoke executor chain
378
- response = executor.invoke(prompt_or_conversation)
379
- message = response['final_text_output']
380
- if 'metadata' in response['input_to_lvlm']:
381
- metadata = response['input_to_lvlm']['metadata']
382
- if (state.path_to_img is None
383
- and 'input_to_lvlm' in response
384
- and 'image' in response['input_to_lvlm']
385
- ):
386
- state.path_to_img = response['input_to_lvlm']['image']
387
-
388
- if state.path_to_video is None and 'video_path' in metadata:
389
- video_path = metadata['video_path']
390
- mid_time_ms = metadata['mid_time_ms']
391
- splited_video_path = split_video(video_path, mid_time_ms)
392
- state.path_to_video = splited_video_path
393
-
394
- if state.caption is None and 'transcript' in metadata:
395
- state.caption = metadata['transcript']
396
- else:
397
- raise ValueError("Response's format is changed")
398
- else:
399
- # get the response message by directly call PredictionGuardAPI
400
- message = executor(prompt_or_conversation)
401
-
402
- except Exception as e:
403
- print(e)
404
- state.messages[-1][-1] = server_error_msg
405
- yield (state, state.to_gradio_chatbot(), None) + (
406
- enable_btn,
407
- )
408
- return
409
-
410
- state.messages[-1][-1] = message
411
- path_to_sub_videos = state.get_path_to_subvideos()
412
- # path_to_image = state.path_to_img
413
- # caption = state.caption
414
- # # print(path_to_sub_videos)
415
- # # print(path_to_image)
416
- # # print('caption: ', caption)
417
- yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (enable_btn,) * 1
418
-
419
- finish_tstamp = time.time()
420
- return
421
-
422
- def get_demo(rag_chain=None):
423
- if rag_chain is None:
424
- rag_chain = get_default_rag_chain()
425
-
426
- with gr.Blocks(theme=theme, css=css) as demo:
427
- # gr.Markdown(description)
428
- instance = get_gradio_instance(rag_chain)
429
- state = gr.State(instance)
430
- demo.load(
431
- None,
432
- None,
433
- js="""
434
- () => {
435
- const params = new URLSearchParams(window.location.search);
436
- if (!params.has('__theme')) {
437
- params.set('__theme', 'dark');
438
- window.location.search = params.toString();
439
- }
440
- }""",
441
- )
442
- gr.HTML(value=html_title)
443
- with gr.Row():
444
- with gr.Column(scale=4):
445
- video = gr.Video(height=512, width=512, elem_id="video", interactive=False )
446
- with gr.Column(scale=7):
447
- chatbot = gr.Chatbot(
448
- elem_id="chatbot", label="Multimodal RAG Chatbot", height=512,
449
- )
450
- with gr.Row():
451
- with gr.Column(scale=8):
452
- # textbox.render()
453
- textbox = gr.Dropdown(
454
- dropdown_list,
455
- allow_custom_value=True,
456
- # show_label=False,
457
- # container=False,
458
- label="Query",
459
- info="Enter your query here or choose a sample from the dropdown list!"
460
- )
461
- with gr.Column(scale=1, min_width=50):
462
- submit_btn = gr.Button(
463
- value="Send", variant="primary", interactive=True
464
- )
465
- with gr.Row(elem_id="buttons") as button_row:
466
- clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
467
-
468
- btn_list = [clear_btn]
469
-
470
- clear_btn.click(
471
- clear_history, [state], [state, chatbot, textbox, video] + btn_list
472
- )
473
- submit_btn.click(
474
- add_text,
475
- [state, textbox],
476
- [state, chatbot, textbox,] + btn_list,
477
- ).then(
478
- http_bot,
479
- [state],
480
- [state, chatbot, video] + btn_list,
481
- )
482
- return demo
483
-
 
1
+ import gradio as gr
2
+ import io
3
+ import sys
4
+ import time
5
+ import dataclasses
6
+ from pathlib import Path
7
+ import os
8
+ from enum import auto, Enum
9
+ from typing import List, Tuple, Any
10
+ from utility import prediction_guard_llava_conv
11
+ import lancedb
12
+ from utility import load_json_file
13
+ from mm_rag.embeddings.bridgetower_embeddings import BridgeTowerEmbeddings
14
+ from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
15
+ from mm_rag.MLM.client import PredictionGuardClient
16
+ from mm_rag.MLM.lvlm import LVLM
17
+ from PIL import Image
18
+ from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
19
+ from moviepy.video.io.VideoFileClip import VideoFileClip
20
+ from utility import prediction_guard_llava_conv, encode_image, Conversation, lvlm_inference_with_conversation
21
+
22
+ server_error_msg="**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
23
+
24
+ # function to split video at a timestamp
25
+ def split_video(video_path, timestamp_in_ms, output_video_path: str = "./shared_data/splitted_videos", output_video_name: str="video_tmp.mp4", play_before_sec: int=3, play_after_sec: int=3):
26
+ timestamp_in_sec = int(timestamp_in_ms / 1000)
27
+ # create the output folder if it does not exist
28
+ Path(output_video_path).mkdir(parents=True, exist_ok=True)
29
+ output_video = os.path.join(output_video_path, output_video_name)
30
+ with VideoFileClip(video_path) as video:
31
+ duration = video.duration
32
+ start_time = max(timestamp_in_sec - play_before_sec, 0)
33
+ end_time = min(timestamp_in_sec + play_after_sec, duration)
34
+ new = video.subclip(start_time, end_time)
35
+ new.write_videofile(output_video, audio_codec='aac')
36
+ return output_video
37
+
38
+
39
+ prompt_template = """The transcript associated with the image is '{transcript}'. {user_query}"""
40
+
41
+ # define default rag_chain
42
+ def get_default_rag_chain():
43
+ # declare host file
44
+ LANCEDB_HOST_FILE = "./shared_data/.lancedb"
45
+ # declare table name
46
+ TBL_NAME = "demo_tbl"
47
+
48
+ # initialize vectorstore
49
+ db = lancedb.connect(LANCEDB_HOST_FILE)
50
+
51
+ # initialize a BridgeTower embedder
52
+ embedder = BridgeTowerEmbeddings()
53
+
54
+ ## Creating a LanceDB vector store
55
+ vectorstore = MultimodalLanceDB(uri=LANCEDB_HOST_FILE, embedding=embedder, table_name=TBL_NAME)
56
+ ### creating a retriever for the vector store
57
+ retriever_module = vectorstore.as_retriever(search_type='similarity', search_kwargs={"k": 1})
58
+
59
+ # initialize a client as PredictionGuardClient
60
+ client = PredictionGuardClient()
61
+ # initialize LVLM with the given client
62
+ lvlm_inference_module = LVLM(client=client)
63
+
64
+ def prompt_processing(input):
65
+ # get the retrieved results and user's query
66
+ retrieved_results, user_query = input['retrieved_results'], input['user_query']
67
+ # get the first retrieved result by default
68
+ retrieved_result = retrieved_results[0]
69
+ # prompt_template = """The transcript associated with the image is '{transcript}'. {user_query}"""
70
+
71
+ # get all metadata of the retrieved video segment
72
+ metadata_retrieved_video_segment = retrieved_result.metadata['metadata']
73
+
74
+ # get the frame and the corresponding transcript, path to extracted frame, path to whole video, and time stamp of the retrieved video segment.
75
+ transcript = metadata_retrieved_video_segment['transcript']
76
+ frame_path = metadata_retrieved_video_segment['extracted_frame_path']
77
+ return {
78
+ 'prompt': prompt_template.format(transcript=transcript, user_query=user_query),
79
+ 'image' : frame_path,
80
+ 'metadata' : metadata_retrieved_video_segment,
81
+ }
82
+ # initialize prompt processing module as a Langchain RunnableLambda of function prompt_processing
83
+ prompt_processing_module = RunnableLambda(prompt_processing)
84
+
85
+ # the output of this new chain will be a dictionary
86
+ mm_rag_chain_with_retrieved_image = (
87
+ RunnableParallel({"retrieved_results": retriever_module ,
88
+ "user_query": RunnablePassthrough()})
89
+ | prompt_processing_module
90
+ | RunnableParallel({'final_text_output': lvlm_inference_module,
91
+ 'input_to_lvlm' : RunnablePassthrough()})
92
+ )
93
+ return mm_rag_chain_with_retrieved_image
94
+
95
+ class SeparatorStyle(Enum):
96
+ """Different separator style."""
97
+ SINGLE = auto()
98
+
99
+ @dataclasses.dataclass
100
+ class GradioInstance:
101
+ """A class that keeps all conversation history."""
102
+ system: str
103
+ roles: List[str]
104
+ messages: List[List[str]]
105
+ offset: int
106
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
107
+ sep: str = "\n"
108
+ sep2: str = None
109
+ version: str = "Unknown"
110
+ path_to_img: str = None
111
+ video_title: str = None
112
+ path_to_video: str = None
113
+ caption: str = None
114
+ mm_rag_chain: Any = None
115
+
116
+ skip_next: bool = False
117
+
118
+ def _template_caption(self):
119
+ out = ""
120
+ if self.caption is not None:
121
+ out = f"The caption associated with the image is '{self.caption}'. "
122
+ return out
123
+
124
+ def get_prompt_for_rag(self):
125
+ messages = self.messages
126
+ assert len(messages) == 2, "length of current conversation should be 2"
127
+ assert messages[1][1] is None, "the first response message of current conversation should be None"
128
+ ret = messages[0][1]
129
+ return ret
130
+
131
+ def get_conversation_for_lvlm(self):
132
+ pg_conv = prediction_guard_llava_conv.copy()
133
+ image_path = self.path_to_img
134
+ b64_img = encode_image(image_path)
135
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
136
+ if msg is None:
137
+ break
138
+ if i == 0:
139
+ pg_conv.append_message(prediction_guard_llava_conv.roles[0], [msg, b64_img])
140
+ elif i == len(self.messages[self.offset:]) - 2:
141
+ pg_conv.append_message(role, [prompt_template.format(transcript=self.caption, user_query=msg)])
142
+ else:
143
+ pg_conv.append_message(role, [msg])
144
+ return pg_conv
145
+
146
+ def append_message(self, role, message):
147
+ self.messages.append([role, message])
148
+
149
+ def get_images(self, return_pil=False):
150
+ images = []
151
+ if self.path_to_img is not None:
152
+ path_to_image = self.path_to_img
153
+ images.append(path_to_image)
154
+ return images
155
+
156
+ def to_gradio_chatbot(self):
157
+ ret = []
158
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
159
+ if i % 2 == 0:
160
+ if type(msg) is tuple:
161
+ import base64
162
+ from io import BytesIO
163
+ msg, image, image_process_mode = msg
164
+ max_hw, min_hw = max(image.size), min(image.size)
165
+ aspect_ratio = max_hw / min_hw
166
+ max_len, min_len = 800, 400
167
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
168
+ longest_edge = int(shortest_edge * aspect_ratio)
169
+ W, H = image.size
170
+ if H > W:
171
+ H, W = longest_edge, shortest_edge
172
+ else:
173
+ H, W = shortest_edge, longest_edge
174
+ image = image.resize((W, H))
175
+ buffered = BytesIO()
176
+ image.save(buffered, format="JPEG")
177
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
178
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
179
+ msg = img_str + msg.replace('<image>', '').strip()
180
+ ret.append([msg, None])
181
+ else:
182
+ ret.append([msg, None])
183
+ else:
184
+ ret[-1][-1] = msg
185
+ return ret
186
+
187
+ def copy(self):
188
+ return GradioInstance(
189
+ system=self.system,
190
+ roles=self.roles,
191
+ messages=[[x, y] for x, y in self.messages],
192
+ offset=self.offset,
193
+ sep_style=self.sep_style,
194
+ sep=self.sep,
195
+ sep2=self.sep2,
196
+ version=self.version,
197
+ mm_rag_chain=self.mm_rag_chain,
198
+ )
199
+
200
+ def dict(self):
201
+ return {
202
+ "system": self.system,
203
+ "roles": self.roles,
204
+ "messages": self.messages,
205
+ "offset": self.offset,
206
+ "sep": self.sep,
207
+ "sep2": self.sep2,
208
+ "path_to_img": self.path_to_img,
209
+ "video_title" : self.video_title,
210
+ "path_to_video": self.path_to_video,
211
+ "caption" : self.caption,
212
+ }
213
+ def get_path_to_subvideos(self):
214
+ if self.video_title is not None and self.path_to_img is not None:
215
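+ # NOTE: video_helper_map is assumed to be provided elsewhere; it is not defined or imported in this file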
+ info = video_helper_map[self.video_title]
216
+ path = info['path']
217
+ prefix = info['prefix']
218
+ vid_index = self.path_to_img.split('/')[-1]
219
+ vid_index = vid_index.split('_')[-1]
220
+ vid_index = vid_index.replace('.jpg', '')
221
+ ret = f"{prefix}{vid_index}.mp4"
222
+ ret = os.path.join(path, ret)
223
+ return ret
224
+ elif self.path_to_video is not None:
225
+ return self.path_to_video
226
+ return None
227
+
228
+ def get_gradio_instance(mm_rag_chain=None):
229
+ if mm_rag_chain is None:
230
+ mm_rag_chain = get_default_rag_chain()
231
+
232
+ instance = GradioInstance(
233
+ system="",
234
+ roles=prediction_guard_llava_conv.roles,
235
+ messages=[],
236
+ offset=0,
237
+ sep_style=SeparatorStyle.SINGLE,
238
+ sep="\n",
239
+ path_to_img=None,
240
+ video_title=None,
241
+ caption=None,
242
+ mm_rag_chain=mm_rag_chain,
243
+ )
244
+ return instance
245
+
246
+ gr.set_static_paths(paths=["./assets/"])
247
+ theme = gr.themes.Base(
248
+ primary_hue=gr.themes.Color(
249
+ c100="#dbeafe", c200="#bfdbfe", c300="#93c5fd", c400="#60a5fa", c50="#eff6ff", c500="#0054ae", c600="#00377c", c700="#00377c", c800="#1e40af", c900="#1e3a8a", c950="#0a0c2b"),
250
+ secondary_hue=gr.themes.Color(
251
+ c100="#dbeafe", c200="#bfdbfe", c300="#93c5fd", c400="#60a5fa", c50="#eff6ff", c500="#0054ae", c600="#0054ae", c700="#0054ae", c800="#1e40af", c900="#1e3a8a", c950="#1d3660"),
252
+ ).set(
253
+ body_background_fill_dark='*primary_950',
254
+ body_text_color_dark='*neutral_300',
255
+ border_color_accent='*primary_700',
256
+ border_color_accent_dark='*neutral_800',
257
+ block_background_fill_dark='*primary_950',
258
+ block_border_width='2px',
259
+ block_border_width_dark='2px',
260
+ button_primary_background_fill_dark='*primary_500',
261
+ button_primary_border_color_dark='*primary_500'
262
+ )
263
+
264
+ css='''
265
+ @font-face {
266
+ font-family: IntelOne;
267
+ src: url("/file=./assets/intelone-bodytext-font-family-regular.ttf");
268
+ }
269
+ .gradio-container {background-color: #0a0c2b}
270
+ table {
271
+ border-collapse: collapse;
272
+ border: none;
273
+ }
274
+ '''
275
+
276
+ ## <td style="border-bottom:0"><img src="file/assets/DCAI_logo.png" height="300" width="300"></td>
277
+
278
+ # html_title = '''
279
+ # <table style="bordercolor=#0a0c2b; border=0">
280
+ # <tr style="height:150px; border:0">
281
+ # <td style="border:0"><img src="/file=../assets/intel-labs.png" height="100" width="100"></td>
282
+ # <td style="vertical-align:bottom; border:0">
283
+ # <p style="font-size:xx-large;font-family:IntelOne, Georgia, sans-serif;color: white;">
284
+ # Multimodal RAG:
285
+ # <br>
286
+ # Chat with Videos
287
+ # </p>
288
+ # </td>
289
+ # <td style="border:0"><img src="/file=../assets/gaudi.png" width="100" height="100"></td>
290
+
291
+ # <td style="border:0"><img src="/file=../assets/IDC7.png" width="300" height="350"></td>
292
+ # <td style="border:0"><img src="/file=../assets/prediction_guard3.png" width="120" height="120"></td>
293
+ # </tr>
294
+ # </table>
295
+
296
+ # '''
297
+
298
+ html_title = '''
299
+ <table style="bordercolor=#0a0c2b; border=0">
300
+ <tr style="height:150px; border:0">
301
+ <td style="border:0"><img src="/file=./assets/header.png"></td>
302
+ </tr>
303
+ </table>
304
+
305
+ '''
306
+
307
+ #<td style="border:0"><img src="/file=../assets/xeon.png" width="100" height="100"></td>
308
+ dropdown_list = [
309
+ "What is the name of one of the astronauts?",
310
+ "An astronaut's spacewalk",
311
+ "What does the astronaut say?",
312
+
313
+ ]
314
+
315
+ no_change_btn = gr.Button()
316
+ enable_btn = gr.Button(interactive=True)
317
+ disable_btn = gr.Button(interactive=False)
318
+
319
+ def clear_history(state, request: gr.Request):
320
+ state = get_gradio_instance(state.mm_rag_chain)
321
+ return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 1
322
+
323
+ def add_text(state, text, request: gr.Request):
324
+ if len(text) <= 0:
325
+ state.skip_next = True
326
+ return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 1
327
+
328
+ text = text[:1536] # Hard cut-off
329
+
330
+ state.append_message(state.roles[0], text)
331
+ state.append_message(state.roles[1], None)
332
+ state.skip_next = False
333
+ return (state, state.to_gradio_chatbot(), "") + (disable_btn,) * 1
334
+
335
+ def http_bot(
336
+ state, request: gr.Request
337
+ ):
338
+ start_tstamp = time.time()
339
+
340
+ if state.skip_next:
341
+ # This generate call is skipped due to invalid inputs
342
+ path_to_sub_videos = state.get_path_to_subvideos()
343
+ yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (no_change_btn,) * 1
344
+ return
345
+
346
+ if len(state.messages) == state.offset + 2:
347
+ # First round of conversation
348
+ new_state = get_gradio_instance(state.mm_rag_chain)
349
+ new_state.append_message(new_state.roles[0], state.messages[-2][1])
350
+ new_state.append_message(new_state.roles[1], None)
351
+ state = new_state
352
+
353
+ all_images = state.get_images(return_pil=False)
354
+
355
+ # Make requests
356
+ is_very_first_query = True
357
+ if len(all_images) == 0:
358
+ # the first query needs to do RAG retrieval
359
+ # Construct prompt
360
+ prompt_or_conversation = state.get_prompt_for_rag()
361
+ else:
362
+ # subsequent queries do not need retrieval
363
+ is_very_first_query = False
364
+ prompt_or_conversation = state.get_conversation_for_lvlm()
365
+
366
+ if is_very_first_query:
367
+ executor = state.mm_rag_chain
368
+ else:
369
+ executor = lvlm_inference_with_conversation
370
+
371
+ state.messages[-1][-1] = "▌"
372
+ path_to_sub_videos = state.get_path_to_subvideos()
373
+ yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (disable_btn,) * 1
374
+
375
+ try:
376
+ if is_very_first_query:
377
+ # get the response by invoking the executor chain
378
+ response = executor.invoke(prompt_or_conversation)
379
+ message = response['final_text_output']
380
+ if 'metadata' in response['input_to_lvlm']:
381
+ metadata = response['input_to_lvlm']['metadata']
382
+ if (state.path_to_img is None
383
+ and 'input_to_lvlm' in response
384
+ and 'image' in response['input_to_lvlm']
385
+ ):
386
+ state.path_to_img = response['input_to_lvlm']['image']
387
+
388
+ if state.path_to_video is None and 'video_path' in metadata:
389
+ video_path = metadata['video_path']
390
+ mid_time_ms = metadata['mid_time_ms']
391
+ splited_video_path = split_video(video_path, mid_time_ms)
392
+ state.path_to_video = splited_video_path
393
+
394
+ if state.caption is None and 'transcript' in metadata:
395
+ state.caption = metadata['transcript']
396
+ else:
397
+ raise ValueError("Response format has changed")
398
+ else:
399
+ # get the response message by directly calling the PredictionGuard API
400
+ message = executor(prompt_or_conversation)
401
+
402
+ except Exception as e:
403
+ print(e)
404
+ state.messages[-1][-1] = server_error_msg
405
+ yield (state, state.to_gradio_chatbot(), None) + (
406
+ enable_btn,
407
+ )
408
+ return
409
+
410
+ state.messages[-1][-1] = message
411
+ path_to_sub_videos = state.get_path_to_subvideos()
412
+ # path_to_image = state.path_to_img
413
+ # caption = state.caption
414
+ # # print(path_to_sub_videos)
415
+ # # print(path_to_image)
416
+ # # print('caption: ', caption)
417
+ yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (enable_btn,) * 1
418
+
419
+ finish_tstamp = time.time()
420
+ return
421
+
422
+ def get_demo(rag_chain=None):
423
+ if rag_chain is None:
424
+ rag_chain = get_default_rag_chain()
425
+
426
+ with gr.Blocks(theme=theme, css=css) as demo:
427
+ # gr.Markdown(description)
428
+ instance = get_gradio_instance(rag_chain)
429
+ state = gr.State(instance)
430
+ demo.load(
431
+ None,
432
+ None,
433
+ js="""
434
+ () => {
435
+ const params = new URLSearchParams(window.location.search);
436
+ if (!params.has('__theme')) {
437
+ params.set('__theme', 'dark');
438
+ window.location.search = params.toString();
439
+ }
440
+ }""",
441
+ )
442
+ gr.HTML(value=html_title)
443
+ with gr.Row():
444
+ with gr.Column(scale=4):
445
+ video = gr.Video(height=512, width=512, elem_id="video", interactive=False)
446
+ with gr.Column(scale=7):
447
+ chatbot = gr.Chatbot(
448
+ elem_id="chatbot", label="Multimodal RAG Chatbot", height=512,
449
+ )
450
+ with gr.Row():
451
+ with gr.Column(scale=8):
452
+ # textbox.render()
453
+ textbox = gr.Dropdown(
454
+ dropdown_list,
455
+ allow_custom_value=True,
456
+ # show_label=False,
457
+ # container=False,
458
+ label="Query",
459
+ info="Enter your query here or choose a sample from the dropdown list!"
460
+ )
461
+ with gr.Column(scale=1, min_width=50):
462
+ submit_btn = gr.Button(
463
+ value="Send", variant="primary", interactive=True
464
+ )
465
+ with gr.Row(elem_id="buttons") as button_row:
466
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
467
+
468
+ btn_list = [clear_btn]
469
+
470
+ clear_btn.click(
471
+ clear_history, [state], [state, chatbot, textbox, video] + btn_list
472
+ )
473
+ submit_btn.click(
474
+ add_text,
475
+ [state, textbox],
476
+ [state, chatbot, textbox,] + btn_list,
477
+ ).then(
478
+ http_bot,
479
+ [state],
480
+ [state, chatbot, video] + btn_list,
481
+ )
482
+ return demo
483
+
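A minimal launch sketch for the Gradio demo defined above, assuming the file is importable as `app` (the module name is an assumption for illustration, not part of the upload):

    # hypothetical launcher; `app` is an assumed module name for the file above
    from app import get_demo

    demo = get_demo()  # builds the default multimodal RAG chain internally
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)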
mm_rag/MLM/client.py CHANGED
@@ -1,135 +1,135 @@
1
- """Base interface for client making requests/call to visual language model provider API"""
2
-
3
- from abc import ABC, abstractmethod
4
- from typing import List, Optional, Dict, Union, Iterator
5
- import requests
6
- import json
7
- from utility import isBase64, encode_image, encode_image_from_path_or_url, lvlm_inference
8
-
9
- class BaseClient(ABC):
10
- def __init__(self,
11
- hostname: str = "127.0.0.1",
12
- port: int = 8090,
13
- timeout: int = 60,
14
- url: Optional[str] = None):
15
- self.connection_url = f"http://{hostname}:{port}" if url is None else url
16
- self.timeout = timeout
17
- # self.headers = {'Content-Type': 'application/x-www-form-urlencoded'}
18
- self.headers = {'Content-Type': 'application/json'}
19
-
20
- def root(self):
21
- """Request for showing welcome message"""
22
- connection_route = f"{self.connection_url}/"
23
- return requests.get(connection_route)
24
-
25
- @abstractmethod
26
- def generate(self,
27
- prompt: str,
28
- image: str,
29
- **kwargs
30
- ) -> str:
31
- """Send request to visual language model API
32
- and return generated text that was returned by the visual language model API
33
-
34
- Use this method when you want to call visual language model API to generate text without streaming
35
-
36
- Args:
37
- prompt: A prompt.
38
- image: A string that can be either path to image or base64 of an image.
39
- **kwargs: Arbitrary additional keyword arguments.
40
- These are usually passed to the model provider API call as hyperparameter for generation.
41
-
42
- Returns:
43
- Text returned from visual language model provider API call
44
- """
45
-
46
-
47
- def generate_stream(
48
- self,
49
- prompt: str,
50
- image: str,
51
- **kwargs
52
- ) -> Iterator[str]:
53
- """Send request to visual language model API
54
- and return an iterator of streaming text that were returned from the visual language model API call
55
-
56
- Use this method when you want to call visual language model API to stream generated text.
57
-
58
- Args:
59
- prompt: A prompt.
60
- image: A string that can be either path to image or base64 of an image.
61
- **kwargs: Arbitrary additional keyword arguments.
62
- These are usually passed to the model provider API call as hyperparameter for generation.
63
-
64
- Returns:
65
- Iterator of text streamed from visual language model provider API call
66
- """
67
- raise NotImplementedError()
68
-
69
- def generate_batch(
70
- self,
71
- prompt: List[str],
72
- image: List[str],
73
- **kwargs
74
- ) -> List[str]:
75
- """Send a request to visual language model API for multi-batch generation
76
- and return a list of generated text that was returned by the visual language model API
77
-
78
- Use this method when you want to call visual language model API to multi-batch generate text.
79
- Multi-batch generation does not support streaming.
80
-
81
- Args:
82
- prompt: List of prompts.
83
- image: List of strings; each of which can be either path to image or base64 of an image.
84
- **kwargs: Arbitrary additional keyword arguments.
85
- These are usually passed to the model provider API call as hyperparameter for generation.
86
-
87
- Returns:
88
- List of texts returned from visual language model provider API call
89
- """
90
- raise NotImplementedError()
91
-
92
- class PredictionGuardClient(BaseClient):
93
-
94
- generate_kwargs = ['max_tokens',
95
- 'temperature',
96
- 'top_p',
97
- 'top_k']
98
-
99
- def filter_accepted_genkwargs(self, kwargs):
100
- gen_args = {}
101
- if "generate_kwargs" in kwargs and isinstance(kwargs["generate_kwargs"], dict):
102
- gen_args = {k:kwargs["generate_kwargs"][k]
103
- for k in self.generate_kwargs
104
- if k in kwargs["generate_kwargs"]}
105
- return gen_args
106
-
107
- def generate(self,
108
- prompt: str,
109
- image: str,
110
- **kwargs
111
- ) -> str:
112
- """Send request to PredictionGuard's API
113
- and return generated text that was returned by LLAVA model
114
-
115
- Use this method when you want to call LLAVA model API to generate text without streaming
116
-
117
- Args:
118
- prompt: A prompt.
119
- image: A string that can be either path/URL to image or base64 of an image.
120
- **kwargs: Arbitrary additional keyword arguments.
121
- These are usually passed to the model provider API call as hyperparameter for generation.
122
-
123
- Returns:
124
- Text returned from visual language model provider API call
125
- """
126
-
127
- assert image is not None and len(image) != "", "the input image cannot be None, it must be either base64-encoded image or path/URL to image"
128
- if isBase64(image):
129
- base64_image = image
130
- else: # this is path to image or URL to image
131
- base64_image = encode_image_from_path_or_url(image)
132
-
133
- args = self.filter_accepted_genkwargs(kwargs)
134
- return lvlm_inference(prompt=prompt, image=base64_image, **args)
135
 
 
1
+ """Base interface for client making requests/call to visual language model provider API"""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import List, Optional, Dict, Union, Iterator
5
+ import requests
6
+ import json
7
+ from utility import isBase64, encode_image, encode_image_from_path_or_url, lvlm_inference
8
+
9
+ class BaseClient(ABC):
10
+ def __init__(self,
11
+ hostname: str = "127.0.0.1",
12
+ port: int = 8090,
13
+ timeout: int = 60,
14
+ url: Optional[str] = None):
15
+ self.connection_url = f"http://{hostname}:{port}" if url is None else url
16
+ self.timeout = timeout
17
+ # self.headers = {'Content-Type': 'application/x-www-form-urlencoded'}
18
+ self.headers = {'Content-Type': 'application/json'}
19
+
20
+ def root(self):
21
+ """Request for showing welcome message"""
22
+ connection_route = f"{self.connection_url}/"
23
+ return requests.get(connection_route)
24
+
25
+ @abstractmethod
26
+ def generate(self,
27
+ prompt: str,
28
+ image: str,
29
+ **kwargs
30
+ ) -> str:
31
+ """Send request to visual language model API
32
+ and return the generated text returned by the visual language model API
33
+
34
+ Use this method when you want to call the visual language model API to generate text without streaming
35
+
36
+ Args:
37
+ prompt: A prompt.
38
+ image: A string that can be either path to image or base64 of an image.
39
+ **kwargs: Arbitrary additional keyword arguments.
40
+ These are usually passed to the model provider API call as hyperparameter for generation.
41
+
42
+ Returns:
43
+ Text returned from visual language model provider API call
44
+ """
45
+
46
+
47
+ def generate_stream(
48
+ self,
49
+ prompt: str,
50
+ image: str,
51
+ **kwargs
52
+ ) -> Iterator[str]:
53
+ """Send request to visual language model API
54
+ and return an iterator of streamed text chunks returned from the visual language model API call
55
+
56
+ Use this method when you want to call the visual language model API to stream generated text.
57
+
58
+ Args:
59
+ prompt: A prompt.
60
+ image: A string that can be either path to image or base64 of an image.
61
+ **kwargs: Arbitrary additional keyword arguments.
62
+ These are usually passed to the model provider API call as hyperparameter for generation.
63
+
64
+ Returns:
65
+ Iterator of text streamed from visual language model provider API call
66
+ """
67
+ raise NotImplementedError()
68
+
69
+ def generate_batch(
70
+ self,
71
+ prompt: List[str],
72
+ image: List[str],
73
+ **kwargs
74
+ ) -> List[str]:
75
+ """Send a request to visual language model API for multi-batch generation
76
+ and return a list of generated texts returned by the visual language model API
77
+
78
+ Use this method when you want to call the visual language model API to generate text in batches.
79
+ Multi-batch generation does not support streaming.
80
+
81
+ Args:
82
+ prompt: List of prompts.
83
+ image: List of strings; each of which can be either path to image or base64 of an image.
84
+ **kwargs: Arbitrary additional keyword arguments.
85
+ These are usually passed to the model provider API call as hyperparameter for generation.
86
+
87
+ Returns:
88
+ List of texts returned from visual language model provider API call
89
+ """
90
+ raise NotImplementedError()
91
+
92
+ class PredictionGuardClient(BaseClient):
93
+
94
+ generate_kwargs = ['max_tokens',
95
+ 'temperature',
96
+ 'top_p',
97
+ 'top_k']
98
+
99
+ def filter_accepted_genkwargs(self, kwargs):
100
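+ # keep only the generation hyperparameters whitelisted in generate_kwargs above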
+ gen_args = {}
101
+ if "generate_kwargs" in kwargs and isinstance(kwargs["generate_kwargs"], dict):
102
+ gen_args = {k:kwargs["generate_kwargs"][k]
103
+ for k in self.generate_kwargs
104
+ if k in kwargs["generate_kwargs"]}
105
+ return gen_args
106
+
107
+ def generate(self,
108
+ prompt: str,
109
+ image: str,
110
+ **kwargs
111
+ ) -> str:
112
+ """Send request to PredictionGuard's API
113
+ and return the generated text returned by the LLaVA model
114
+
115
+ Use this method when you want to call the LLaVA model API to generate text without streaming
116
+
117
+ Args:
118
+ prompt: A prompt.
119
+ image: A string that can be either path/URL to image or base64 of an image.
120
+ **kwargs: Arbitrary additional keyword arguments.
121
+ These are usually passed to the model provider API call as hyperparameter for generation.
122
+
123
+ Returns:
124
+ Text returned from visual language model provider API call
125
+ """
126
+
127
+ assert image is not None and len(image) != 0, "the input image cannot be None or empty; it must be either a base64-encoded image or a path/URL to an image"
128
+ if isBase64(image):
129
+ base64_image = image
130
+ else: # this is path to image or URL to image
131
+ base64_image = encode_image_from_path_or_url(image)
132
+
133
+ args = self.filter_accepted_genkwargs(kwargs)
134
+ return lvlm_inference(prompt=prompt, image=base64_image, **args)
135
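A quick call sketch for PredictionGuardClient.generate, assuming a local frame path and the default client settings (the path and query below are placeholders):

    # hypothetical example; ./frame.jpg is a placeholder path
    from mm_rag.MLM.client import PredictionGuardClient

    client = PredictionGuardClient()
    answer = client.generate(
        prompt="What is happening in this frame?",
        image="./frame.jpg",  # path, URL, or base64-encoded image
        generate_kwargs={"max_tokens": 128, "temperature": 0.6},
    )
    print(answer)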
 
mm_rag/MLM/lvlm.py CHANGED
@@ -1,301 +1,301 @@
1
- from .client import PredictionGuardClient
2
- from langchain_core.language_models.llms import LLM
3
- from langchain_core.pydantic_v1 import Extra, root_validator
4
- from typing import Any, Optional, List, Dict, Iterator, AsyncIterator
5
- from langchain_core.callbacks import CallbackManagerForLLMRun
6
- from utility import get_from_dict_or_env, MultimodalModelInput
7
-
8
- from langchain_core.runnables import RunnableConfig, ensure_config
9
- from langchain_core.language_models.base import LanguageModelInput
10
- from langchain_core.prompt_values import StringPromptValue
11
- # from langchain_core.outputs import GenerationChunk, LLMResult
12
- from langchain_core.language_models.llms import BaseLLM
13
- from langchain_core.callbacks import (
14
- # CallbackManager,
15
- CallbackManagerForLLMRun,
16
- )
17
- # from langchain_core.load import dumpd
18
- from langchain_core.runnables.config import run_in_executor
19
-
20
- class LVLM(LLM):
21
- """This class extends LLM class for implementing a custom request to LVLM provider API"""
22
-
23
-
24
- client: Any = None #: :meta private:
25
- hostname: Optional[str] = None
26
- port: Optional[int] = None
27
- url: Optional[str] = None
28
- max_new_tokens: Optional[int] = 200
29
- temperature: Optional[float] = 0.6
30
- top_k: Optional[float] = 0
31
- stop: Optional[List[str]] = None
32
- ignore_eos: Optional[bool] = False
33
- do_sample: Optional[bool] = True
34
- lazy_mode: Optional[bool] = True
35
- hpu_graphs: Optional[bool] = True
36
-
37
- @root_validator()
38
- def validate_environment(cls, values: Dict) -> Dict:
39
- """Validate that the access token and python package exists in environment if needed"""
40
- if values['client'] is None:
41
- # check if url of API is provided
42
- url = get_from_dict_or_env(values, 'url', "VLM_URL", None)
43
- if url is None:
44
- hostname = get_from_dict_or_env(values, 'hostname', 'VLM_HOSTNAME', None)
45
- port = get_from_dict_or_env(values, 'port', 'VLM_PORT', None)
46
- if hostname is not None and port is not None:
47
- values['client'] = PredictionGuardClient(hostname=hostname, port=port)
48
- else:
49
- # using default hostname and port to create Client
50
- values['client'] = PredictionGuardClient()
51
- else:
52
- values['client'] = PredictionGuardClient(url=url)
53
- return values
54
-
55
- @property
56
- def _llm_type(self) -> str:
57
- """Return type of llm"""
58
- return "Large Vision Language Model"
59
-
60
- @property
61
- def _default_params(self) -> Dict[str, Any]:
62
- """Get the default parameters for calling the Prediction Guard API."""
63
- return {
64
- "max_tokens": self.max_new_tokens,
65
- "temperature": self.temperature,
66
- "top_k": self.top_k,
67
- "ignore_eos": self.ignore_eos,
68
- "do_sample": self.do_sample,
69
- "stop" : self.stop,
70
- }
71
-
72
- def get_params(self, **kwargs):
73
- params = self._default_params
74
- params.update(kwargs)
75
- return params
76
-
77
-
78
- def _call(
79
- self,
80
- prompt: str,
81
- image: str,
82
- stop: Optional[List[str]] = None,
83
- run_manager: Optional[CallbackManagerForLLMRun] = None,
84
- **kwargs: Any,
85
- ) -> str:
86
- """Run the VLM on the given input.
87
-
88
- Args:
89
- prompt: The prompt to generate from.
90
- image: This can be either path to image or base64 encode of the image.
91
- stop: Stop words to use when generating. Model output is cut off at the
92
- first occurrence of any of the stop substrings.
93
- If stop tokens are not supported consider raising NotImplementedError.
94
- Returns:
95
- The model output as a string. Actual completions DOES NOT include the prompt
96
- Example: TBD
97
- """
98
- params = {}
99
- if stop is not None:
100
- raise ValueError("stop kwargs are not permitted.")
101
- params['generate_kwargs'] = self.get_params(**kwargs)
102
- response = self.client.generate(prompt=prompt, image=image, **params)
103
- return response
104
-
105
- def _stream(
106
- self,
107
- prompt: str,
108
- image: str,
109
- stop: Optional[List[str]] = None,
110
- run_manager: Optional[CallbackManagerForLLMRun] = None,
111
- **kwargs: Any,
112
- ) -> Iterator[str]:
113
- """Stream the VLM on the given prompt and image.
114
-
115
- Args:
116
- prompt: The prompt to generate from.
117
- image: This can be either path to image or base64 encode of the image.
118
- stop: Stop words to use when generating. Model output is cut off at the
119
- first occurrence of any of the stop substrings.
120
- If stop tokens are not supported consider raising NotImplementedError.
121
- Returns:
122
- The model outputs an iterator of string. Actual completions DOES NOT include the prompt
123
- Example: TBD
124
- """
125
- params = {}
126
- params['generate_kwargs'] = self.get_params(**kwargs)
127
- for chunk in self.client.generate_stream(prompt=prompt, image=image, **params):
128
- yield chunk
129
-
130
- async def _astream(
131
- self,
132
- prompt: str,
133
- image: str,
134
- stop: Optional[List[str]] = None,
135
- run_manager: Optional[CallbackManagerForLLMRun] = None,
136
- **kwargs: Any,
137
- ) -> AsyncIterator[str]:
138
- """An async version of _stream method that stream the VLM on the given prompt and image.
139
-
140
- Args:
141
- prompt: The prompt to generate from.
142
- image: This can be either path to image or base64 encode of the image.
143
- stop: Stop words to use when generating. Model output is cut off at the
144
- first occurrence of any of the stop substrings.
145
- If stop tokens are not supported consider raising NotImplementedError.
146
- Returns:
147
- The model outputs an async iterator of string. Actual completions DOES NOT include the prompt
148
- Example: TBD
149
- """
150
- iterator = await run_in_executor(
151
- None,
152
- self._stream,
153
- prompt,
154
- image,
155
- stop,
156
- run_manager.get_sync() if run_manager else None,
157
- **kwargs,
158
- )
159
- done = object()
160
- while True:
161
- item = await run_in_executor(
162
- None,
163
- next,
164
- iterator,
165
- done, # type: ignore[call-arg, arg-type]
166
- )
167
- if item is done:
168
- break
169
- yield item # type: ignore[misc]
170
-
171
- def invoke(
172
- self,
173
- input: MultimodalModelInput,
174
- config: Optional[RunnableConfig] = None,
175
- *,
176
- stop: Optional[List[str]] = None,
177
- **kwargs: Any,
178
- ) -> str:
179
- config = ensure_config(config)
180
- if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
181
- return (
182
- self.generate_prompt(
183
- [self._convert_input(StringPromptValue(text=input['prompt']))],
184
- stop=stop,
185
- callbacks=config.get("callbacks"),
186
- tags=config.get("tags"),
187
- metadata=config.get("metadata"),
188
- run_name=config.get("run_name"),
189
- run_id=config.pop("run_id", None),
190
- image= input['image'],
191
- **kwargs,
192
- )
193
- .generations[0][0]
194
- .text
195
- )
196
- return (
197
- self.generate_prompt(
198
- [self._convert_input(input)],
199
- stop=stop,
200
- callbacks=config.get("callbacks"),
201
- tags=config.get("tags"),
202
- metadata=config.get("metadata"),
203
- run_name=config.get("run_name"),
204
- run_id=config.pop("run_id", None),
205
- **kwargs,
206
- )
207
- .generations[0][0]
208
- .text
209
- )
210
-
211
- async def ainvoke(
212
- self,
213
- input: MultimodalModelInput,
214
- config: Optional[RunnableConfig] = None,
215
- *,
216
- stop: Optional[List[str]] = None,
217
- **kwargs: Any,
218
- ) -> str:
219
- config = ensure_config(config)
220
- if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
221
- llm_result = await self.agenerate_prompt(
222
- [self._convert_input(StringPromptValue(text=input['prompt']))],
223
- stop=stop,
224
- callbacks=config.get("callbacks"),
225
- tags=config.get("tags"),
226
- metadata=config.get("metadata"),
227
- run_name=config.get("run_name"),
228
- run_id=config.pop("run_id", None),
229
- image=input['image'],
230
- **kwargs,
231
- )
232
- else:
233
- llm_result = await self.agenerate_prompt(
234
- [self._convert_input(input)],
235
- stop=stop,
236
- callbacks=config.get("callbacks"),
237
- tags=config.get("tags"),
238
- metadata=config.get("metadata"),
239
- run_name=config.get("run_name"),
240
- run_id=config.pop("run_id", None),
241
- **kwargs,
242
- )
243
- return llm_result.generations[0][0].text
244
-
245
- def stream(
246
- self,
247
- input: MultimodalModelInput,
248
- config: Optional[RunnableConfig] = None,
249
- *,
250
- stop: Optional[List[str]] = None,
251
- **kwargs: Any,
252
- ) -> Iterator[str]:
253
- if type(self)._stream == BaseLLM._stream:
254
- # model doesn't implement streaming, so use default implementation
255
- yield self.invoke(input, config=config, stop=stop, **kwargs)
256
- else:
257
- if stop is not None:
258
- raise ValueError("stop kwargs are not permitted.")
259
- image = None
260
- prompt = None
261
- if isinstance(input, dict) and 'prompt' in input.keys():
262
- prompt = self._convert_input(input['prompt']).to_string()
263
- else:
264
- raise ValueError("prompt must be provided")
265
- if isinstance(input, dict) and 'image' in input.keys():
266
- image = input['image']
267
-
268
- for chunk in self._stream(
269
- prompt=prompt, image=image, **kwargs
270
- ):
271
- yield chunk
272
-
273
- async def astream(
274
- self,
275
- input: LanguageModelInput,
276
- config: Optional[RunnableConfig] = None,
277
- *,
278
- stop: Optional[List[str]] = None,
279
- **kwargs: Any,
280
- ) -> AsyncIterator[str]:
281
- if (
282
- type(self)._astream is BaseLLM._astream
283
- and type(self)._stream is BaseLLM._stream
284
- ):
285
- yield await self.ainvoke(input, config=config, stop=stop, **kwargs)
286
- return
287
- else:
288
- if stop is not None:
289
- raise ValueError("stop kwargs are not permitted.")
290
- image = None
291
- if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
292
- prompt = self._convert_input(input['prompt']).to_string()
293
- image = input['image']
294
- else:
295
- raise ValueError("missing image is not permitted")
296
- prompt = self._convert_input(input).to_string()
297
-
298
- async for chunk in self._astream(
299
- prompt=prompt, image=image, **kwargs
300
- ):
301
  yield chunk
 
1
+ from .client import PredictionGuardClient
2
+ from langchain_core.language_models.llms import LLM
3
+ from langchain_core.pydantic_v1 import Extra, root_validator
4
+ from typing import Any, Optional, List, Dict, Iterator, AsyncIterator
5
+ from langchain_core.callbacks import CallbackManagerForLLMRun
6
+ from utility import get_from_dict_or_env, MultimodalModelInput
7
+
8
+ from langchain_core.runnables import RunnableConfig, ensure_config
9
+ from langchain_core.language_models.base import LanguageModelInput
10
+ from langchain_core.prompt_values import StringPromptValue
11
+ # from langchain_core.outputs import GenerationChunk, LLMResult
12
+ from langchain_core.language_models.llms import BaseLLM
13
+ from langchain_core.callbacks import (
14
+ # CallbackManager,
15
+ CallbackManagerForLLMRun,
16
+ )
17
+ # from langchain_core.load import dumpd
18
+ from langchain_core.runnables.config import run_in_executor
19
+
20
+ class LVLM(LLM):
21
+ """This class extends the LLM class to implement custom requests to an LVLM provider API"""
22
+
23
+
24
+ client: Any = None #: :meta private:
25
+ hostname: Optional[str] = None
26
+ port: Optional[int] = None
27
+ url: Optional[str] = None
28
+ max_new_tokens: Optional[int] = 200
29
+ temperature: Optional[float] = 0.6
30
+ top_k: Optional[float] = 0
31
+ stop: Optional[List[str]] = None
32
+ ignore_eos: Optional[bool] = False
33
+ do_sample: Optional[bool] = True
34
+ lazy_mode: Optional[bool] = True
35
+ hpu_graphs: Optional[bool] = True
36
+
37
+ @root_validator()
38
+ def validate_environment(cls, values: Dict) -> Dict:
39
+ """Validate that the access token and Python package exist in the environment if needed"""
40
+ if values['client'] is None:
41
+ # check if url of API is provided
42
+ url = get_from_dict_or_env(values, 'url', "VLM_URL", None)
43
+ if url is None:
44
+ hostname = get_from_dict_or_env(values, 'hostname', 'VLM_HOSTNAME', None)
45
+ port = get_from_dict_or_env(values, 'port', 'VLM_PORT', None)
46
+ if hostname is not None and port is not None:
47
+ values['client'] = PredictionGuardClient(hostname=hostname, port=port)
48
+ else:
49
+ # using default hostname and port to create Client
50
+ values['client'] = PredictionGuardClient()
51
+ else:
52
+ values['client'] = PredictionGuardClient(url=url)
53
+ return values
54
+
55
+ @property
56
+ def _llm_type(self) -> str:
57
+ """Return type of llm"""
58
+ return "Large Vision Language Model"
59
+
60
+ @property
61
+ def _default_params(self) -> Dict[str, Any]:
62
+ """Get the default parameters for calling the Prediction Guard API."""
63
+ return {
64
+ "max_tokens": self.max_new_tokens,
65
+ "temperature": self.temperature,
66
+ "top_k": self.top_k,
67
+ "ignore_eos": self.ignore_eos,
68
+ "do_sample": self.do_sample,
69
+ "stop" : self.stop,
70
+ }
71
+
72
+ def get_params(self, **kwargs):
73
+ params = self._default_params
74
+ params.update(kwargs)
75
+ return params
76
+
77
+
78
+ def _call(
79
+ self,
80
+ prompt: str,
81
+ image: str,
82
+ stop: Optional[List[str]] = None,
83
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
84
+ **kwargs: Any,
85
+ ) -> str:
86
+ """Run the VLM on the given input.
87
+
88
+ Args:
89
+ prompt: The prompt to generate from.
90
+ image: This can be either a path to an image or a base64 encoding of the image.
91
+ stop: Stop words to use when generating. Model output is cut off at the
92
+ first occurrence of any of the stop substrings.
93
+ If stop tokens are not supported consider raising NotImplementedError.
94
+ Returns:
95
+ The model output as a string. Actual completions DO NOT include the prompt
96
+ Example: TBD
97
+ """
98
+ params = {}
99
+ if stop is not None:
100
+ raise ValueError("stop kwargs are not permitted.")
101
+ params['generate_kwargs'] = self.get_params(**kwargs)
102
+ response = self.client.generate(prompt=prompt, image=image, **params)
103
+ return response
104
+
105
+ def _stream(
106
+ self,
107
+ prompt: str,
108
+ image: str,
109
+ stop: Optional[List[str]] = None,
110
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
111
+ **kwargs: Any,
112
+ ) -> Iterator[str]:
113
+ """Stream the VLM on the given prompt and image.
114
+
115
+ Args:
116
+ prompt: The prompt to generate from.
117
+ image: This can be either a path to an image or a base64 encoding of the image.
118
+ stop: Stop words to use when generating. Model output is cut off at the
119
+ first occurrence of any of the stop substrings.
120
+ If stop tokens are not supported consider raising NotImplementedError.
121
+ Returns:
122
+ The model outputs an iterator of strings. Actual completions DO NOT include the prompt
123
+ Example: TBD
124
+ """
125
+ params = {}
126
+ params['generate_kwargs'] = self.get_params(**kwargs)
127
+ for chunk in self.client.generate_stream(prompt=prompt, image=image, **params):
128
+ yield chunk
129
+
130
+ async def _astream(
131
+ self,
132
+ prompt: str,
133
+ image: str,
134
+ stop: Optional[List[str]] = None,
135
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
136
+ **kwargs: Any,
137
+ ) -> AsyncIterator[str]:
138
+ """An async version of the _stream method that streams the VLM on the given prompt and image.
139
+
140
+ Args:
141
+ prompt: The prompt to generate from.
142
+ image: This can be either a path to an image or a base64 encoding of the image.
143
+ stop: Stop words to use when generating. Model output is cut off at the
144
+ first occurrence of any of the stop substrings.
145
+ If stop tokens are not supported consider raising NotImplementedError.
146
+ Returns:
147
+ The model outputs an async iterator of strings. Actual completions DO NOT include the prompt
148
+ Example: TBD
149
+ """
150
+ iterator = await run_in_executor(
151
+ None,
152
+ self._stream,
153
+ prompt,
154
+ image,
155
+ stop,
156
+ run_manager.get_sync() if run_manager else None,
157
+ **kwargs,
158
+ )
159
+ done = object()
160
+ while True:
161
+ item = await run_in_executor(
162
+ None,
163
+ next,
164
+ iterator,
165
+ done, # type: ignore[call-arg, arg-type]
166
+ )
167
+ if item is done:
168
+ break
169
+ yield item # type: ignore[misc]
170
+
171
+ def invoke(
172
+ self,
173
+ input: MultimodalModelInput,
174
+ config: Optional[RunnableConfig] = None,
175
+ *,
176
+ stop: Optional[List[str]] = None,
177
+ **kwargs: Any,
178
+ ) -> str:
179
+ config = ensure_config(config)
180
+ if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
181
+ return (
182
+ self.generate_prompt(
183
+ [self._convert_input(StringPromptValue(text=input['prompt']))],
184
+ stop=stop,
185
+ callbacks=config.get("callbacks"),
186
+ tags=config.get("tags"),
187
+ metadata=config.get("metadata"),
188
+ run_name=config.get("run_name"),
189
+ run_id=config.pop("run_id", None),
190
+ image= input['image'],
191
+ **kwargs,
192
+ )
193
+ .generations[0][0]
194
+ .text
195
+ )
196
+ return (
197
+ self.generate_prompt(
198
+ [self._convert_input(input)],
199
+ stop=stop,
200
+ callbacks=config.get("callbacks"),
201
+ tags=config.get("tags"),
202
+ metadata=config.get("metadata"),
203
+ run_name=config.get("run_name"),
204
+ run_id=config.pop("run_id", None),
205
+ **kwargs,
206
+ )
207
+ .generations[0][0]
208
+ .text
209
+ )
210
+
211
+ async def ainvoke(
212
+ self,
213
+ input: MultimodalModelInput,
214
+ config: Optional[RunnableConfig] = None,
215
+ *,
216
+ stop: Optional[List[str]] = None,
217
+ **kwargs: Any,
218
+ ) -> str:
219
+ config = ensure_config(config)
220
+ if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
221
+ llm_result = await self.agenerate_prompt(
222
+ [self._convert_input(StringPromptValue(text=input['prompt']))],
223
+ stop=stop,
224
+ callbacks=config.get("callbacks"),
225
+ tags=config.get("tags"),
226
+ metadata=config.get("metadata"),
227
+ run_name=config.get("run_name"),
228
+ run_id=config.pop("run_id", None),
229
+ image=input['image'],
230
+ **kwargs,
231
+ )
232
+ else:
233
+ llm_result = await self.agenerate_prompt(
234
+ [self._convert_input(input)],
235
+ stop=stop,
236
+ callbacks=config.get("callbacks"),
237
+ tags=config.get("tags"),
238
+ metadata=config.get("metadata"),
239
+ run_name=config.get("run_name"),
240
+ run_id=config.pop("run_id", None),
241
+ **kwargs,
242
+ )
243
+ return llm_result.generations[0][0].text
244
+
245
+ def stream(
246
+ self,
247
+ input: MultimodalModelInput,
248
+ config: Optional[RunnableConfig] = None,
249
+ *,
250
+ stop: Optional[List[str]] = None,
251
+ **kwargs: Any,
252
+ ) -> Iterator[str]:
253
+ if type(self)._stream == BaseLLM._stream:
254
+ # model doesn't implement streaming, so use default implementation
255
+ yield self.invoke(input, config=config, stop=stop, **kwargs)
256
+ else:
257
+ if stop is not None:
258
+ raise ValueError("stop kwargs are not permitted.")
259
+ image = None
260
+ prompt = None
261
+ if isinstance(input, dict) and 'prompt' in input.keys():
262
+ prompt = self._convert_input(input['prompt']).to_string()
263
+ else:
264
+ raise ValueError("prompt must be provided")
265
+ if isinstance(input, dict) and 'image' in input.keys():
266
+ image = input['image']
267
+
268
+ for chunk in self._stream(
269
+ prompt=prompt, image=image, **kwargs
270
+ ):
271
+ yield chunk
272
+
273
+ async def astream(
274
+ self,
275
+ input: LanguageModelInput,
276
+ config: Optional[RunnableConfig] = None,
277
+ *,
278
+ stop: Optional[List[str]] = None,
279
+ **kwargs: Any,
280
+ ) -> AsyncIterator[str]:
281
+ if (
282
+ type(self)._astream is BaseLLM._astream
283
+ and type(self)._stream is BaseLLM._stream
284
+ ):
285
+ yield await self.ainvoke(input, config=config, stop=stop, **kwargs)
286
+ return
287
+ else:
288
+ if stop is not None:
289
+ raise ValueError("stop kwargs are not permitted.")
290
+ image = None
291
+ if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
292
+ prompt = self._convert_input(input['prompt']).to_string()
293
+ image = input['image']
294
+ else:
295
+ raise ValueError("missing image is not permitted")
296
+ prompt = self._convert_input(input).to_string()
297
+
298
+ async for chunk in self._astream(
299
+ prompt=prompt, image=image, **kwargs
300
+ ):
301
  yield chunk
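A hedged sketch of invoking the LVLM runnable above with a prompt/image dict, assuming a PredictionGuardClient can be constructed (the frame path is a placeholder):

    # hypothetical example; ./frame.jpg is a placeholder path
    from mm_rag.MLM.client import PredictionGuardClient
    from mm_rag.MLM.lvlm import LVLM

    lvlm = LVLM(client=PredictionGuardClient())
    text = lvlm.invoke({
        "prompt": "Describe the scene in this frame.",
        "image": "./frame.jpg",  # path or base64-encoded image
    })
    print(text)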
mm_rag/embeddings/bridgetower_embeddings.py CHANGED
@@ -1,89 +1,89 @@
1
- from typing import List
2
- from langchain_core.embeddings import Embeddings
3
- import torch
4
- from transformers import (
5
- BridgeTowerProcessor,
6
- BridgeTowerForContrastiveLearning
7
- )
8
- from langchain_core.pydantic_v1 import (
9
- BaseModel,
10
- )
11
- from lrn_vector_embeddings import bt_embeddings_from_local
12
- from utility import encode_image, bt_embedding_from_prediction_guard
13
- from tqdm import tqdm
14
- from PIL import Image
15
- class BridgeTowerEmbeddings(BaseModel, Embeddings):
16
- """ BridgeTower embedding model """
17
-
18
- def embed_documents(self, texts: List[str]) -> List[List[float]]:
19
- """Embed a list of documents using BridgeTower.
20
-
21
- Args:
22
- texts: The list of texts to embed.
23
-
24
- Returns:
25
- List of embeddings, one for each text.
26
- """
27
-
28
- embeddings = []
29
- img = Image.new('RGB', (100, 100))
30
- for text in texts:
31
- embedding = bt_embeddings_from_local(text, img)
32
- embeddings.append(embedding)
33
- return embeddings
34
-
35
- def embed_query(self, text: str) -> List[float]:
36
- """Embed a query using BridgeTower.
37
-
38
- Args:
39
- text: The text to embed.
40
-
41
- Returns:
42
- Embeddings for the text as a flat list of floats.
43
- """
44
- # Get embeddings
45
- embeddings = self.embed_documents([text])[0]
46
-
47
- # If embeddings is a dict, extract the text embeddings
48
- if isinstance(embeddings, dict):
49
- embeddings = embeddings["text_embeddings"]
50
-
51
- # If embeddings is a nested list or tensor, flatten it
52
- if isinstance(embeddings, (list, torch.Tensor)) and len(embeddings) == 1:
53
- embeddings = embeddings[0]
54
-
55
- # Convert tensor to list if needed
56
- if torch.is_tensor(embeddings):
57
- embeddings = embeddings.detach().tolist()
58
-
59
- return embeddings
60
-
61
-
62
- def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
63
- """Embed a list of image-text pairs using BridgeTower.
64
-
65
- Args:
66
- texts: The list of texts to embed.
67
- images: The list of path-to-images to embed
68
- batch_size: the batch size to process, default to 2
69
- Returns:
70
- List of embeddings, one for each image-text pairs.
71
- """
72
-
73
- # the length of texts must be equal to the length of images
74
- assert len(texts)==len(images), "the len of captions should be equal to the len of images"
75
-
76
- processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
77
- model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
78
-
79
-
80
-
81
- embeddings = []
82
- for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
83
- inputs = processor(text=[text], images=[Image.open(path_to_img)], return_tensors="pt")
84
- outputs = model(**inputs)
85
- # Get embeddings and convert to list
86
- embedding = outputs.text_embeds.detach().numpy().tolist()[0]
87
- embeddings.append(embedding)
88
-
89
  return embeddings
 
1
+ from typing import List
2
+ from langchain_core.embeddings import Embeddings
3
+ import torch
4
+ from transformers import (
5
+ BridgeTowerProcessor,
6
+ BridgeTowerForContrastiveLearning
7
+ )
8
+ from langchain_core.pydantic_v1 import (
9
+ BaseModel,
10
+ )
11
+ from lrn_vector_embeddings import bt_embeddings_from_local
12
+ from utility import encode_image, bt_embedding_from_prediction_guard
13
+ from tqdm import tqdm
14
+ from PIL import Image
15
+ class BridgeTowerEmbeddings(BaseModel, Embeddings):
16
+ """ BridgeTower embedding model """
17
+
18
+ def embed_documents(self, texts: List[str]) -> List[List[float]]:
19
+ """Embed a list of documents using BridgeTower.
20
+
21
+ Args:
22
+ texts: The list of texts to embed.
23
+
24
+ Returns:
25
+ List of embeddings, one for each text.
26
+ """
27
+
28
+ embeddings = []
29
+ img = Image.new('RGB', (100, 100))
30
+ for text in texts:
31
+ embedding = bt_embeddings_from_local(text, img)
32
+ embeddings.append(embedding)
33
+ return embeddings
34
+
35
+ def embed_query(self, text: str) -> List[float]:
36
+ """Embed a query using BridgeTower.
37
+
38
+ Args:
39
+ text: The text to embed.
40
+
41
+ Returns:
42
+ Embeddings for the text as a flat list of floats.
43
+ """
44
+ # Get embeddings
45
+ embeddings = self.embed_documents([text])[0]
46
+
47
+ # If embeddings is a dict, extract the text embeddings
48
+ if isinstance(embeddings, dict):
49
+ embeddings = embeddings["text_embeddings"]
50
+
51
+ # If embeddings is a nested list or tensor, flatten it
52
+ if isinstance(embeddings, (list, torch.Tensor)) and len(embeddings) == 1:
53
+ embeddings = embeddings[0]
54
+
55
+ # Convert tensor to list if needed
56
+ if torch.is_tensor(embeddings):
57
+ embeddings = embeddings.detach().tolist()
58
+
59
+ return embeddings
60
+
61
+
62
+ def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
63
+ """Embed a list of image-text pairs using BridgeTower.
64
+
65
+ Args:
66
+ texts: The list of texts to embed.
67
+ images: The list of path-to-images to embed
68
+ batch_size: the batch size to process, default to 2
69
+ Returns:
70
+ List of embeddings, one for each image-text pairs.
71
+ """
72
+
73
+ # the length of texts must be equal to the length of images
74
+ assert len(texts)==len(images), "the len of captions should be equal to the len of images"
75
+
76
+ processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
77
+ model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
78
+
79
+
80
+
81
+ embeddings = []
82
+ for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
83
+ inputs = processor(text=[text], images=[Image.open(path_to_img)], return_tensors="pt")
84
+ outputs = model(**inputs)
85
+ # Get embeddings and convert to list
86
+ embedding = outputs.text_embeds.detach().numpy().tolist()[0]
87
+ embeddings.append(embedding)
88
+
89
  return embeddings
mm_rag/vectorstores/multimodal_lancedb.py CHANGED
@@ -1,131 +1,131 @@
1
- from typing import Any, Iterable, List, Optional
2
- from langchain_core.embeddings import Embeddings
3
- import uuid
4
- from langchain_community.vectorstores.lancedb import LanceDB
5
-
6
- class MultimodalLanceDB(LanceDB):
7
- """`LanceDB` vector store to process multimodal data
8
-
9
- To use, you should have ``lancedb`` python package installed.
10
- You can install it with ``pip install lancedb``.
11
-
12
- Args:
13
- connection: LanceDB connection to use. If not provided, a new connection
14
- will be created.
15
- embedding: Embedding to use for the vectorstore.
16
- vector_key: Key to use for the vector in the database. Defaults to ``vector``.
17
- id_key: Key to use for the id in the database. Defaults to ``id``.
18
- text_key: Key to use for the text in the database. Defaults to ``text``.
19
- image_path_key: Key to use for the path to image in the database. Defaults to ``image_path``.
20
- table_name: Name of the table to use. Defaults to ``vectorstore``.
21
- api_key: API key to use for LanceDB cloud database.
22
- region: Region to use for LanceDB cloud database.
23
- mode: Mode to use for adding data to the table. Defaults to ``overwrite``.
24
-
25
-
26
-
27
- Example:
28
- .. code-block:: python
29
- vectorstore = MultimodalLanceDB(uri='/lancedb', embedding_function)
30
- vectorstore.add_texts(['text1', 'text2'])
31
- result = vectorstore.similarity_search('text1')
32
- """
33
-
34
- def __init__(
35
- self,
36
- connection: Optional[Any] = None,
37
- embedding: Optional[Embeddings] = None,
38
- uri: Optional[str] = "/tmp/lancedb",
39
- vector_key: Optional[str] = "vector",
40
- id_key: Optional[str] = "id",
41
- text_key: Optional[str] = "text",
42
- image_path_key: Optional[str] = "image_path",
43
- table_name: Optional[str] = "vectorstore",
44
- api_key: Optional[str] = None,
45
- region: Optional[str] = None,
46
- mode: Optional[str] = "append",
47
- ):
48
- super(MultimodalLanceDB, self).__init__(connection, embedding, uri, vector_key, id_key, text_key, table_name, api_key, region, mode)
49
- self._image_path_key = image_path_key
50
-
51
- def add_text_image_pairs(
52
- self,
53
- texts: Iterable[str],
54
- image_paths: Iterable[str],
55
- metadatas: Optional[List[dict]] = None,
56
- ids: Optional[List[str]] = None,
57
- **kwargs: Any,
58
- ) -> List[str]:
59
- """Turn text-image pairs into embedding and add it to the database
60
-
61
- Args:
62
- texts: Iterable of strings to combine with corresponding images to add to the vectorstore.
63
- images: Iterable of path-to-images as strings to combine with corresponding texts to add to the vectorstore.
64
- metadatas: Optional list of metadatas associated with the texts.
65
- ids: Optional list of ids to associate w ith the texts.
66
-
67
- Returns:
68
- List of ids of the added text-image pairs.
69
- """
70
- # the length of texts must be equal to the length of images
71
- assert len(texts)==len(image_paths), "the len of transcripts should be equal to the len of images"
72
-
73
- # Embed texts and create documents
74
- docs = []
75
- ids = ids or [str(uuid.uuid4()) for _ in texts]
76
- embeddings = self._embedding.embed_image_text_pairs(texts=list(texts), images=list(image_paths)) # type: ignore
77
- for idx, text in enumerate(texts):
78
- embedding = embeddings[idx]
79
- metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
80
- docs.append(
81
- {
82
- self._vector_key: embedding,
83
- self._id_key: ids[idx],
84
- self._text_key: text,
85
- self._image_path_key : image_paths[idx],
86
- "metadata": metadata,
87
- }
88
- )
89
-
90
- if 'mode' in kwargs:
91
- mode = kwargs['mode']
92
- else:
93
- mode = self.mode
94
- if self._table_name in self._connection.table_names():
95
- tbl = self._connection.open_table(self._table_name)
96
- if self.api_key is None:
97
- tbl.add(docs, mode=mode)
98
- else:
99
- tbl.add(docs)
100
- else:
101
- self._connection.create_table(self._table_name, data=docs)
102
- return ids
103
-
104
- @classmethod
105
- def from_text_image_pairs(
106
- cls,
107
- texts: List[str],
108
- image_paths: List[str],
109
- embedding: Embeddings,
110
- metadatas: Optional[List[dict]] = None,
111
- connection: Any = None,
112
- vector_key: Optional[str] = "vector",
113
- id_key: Optional[str] = "id",
114
- text_key: Optional[str] = "text",
115
- image_path_key: Optional[str] = "image_path",
116
- table_name: Optional[str] = "vectorstore",
117
- **kwargs: Any,
118
- ):
119
-
120
- instance = MultimodalLanceDB(
121
- connection=connection,
122
- embedding=embedding,
123
- vector_key=vector_key,
124
- id_key=id_key,
125
- text_key=text_key,
126
- image_path_key=image_path_key,
127
- table_name=table_name,
128
- )
129
- instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)
130
-
131
  return instance
 
1
+ from typing import Any, Iterable, List, Optional
2
+ from langchain_core.embeddings import Embeddings
3
+ import uuid
4
+ from langchain_community.vectorstores.lancedb import LanceDB
5
+
6
+ class MultimodalLanceDB(LanceDB):
7
+ """`LanceDB` vector store to process multimodal data
8
+
9
+ To use, you should have ``lancedb`` python package installed.
10
+ You can install it with ``pip install lancedb``.
11
+
12
+ Args:
13
+ connection: LanceDB connection to use. If not provided, a new connection
14
+ will be created.
15
+ embedding: Embedding to use for the vectorstore.
16
+ vector_key: Key to use for the vector in the database. Defaults to ``vector``.
17
+ id_key: Key to use for the id in the database. Defaults to ``id``.
18
+ text_key: Key to use for the text in the database. Defaults to ``text``.
19
+ image_path_key: Key to use for the path to image in the database. Defaults to ``image_path``.
20
+ table_name: Name of the table to use. Defaults to ``vectorstore``.
21
+ api_key: API key to use for LanceDB cloud database.
22
+ region: Region to use for LanceDB cloud database.
23
+ mode: Mode to use for adding data to the table. Defaults to ``overwrite``.
24
+
25
+
26
+
27
+ Example:
28
+ .. code-block:: python
29
+ vectorstore = MultimodalLanceDB(uri='/lancedb', embedding_function)
30
+ vectorstore.add_texts(['text1', 'text2'])
31
+ result = vectorstore.similarity_search('text1')
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ connection: Optional[Any] = None,
37
+ embedding: Optional[Embeddings] = None,
38
+ uri: Optional[str] = "/tmp/lancedb",
39
+ vector_key: Optional[str] = "vector",
40
+ id_key: Optional[str] = "id",
41
+ text_key: Optional[str] = "text",
42
+ image_path_key: Optional[str] = "image_path",
43
+ table_name: Optional[str] = "vectorstore",
44
+ api_key: Optional[str] = None,
45
+ region: Optional[str] = None,
46
+ mode: Optional[str] = "append",
47
+ ):
48
+ super(MultimodalLanceDB, self).__init__(connection, embedding, uri, vector_key, id_key, text_key, table_name, api_key, region, mode)
49
+ self._image_path_key = image_path_key
50
+
51
+ def add_text_image_pairs(
52
+ self,
53
+ texts: Iterable[str],
54
+ image_paths: Iterable[str],
55
+ metadatas: Optional[List[dict]] = None,
56
+ ids: Optional[List[str]] = None,
57
+ **kwargs: Any,
58
+ ) -> List[str]:
59
+ """Turn text-image pairs into embedding and add it to the database
60
+
61
+ Args:
62
+ texts: Iterable of strings to combine with corresponding images to add to the vectorstore.
63
+ images: Iterable of path-to-images as strings to combine with corresponding texts to add to the vectorstore.
64
+ metadatas: Optional list of metadatas associated with the texts.
65
+ ids: Optional list of ids to associate w ith the texts.
66
+
67
+ Returns:
68
+ List of ids of the added text-image pairs.
69
+ """
70
+ # the length of texts must be equal to the length of images
71
+ assert len(texts)==len(image_paths), "the len of transcripts should be equal to the len of images"
72
+
73
+ # Embed texts and create documents
74
+ docs = []
75
+ ids = ids or [str(uuid.uuid4()) for _ in texts]
76
+ embeddings = self._embedding.embed_image_text_pairs(texts=list(texts), images=list(image_paths)) # type: ignore
77
+ for idx, text in enumerate(texts):
78
+ embedding = embeddings[idx]
79
+ metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
80
+ docs.append(
81
+ {
82
+ self._vector_key: embedding,
83
+ self._id_key: ids[idx],
84
+ self._text_key: text,
85
+ self._image_path_key : image_paths[idx],
86
+ "metadata": metadata,
87
+ }
88
+ )
89
+
90
+ if 'mode' in kwargs:
91
+ mode = kwargs['mode']
92
+ else:
93
+ mode = self.mode
94
+ if self._table_name in self._connection.table_names():
95
+ tbl = self._connection.open_table(self._table_name)
96
+ if self.api_key is None:
97
+ tbl.add(docs, mode=mode)
98
+ else:
99
+ tbl.add(docs)
100
+ else:
101
+ self._connection.create_table(self._table_name, data=docs)
102
+ return ids
103
+
104
+ @classmethod
105
+ def from_text_image_pairs(
106
+ cls,
107
+ texts: List[str],
108
+ image_paths: List[str],
109
+ embedding: Embeddings,
110
+ metadatas: Optional[List[dict]] = None,
111
+ connection: Any = None,
112
+ vector_key: Optional[str] = "vector",
113
+ id_key: Optional[str] = "id",
114
+ text_key: Optional[str] = "text",
115
+ image_path_key: Optional[str] = "image_path",
116
+ table_name: Optional[str] = "vectorstore",
117
+ **kwargs: Any,
118
+ ):
119
+
120
+ instance = MultimodalLanceDB(
121
+ connection=connection,
122
+ embedding=embedding,
123
+ vector_key=vector_key,
124
+ id_key=id_key,
125
+ text_key=text_key,
126
+ image_path_key=image_path_key,
127
+ table_name=table_name,
128
+ )
129
+ instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)
130
+
131
  return instance
requirements.txt CHANGED
@@ -1,25 +1,25 @@
1
- gradio
2
- langchain-predictionguard
3
- IPython
4
- umap-learn
5
- pytubefix
6
- youtube_transcript_api
7
- torch
8
- transformers
9
- matplotlib
10
- seaborn
11
- datasets
12
- moviepy
13
- whisper
14
- webvtt-py
15
- tqdm
16
- lancedb
17
- langchain-core
18
- langchain-community
19
- ollama
20
- opencv-python
21
- openai-whisper
22
- huggingface_hub[cli]
23
- huggingface_hub
24
- pillow
25
- accelerate>=0.26.0
 
1
+ gradio
2
+ langchain-predictionguard
3
+ IPython
4
+ umap-learn
5
+ pytubefix
6
+ youtube_transcript_api
7
+ torch
8
+ transformers
9
+ matplotlib
10
+ seaborn
11
+ datasets
12
+ moviepy
13
+ whisper
14
+ webvtt-py
15
+ tqdm
16
+ lancedb
17
+ langchain-core
18
+ langchain-community
19
+ ollama
20
+ opencv-python
21
+ openai-whisper
22
+ huggingface_hub[cli]
23
+ huggingface_hub
24
+ pillow
25
+ accelerate>=0.26.0
s6_prepare_video_input.py CHANGED
@@ -1,90 +1,90 @@
1
- from pathlib import Path
2
- import os
3
- from os import path as osp
4
- import whisper
5
- from moviepy import VideoFileClip
6
- from PIL import Image
7
- from utility import download_video, extract_meta_data, get_transcript_vtt, getSubs
8
- from urllib.request import urlretrieve
9
- from IPython.display import display
10
- import ollama
11
-
12
- def demp_video_input_that_has_transcript():
13
- # first video's url
14
- vid_url = "https://www.youtube.com/watch?v=7Hcg-rLYwdM"
15
-
16
- # download Youtube video to ./shared_data/videos/video1
17
- vid_dir = "./shared_data/videos/video1"
18
- vid_filepath = download_video(vid_url, vid_dir)
19
-
20
- # download Youtube video's subtitle to ./shared_data/videos/video1
21
- vid_transcript_filepath = get_transcript_vtt(vid_url, vid_dir)
22
-
23
- return extract_meta_data(vid_dir, vid_filepath, vid_transcript_filepath)
24
-
25
- def demp_video_input_that_has_no_transcript():
26
- # second video's url
27
- vid_url=(
28
- "https://multimedia-commons.s3-us-west-2.amazonaws.com/"
29
- "data/videos/mp4/010/a07/010a074acb1975c4d6d6e43c1faeb8.mp4"
30
- )
31
- vid_dir = "./shared_data/videos/video2"
32
- vid_name = "toddler_in_playground.mp4"
33
-
34
- # create folder to which video2 will be downloaded
35
- Path(vid_dir).mkdir(parents=True, exist_ok=True)
36
- vid_filepath = urlretrieve(
37
- vid_url,
38
- osp.join(vid_dir, vid_name)
39
- )[0]
40
-
41
- path_to_video_no_transcript = vid_filepath
42
-
43
- # declare where to save .mp3 audio
44
- path_to_extracted_audio_file = os.path.join(vid_dir, 'audio.mp3')
45
-
46
- # extract mp3 audio file from mp4 video video file
47
- clip = VideoFileClip(path_to_video_no_transcript)
48
- clip.audio.write_audiofile(path_to_extracted_audio_file)
49
-
50
- model = whisper.load_model("small")
51
- options = dict(task="translate", best_of=1, language='en')
52
- results = model.transcribe(path_to_extracted_audio_file, **options)
53
-
54
- vtt = getSubs(results["segments"], "vtt")
55
-
56
- # path to save generated transcript of video1
57
- path_to_generated_trans = osp.join(vid_dir, 'generated_video1.vtt')
58
- # write transcription to file
59
- with open(path_to_generated_trans, 'w') as f:
60
- f.write(vtt)
61
-
62
- return extract_meta_data(vid_dir, vid_filepath, path_to_generated_trans)
63
-
64
-
65
-
66
- def ask_llvm(instruction, file_path):
67
- result = ollama.generate(
68
- model='llava',
69
- prompt=instruction,
70
- images=[file_path],
71
- stream=False
72
- )['response']
73
- img=Image.open(file_path, mode='r')
74
- img = img.resize([int(i/1.2) for i in img.size])
75
- display(img)
76
- for i in result.split('.'):
77
- print(i, end='', flush=True)
78
- if __name__ == "__main__":
79
- meta_data = demp_video_input_that_has_transcript()
80
-
81
- meta_data1 = demp_video_input_that_has_no_transcript()
82
- data = meta_data1[1]
83
- caption = data['transcript']
84
- print(f'Generated caption is: "{caption}"')
85
- frame = Image.open(data['extracted_frame_path'])
86
- display(frame)
87
- instruction = "Can you describe the image?"
88
- ask_llvm(instruction, data['extracted_frame_path'])
89
- #print(meta_data)
90
 
 
1
+ from pathlib import Path
2
+ import os
3
+ from os import path as osp
4
+ import whisper
5
+ from moviepy import VideoFileClip
6
+ from PIL import Image
7
+ from utility import download_video, extract_meta_data, get_transcript_vtt, getSubs
8
+ from urllib.request import urlretrieve
9
+ from IPython.display import display
10
+ import ollama
11
+
12
+ def demp_video_input_that_has_transcript():
13
+ # first video's url
14
+ vid_url = "https://www.youtube.com/watch?v=7Hcg-rLYwdM"
15
+
16
+ # download Youtube video to ./shared_data/videos/video1
17
+ vid_dir = "./shared_data/videos/video1"
18
+ vid_filepath = download_video(vid_url, vid_dir)
19
+
20
+ # download Youtube video's subtitle to ./shared_data/videos/video1
21
+ vid_transcript_filepath = get_transcript_vtt(vid_url, vid_dir)
22
+
23
+ return extract_meta_data(vid_dir, vid_filepath, vid_transcript_filepath)
24
+
25
+ def demp_video_input_that_has_no_transcript():
26
+ # second video's url
27
+ vid_url=(
28
+ "https://multimedia-commons.s3-us-west-2.amazonaws.com/"
29
+ "data/videos/mp4/010/a07/010a074acb1975c4d6d6e43c1faeb8.mp4"
30
+ )
31
+ vid_dir = "./shared_data/videos/video2"
32
+ vid_name = "toddler_in_playground.mp4"
33
+
34
+ # create folder to which video2 will be downloaded
35
+ Path(vid_dir).mkdir(parents=True, exist_ok=True)
36
+ vid_filepath = urlretrieve(
37
+ vid_url,
38
+ osp.join(vid_dir, vid_name)
39
+ )[0]
40
+
41
+ path_to_video_no_transcript = vid_filepath
42
+
43
+ # declare where to save .mp3 audio
44
+ path_to_extracted_audio_file = os.path.join(vid_dir, 'audio.mp3')
45
+
46
+ # extract mp3 audio file from mp4 video video file
47
+ clip = VideoFileClip(path_to_video_no_transcript)
48
+ clip.audio.write_audiofile(path_to_extracted_audio_file)
49
+
50
+ model = whisper.load_model("small")
51
+ options = dict(task="translate", best_of=1, language='en')
52
+ results = model.transcribe(path_to_extracted_audio_file, **options)
53
+
54
+ vtt = getSubs(results["segments"], "vtt")
55
+
56
+ # path to save generated transcript of video1
57
+ path_to_generated_trans = osp.join(vid_dir, 'generated_video1.vtt')
58
+ # write transcription to file
59
+ with open(path_to_generated_trans, 'w') as f:
60
+ f.write(vtt)
61
+
62
+ return extract_meta_data(vid_dir, vid_filepath, path_to_generated_trans)
63
+
64
+
65
+
66
+ def ask_llvm(instruction, file_path):
67
+ result = ollama.generate(
68
+ model='llava',
69
+ prompt=instruction,
70
+ images=[file_path],
71
+ stream=False
72
+ )['response']
73
+ img=Image.open(file_path, mode='r')
74
+ img = img.resize([int(i/1.2) for i in img.size])
75
+ display(img)
76
+ for i in result.split('.'):
77
+ print(i, end='', flush=True)
78
+ if __name__ == "__main__":
79
+ meta_data = demp_video_input_that_has_transcript()
80
+
81
+ meta_data1 = demp_video_input_that_has_no_transcript()
82
+ data = meta_data1[1]
83
+ caption = data['transcript']
84
+ print(f'Generated caption is: "{caption}"')
85
+ frame = Image.open(data['extracted_frame_path'])
86
+ display(frame)
87
+ instruction = "Can you describe the image?"
88
+ ask_llvm(instruction, data['extracted_frame_path'])
89
+ #print(meta_data)
90
 
s7_store_in_rag.py CHANGED
@@ -1,105 +1,105 @@
1
- from mm_rag.embeddings.bridgetower_embeddings import (
2
- BridgeTowerEmbeddings
3
- )
4
- from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
5
- import lancedb
6
- import json
7
- import os
8
- from PIL import Image
9
- from utility import load_json_file, display_retrieved_results
10
- import pyarrow as pa
11
-
12
- # declare host file
13
- LANCEDB_HOST_FILE = "./shared_data/.lancedb"
14
- # declare table name
15
- TBL_NAME = "test_tbl"
16
- # initialize vectorstore
17
- db = lancedb.connect(LANCEDB_HOST_FILE)
18
- # initialize an BridgeTower embedder
19
- embedder = BridgeTowerEmbeddings()
20
-
21
-
22
- def return_top_k_most_similar_docs(max_docs=3):
23
- # ask to return top 3 most similar documents
24
- # Creating a LanceDB vector store
25
- vectorstore = MultimodalLanceDB(
26
- uri=LANCEDB_HOST_FILE,
27
- embedding=embedder,
28
- table_name=TBL_NAME)
29
-
30
- # creating a retriever for the vector store
31
- # search_type="similarity"
32
- # declares that the type of search that the Retriever should perform
33
- # is similarity search
34
- # search_kwargs={"k": 1} means returning top-1 most similar document
35
-
36
-
37
- retriever = vectorstore.as_retriever(
38
- search_type='similarity',
39
- search_kwargs={"k": max_docs})
40
- query2 = (
41
- "an astronaut's spacewalk "
42
- "with an amazing view of the earth from space behind"
43
- )
44
- results2 = retriever.invoke(query2)
45
- display_retrieved_results(results2)
46
- query3 = "a group of astronauts"
47
- results3 = retriever.invoke(query3)
48
- display_retrieved_results(results3)
49
-
50
-
51
- def open_table(TBL_NAME):
52
- # open a connection to table TBL_NAME
53
- tbl = db.open_table()
54
-
55
- print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
56
- # display the first 3 rows of the table
57
- tbl.to_pandas()[['text', 'image_path']].head(3)
58
-
59
- def store_in_rag():
60
-
61
- # load metadata files
62
- vid1_metadata_path = './shared_data/videos/video1/metadatas.json'
63
- vid2_metadata_path = './shared_data/videos/video2/metadatas.json'
64
- vid1_metadata = load_json_file(vid1_metadata_path)
65
- vid2_metadata = load_json_file(vid2_metadata_path)
66
-
67
- # collect transcripts and image paths
68
- vid1_trans = [vid['transcript'] for vid in vid1_metadata]
69
- vid1_img_path = [vid['extracted_frame_path'] for vid in vid1_metadata]
70
-
71
- vid2_trans = [vid['transcript'] for vid in vid2_metadata]
72
- vid2_img_path = [vid['extracted_frame_path'] for vid in vid2_metadata]
73
-
74
-
75
- # for video1, we pick n = 7
76
- n = 7
77
- updated_vid1_trans = [
78
- ' '.join(vid1_trans[i-int(n/2) : i+int(n/2)]) if i-int(n/2) >= 0 else
79
- ' '.join(vid1_trans[0 : i + int(n/2)]) for i in range(len(vid1_trans))
80
- ]
81
-
82
- # also need to update the updated transcripts in metadata
83
- for i in range(len(updated_vid1_trans)):
84
- vid1_metadata[i]['transcript'] = updated_vid1_trans[i]
85
-
86
- # you can pass in mode="append"
87
- # to add more entries to the vector store
88
- # in case you want to start with a fresh vector store,
89
- # you can pass in mode="overwrite" instead
90
-
91
- _ = MultimodalLanceDB.from_text_image_pairs(
92
- texts=updated_vid1_trans+vid2_trans,
93
- image_paths=vid1_img_path+vid2_img_path,
94
- embedding=embedder,
95
- metadatas=vid1_metadata+vid2_metadata,
96
- connection=db,
97
- table_name=TBL_NAME,
98
- mode="overwrite",
99
- )
100
-
101
- if __name__ == "__main__":
102
- tbl = db.open_table(TBL_NAME)
103
- print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
104
- #display the first 3 rows of the table
105
  return_top_k_most_similar_docs()
 
1
+ from mm_rag.embeddings.bridgetower_embeddings import (
2
+ BridgeTowerEmbeddings
3
+ )
4
+ from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
5
+ import lancedb
6
+ import json
7
+ import os
8
+ from PIL import Image
9
+ from utility import load_json_file, display_retrieved_results
10
+ import pyarrow as pa
11
+
12
+ # declare host file
13
+ LANCEDB_HOST_FILE = "./shared_data/.lancedb"
14
+ # declare table name
15
+ TBL_NAME = "test_tbl"
16
+ # initialize vectorstore
17
+ db = lancedb.connect(LANCEDB_HOST_FILE)
18
+ # initialize an BridgeTower embedder
19
+ embedder = BridgeTowerEmbeddings()
20
+
21
+
22
+ def return_top_k_most_similar_docs(max_docs=3):
23
+ # ask to return top 3 most similar documents
24
+ # Creating a LanceDB vector store
25
+ vectorstore = MultimodalLanceDB(
26
+ uri=LANCEDB_HOST_FILE,
27
+ embedding=embedder,
28
+ table_name=TBL_NAME)
29
+
30
+ # creating a retriever for the vector store
31
+ # search_type="similarity"
32
+ # declares that the type of search that the Retriever should perform
33
+ # is similarity search
34
+ # search_kwargs={"k": 1} means returning top-1 most similar document
35
+
36
+
37
+ retriever = vectorstore.as_retriever(
38
+ search_type='similarity',
39
+ search_kwargs={"k": max_docs})
40
+ query2 = (
41
+ "an astronaut's spacewalk "
42
+ "with an amazing view of the earth from space behind"
43
+ )
44
+ results2 = retriever.invoke(query2)
45
+ display_retrieved_results(results2)
46
+ query3 = "a group of astronauts"
47
+ results3 = retriever.invoke(query3)
48
+ display_retrieved_results(results3)
49
+
50
+
51
+ def open_table(TBL_NAME):
52
+ # open a connection to table TBL_NAME
53
+ tbl = db.open_table()
54
+
55
+ print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
56
+ # display the first 3 rows of the table
57
+ tbl.to_pandas()[['text', 'image_path']].head(3)
58
+
59
+ def store_in_rag():
60
+
61
+ # load metadata files
62
+ vid1_metadata_path = './shared_data/videos/video1/metadatas.json'
63
+ vid2_metadata_path = './shared_data/videos/video2/metadatas.json'
64
+ vid1_metadata = load_json_file(vid1_metadata_path)
65
+ vid2_metadata = load_json_file(vid2_metadata_path)
66
+
67
+ # collect transcripts and image paths
68
+ vid1_trans = [vid['transcript'] for vid in vid1_metadata]
69
+ vid1_img_path = [vid['extracted_frame_path'] for vid in vid1_metadata]
70
+
71
+ vid2_trans = [vid['transcript'] for vid in vid2_metadata]
72
+ vid2_img_path = [vid['extracted_frame_path'] for vid in vid2_metadata]
73
+
74
+
75
+ # for video1, we pick n = 7
76
+ n = 7
77
+ updated_vid1_trans = [
78
+ ' '.join(vid1_trans[i-int(n/2) : i+int(n/2)]) if i-int(n/2) >= 0 else
79
+ ' '.join(vid1_trans[0 : i + int(n/2)]) for i in range(len(vid1_trans))
80
+ ]
81
+
82
+ # also need to update the updated transcripts in metadata
83
+ for i in range(len(updated_vid1_trans)):
84
+ vid1_metadata[i]['transcript'] = updated_vid1_trans[i]
85
+
86
+ # you can pass in mode="append"
87
+ # to add more entries to the vector store
88
+ # in case you want to start with a fresh vector store,
89
+ # you can pass in mode="overwrite" instead
90
+
91
+ _ = MultimodalLanceDB.from_text_image_pairs(
92
+ texts=updated_vid1_trans+vid2_trans,
93
+ image_paths=vid1_img_path+vid2_img_path,
94
+ embedding=embedder,
95
+ metadatas=vid1_metadata+vid2_metadata,
96
+ connection=db,
97
+ table_name=TBL_NAME,
98
+ mode="overwrite",
99
+ )
100
+
101
+ if __name__ == "__main__":
102
+ tbl = db.open_table(TBL_NAME)
103
+ print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
104
+ #display the first 3 rows of the table
105
  return_top_k_most_similar_docs()
shared_data/videos/yt_video/blackholes101nationalgeographic/blackholes101nationalgeographic.mp4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9f0d499a1b09e47d6f1e382e3be6666b6c268276f16abd84a680a7eb512b1a0
3
- size 8783737
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e61dc16ee65a8a0ef08316bfb4d8f7110c7f5186d298a7e28e1cafb3bb25c338
3
+ size 132
shared_data/videos/yt_video/blackholes101nationalgeographic/captions.vtt CHANGED
@@ -1,104 +1,104 @@
1
- WEBVTT
2
-
3
- 00:00.000 --> 00:08.760
4
- Black holes are among the most fascinating objects in our universe, and also the most
5
-
6
- 00:08.760 --> 00:13.520
7
- mysterious.
8
-
9
- 00:13.520 --> 00:19.040
10
- A black hole is a region in space where the force of gravity is so strong, not even light,
11
-
12
- 00:19.040 --> 00:23.200
13
- the fastest known entity in our universe can escape.
14
-
15
- 00:23.200 --> 00:28.680
16
- The boundary of a black hole is called the event horizon, a point of no return beyond
17
-
18
- 00:28.680 --> 00:31.840
19
- which we truly cannot see.
20
-
21
- 00:31.840 --> 00:37.040
22
- When something crosses the event horizon, it collapses into the black hole's singularity,
23
-
24
- 00:37.040 --> 00:42.400
25
- an infinitely small, infinitely dense point where space, time, and the laws of physics
26
-
27
- 00:42.400 --> 00:46.200
28
- no longer apply.
29
-
30
- 00:46.200 --> 00:51.400
31
- Scientists have theorized several different types of black holes, with stellar and supermassive
32
-
33
- 00:51.400 --> 00:54.280
34
- black holes being the most common.
35
-
36
- 00:54.280 --> 00:58.640
37
- Stolar black holes form when massive stars die and collapse.
38
-
39
- 00:58.640 --> 01:05.080
40
- They're roughly 10 to 20 times the mass of our sun, and scattered throughout the universe.
41
-
42
- 01:05.080 --> 01:11.040
43
- There could be millions of these stellar black holes in the Milky Way alone.
44
-
45
- 01:11.040 --> 01:16.440
46
- Supermassive black holes are giants by comparison, measuring millions, even billions of times
47
-
48
- 01:16.440 --> 01:19.440
49
- more massive than our sun.
50
-
51
- 01:19.440 --> 01:23.800
52
- Scientists can only guess how they form, but we do know they exist at the center of just
53
-
54
- 01:23.800 --> 01:28.920
55
- about every large galaxy, including our own.
56
-
57
- 01:28.920 --> 01:33.760
58
- Sagittarius A, the supermassive black hole at the center of the Milky Way, has a mass
59
-
60
- 01:33.760 --> 01:39.360
61
- of roughly four million suns, and has a diameter about the distance between the Earth and our
62
-
63
- 01:39.360 --> 01:41.960
64
- sun.
65
-
66
- 01:41.960 --> 01:46.680
67
- Because black holes are invisible, the only way for scientists to detect and study them
68
-
69
- 01:46.680 --> 01:50.040
70
- is to observe their effect on nearby matter.
71
-
72
- 01:50.040 --> 01:55.360
73
- This includes accretion disks, a disk of particles that form when gases and dust fall toward a
74
-
75
- 01:55.360 --> 02:03.920
76
- black hole, and quasars, jets of particles that blast out of supermassive black holes.
77
-
78
- 02:03.920 --> 02:08.720
79
- Black holes remained largely unknown until the 20th century.
80
-
81
- 02:08.720 --> 02:14.840
82
- In 1916, using Einstein's General Theory of Relativity, a German physicist named Karl
83
-
84
- 02:14.840 --> 02:20.280
85
- Schwartzschild calculated that any mass could become a black hole if it were compressed tightly
86
-
87
- 02:20.280 --> 02:22.640
88
- enough.
89
-
90
- 02:22.640 --> 02:27.480
91
- But it wasn't until 1971 when theory became reality.
92
-
93
- 02:27.480 --> 02:34.000
94
- Astronomers, studying the constellation Cygnus, discovered the first black hole.
95
-
96
- 02:34.000 --> 02:39.440
97
- An untold number of black holes are scattered throughout the universe, constantly warping
98
-
99
- 02:39.440 --> 02:45.600
100
- space and time, altering entire galaxies, and endlessly inspiring both scientists and
101
-
102
- 02:45.600 --> 02:47.120
103
- our collective imagination.
104
-
 
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:08.760
4
+ Black holes are among the most fascinating objects in our universe, and also the most
5
+
6
+ 00:08.760 --> 00:13.520
7
+ mysterious.
8
+
9
+ 00:13.520 --> 00:19.040
10
+ A black hole is a region in space where the force of gravity is so strong, not even light,
11
+
12
+ 00:19.040 --> 00:23.200
13
+ the fastest known entity in our universe can escape.
14
+
15
+ 00:23.200 --> 00:28.680
16
+ The boundary of a black hole is called the event horizon, a point of no return beyond
17
+
18
+ 00:28.680 --> 00:31.840
19
+ which we truly cannot see.
20
+
21
+ 00:31.840 --> 00:37.040
22
+ When something crosses the event horizon, it collapses into the black hole's singularity,
23
+
24
+ 00:37.040 --> 00:42.400
25
+ an infinitely small, infinitely dense point where space, time, and the laws of physics
26
+
27
+ 00:42.400 --> 00:46.200
28
+ no longer apply.
29
+
30
+ 00:46.200 --> 00:51.400
31
+ Scientists have theorized several different types of black holes, with stellar and supermassive
32
+
33
+ 00:51.400 --> 00:54.280
34
+ black holes being the most common.
35
+
36
+ 00:54.280 --> 00:58.640
37
+ Stolar black holes form when massive stars die and collapse.
38
+
39
+ 00:58.640 --> 01:05.080
40
+ They're roughly 10 to 20 times the mass of our sun, and scattered throughout the universe.
41
+
42
+ 01:05.080 --> 01:11.040
43
+ There could be millions of these stellar black holes in the Milky Way alone.
44
+
45
+ 01:11.040 --> 01:16.440
46
+ Supermassive black holes are giants by comparison, measuring millions, even billions of times
47
+
48
+ 01:16.440 --> 01:19.440
49
+ more massive than our sun.
50
+
51
+ 01:19.440 --> 01:23.800
52
+ Scientists can only guess how they form, but we do know they exist at the center of just
53
+
54
+ 01:23.800 --> 01:28.920
55
+ about every large galaxy, including our own.
56
+
57
+ 01:28.920 --> 01:33.760
58
+ Sagittarius A, the supermassive black hole at the center of the Milky Way, has a mass
59
+
60
+ 01:33.760 --> 01:39.360
61
+ of roughly four million suns, and has a diameter about the distance between the Earth and our
62
+
63
+ 01:39.360 --> 01:41.960
64
+ sun.
65
+
66
+ 01:41.960 --> 01:46.680
67
+ Because black holes are invisible, the only way for scientists to detect and study them
68
+
69
+ 01:46.680 --> 01:50.040
70
+ is to observe their effect on nearby matter.
71
+
72
+ 01:50.040 --> 01:55.360
73
+ This includes accretion disks, a disk of particles that form when gases and dust fall toward a
74
+
75
+ 01:55.360 --> 02:03.920
76
+ black hole, and quasars, jets of particles that blast out of supermassive black holes.
77
+
78
+ 02:03.920 --> 02:08.720
79
+ Black holes remained largely unknown until the 20th century.
80
+
81
+ 02:08.720 --> 02:14.840
82
+ In 1916, using Einstein's General Theory of Relativity, a German physicist named Karl
83
+
84
+ 02:14.840 --> 02:20.280
85
+ Schwartzschild calculated that any mass could become a black hole if it were compressed tightly
86
+
87
+ 02:20.280 --> 02:22.640
88
+ enough.
89
+
90
+ 02:22.640 --> 02:27.480
91
+ But it wasn't until 1971 when theory became reality.
92
+
93
+ 02:27.480 --> 02:34.000
94
+ Astronomers, studying the constellation Cygnus, discovered the first black hole.
95
+
96
+ 02:34.000 --> 02:39.440
97
+ An untold number of black holes are scattered throughout the universe, constantly warping
98
+
99
+ 02:39.440 --> 02:45.600
100
+ space and time, altering entire galaxies, and endlessly inspiring both scientists and
101
+
102
+ 02:45.600 --> 02:47.120
103
+ our collective imagination.
104
+
utility.py CHANGED
@@ -1,764 +1,764 @@
1
- # Add your utilities or helper functions to this file.
2
-
3
- import os
4
- from pathlib import Path
5
- from dotenv import load_dotenv, find_dotenv
6
- from io import StringIO, BytesIO
7
- import textwrap
8
- from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
9
- from enum import auto, Enum
10
- import base64
11
- import glob
12
- from moviepy import VideoFileClip
13
- import requests
14
- from tqdm import tqdm
15
- from pytubefix import YouTube, Stream
16
- import webvtt
17
- import whisper
18
- from youtube_transcript_api import YouTubeTranscriptApi
19
- from youtube_transcript_api.formatters import WebVTTFormatter
20
- from predictionguard import PredictionGuard
21
- import cv2
22
- import re
23
- import json
24
- import PIL
25
- from ollama import chat
26
- from ollama import ChatResponse
27
- from PIL import Image
28
- import dataclasses
29
- import random
30
- from datasets import load_dataset
31
- from os import path as osp
32
- from IPython.display import display
33
- from langchain_core.prompt_values import PromptValue
34
- from langchain_core.messages import (
35
- MessageLikeRepresentation,
36
- )
37
- from transformers import pipeline
38
- from huggingface_hub import InferenceClient
39
-
40
- MultimodalModelInput = Union[PromptValue, str, Sequence[MessageLikeRepresentation], Dict[str, Any]]
41
-
42
- def get_from_dict_or_env(
43
- data: Dict[str, Any], key: str, env_key: str, default: Optional[str] = None
44
- ) -> str:
45
- """Get a value from a dictionary or an environment variable."""
46
- if key in data and data[key]:
47
- return data[key]
48
- else:
49
- return get_from_env(key, env_key, default=default)
50
-
51
- def get_from_env(key: str, env_key: str, default: Optional[str] = None) -> str:
52
- """Get a value from a dictionary or an environment variable."""
53
- if env_key in os.environ and os.environ[env_key]:
54
- return os.environ[env_key]
55
- else:
56
- return default
57
-
58
- def load_env():
59
- _ = load_dotenv(find_dotenv())
60
-
61
- def get_openai_api_key():
62
- load_env()
63
- openai_api_key = os.getenv("OPENAI_API_KEY")
64
- return openai_api_key
65
-
66
- def get_prediction_guard_api_key():
67
- load_env()
68
- PREDICTION_GUARD_API_KEY = os.getenv("PREDICTION_GUARD_API_KEY", None)
69
- if PREDICTION_GUARD_API_KEY is None:
70
- PREDICTION_GUARD_API_KEY = input("Please enter your Prediction Guard API Key: ")
71
- return PREDICTION_GUARD_API_KEY
72
-
73
- PREDICTION_GUARD_URL_ENDPOINT = os.getenv("DLAI_PREDICTION_GUARD_URL_ENDPOINT", "https://dl-itdc.predictionguard.com") ###"https://proxy-dl-itdc.predictionguard.com"
74
-
75
- # prompt templates
76
- templates = [
77
- 'a picture of {}',
78
- 'an image of {}',
79
- 'a nice {}',
80
- 'a beautiful {}',
81
- ]
82
-
83
- # function helps to prepare list image-text pairs from the first [test_size] data of a Huggingface dataset
84
- def prepare_dataset_for_umap_visualization(hf_dataset, class_name, templates=templates, test_size=1000):
85
- # load Huggingface dataset (download if needed)
86
- dataset = load_dataset(hf_dataset, trust_remote_code=True)
87
- # split dataset with specific test_size
88
- train_test_dataset = dataset['train'].train_test_split(test_size=test_size)
89
- # get the test dataset
90
- test_dataset = train_test_dataset['test']
91
- img_txt_pairs = []
92
- for i in range(len(test_dataset)):
93
- img_txt_pairs.append({
94
- 'caption' : templates[random.randint(0, len(templates)-1)].format(class_name),
95
- 'pil_img' : test_dataset[i]['image']
96
- })
97
- return img_txt_pairs
98
-
99
-
100
- def download_video(video_url, path):
101
- print(f'Getting video information for {video_url}')
102
-
103
- def progress_callback(stream: Stream, data_chunk: bytes, bytes_remaining: int) -> None:
104
- pbar.update(len(data_chunk))
105
- stream = None
106
- try:
107
- yt = YouTube(video_url, on_progress_callback=progress_callback)
108
- stream = yt.streams.filter(progressive=True, file_extension='mp4', res='480p').desc().first()
109
- if stream is None:
110
- stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
111
- except Exception as e:
112
- print(f"Youtube Exception Occured.Loading from local resource: {e}")
113
-
114
- uncleaned_filename = stream.default_filename.replace(' ', '').lower() if stream else "blackholes101nationalgeographic.mp4"
115
- print(f'Uncleaned filename: {uncleaned_filename}')
116
- filename= re.sub(r'[^a-zA-Z0-9]', '', uncleaned_filename).replace('mp4', '')
117
- filename_without_extension = os.path.splitext(filename)[0]
118
- filename_with_extension = filename+'.mp4'
119
- folder_path = os.path.join(path, filename_without_extension)
120
- print(f'Checking the folder path {folder_path}')
121
- full_file_path = os.path.join(folder_path, filename_with_extension)
122
-
123
- if not os.path.exists(folder_path):
124
- os.makedirs(folder_path, exist_ok=True)
125
-
126
- if os.path.exists(full_file_path):
127
- print('Video already downloaded at the folder path', full_file_path)
128
- is_downloaded = False
129
- return full_file_path, folder_path, is_downloaded
130
-
131
-
132
- is_downloaded = True
133
-
134
- print('Downloading video from YouTube...')
135
- pbar = tqdm(desc='Downloading video from YouTube', total=stream.filesize, unit="bytes")
136
- stream.download(folder_path, filename=filename_with_extension)
137
- pbar.close()
138
- return full_file_path, folder_path, is_downloaded
139
-
140
- def get_video_id_from_url(video_url):
141
- """
142
- Examples:
143
- - http://youtu.be/SA2iWivDJiE
144
- - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
145
- - http://www.youtube.com/embed/SA2iWivDJiE
146
- - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
147
- """
148
- import urllib.parse
149
- url = urllib.parse.urlparse(video_url)
150
- if url.hostname == 'youtu.be':
151
- return url.path[1:]
152
- if url.hostname in ('www.youtube.com', 'youtube.com'):
153
- if url.path == '/watch':
154
- p = urllib.parse.parse_qs(url.query)
155
- return p['v'][0]
156
- if url.path[:7] == '/embed/':
157
- return url.path.split('/')[2]
158
- if url.path[:3] == '/v/':
159
- return url.path.split('/')[2]
160
-
161
- return video_url
162
-
163
- def generate_transcript_vtt(vid_dir, vid_filepath):
164
- print("Generating transcript for video ", vid_filepath)
165
- # declare where to save .mp3 audio
166
- path_to_extracted_audio_file = os.path.join(vid_dir, 'audio.mp3')
167
-
168
- # extract mp3 audio file from mp4 video video file
169
- path_to_video_no_transcript = vid_filepath
170
- clip = VideoFileClip(path_to_video_no_transcript)
171
- clip.audio.write_audiofile(path_to_extracted_audio_file)
172
-
173
- model = whisper.load_model("small")
174
- options = dict(task="translate", best_of=1, language='en')
175
- results = model.transcribe(path_to_extracted_audio_file, **options)
176
-
177
- vtt = getSubs(results["segments"], "vtt")
178
-
179
- # path to save generated transcript of video1
180
- path_to_generated_trans = osp.join(vid_dir, 'captions.vtt')
181
- # write transcription to file
182
- with open(path_to_generated_trans, 'w') as f:
183
- f.write(vtt)
184
- return path_to_generated_trans
185
-
186
-
187
- # if this has transcript then download
188
- def get_transcript_vtt(path, video_url, vid_file_path, from_gen=False):
189
- if from_gen:
190
- return generate_transcript_vtt(path,vid_file_path)
191
- video_id = get_video_id_from_url(video_url)
192
- filepath = os.path.join(path,'captions.vtt')
193
- if os.path.exists(filepath):
194
- print('Transcript already exists')
195
- return filepath
196
-
197
- print('Downloading Transcript...')
198
-
199
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en-GB', 'en'])
200
- formatter = WebVTTFormatter()
201
- webvtt_formatted = formatter.format_transcript(transcript)
202
-
203
- with open(filepath, 'w', encoding='utf-8') as webvtt_file:
204
- webvtt_file.write(webvtt_formatted)
205
- webvtt_file.close()
206
-
207
- return filepath
208
-
209
-
210
- # helper function for convert time in second to time format for .vtt or .srt file
211
- def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
212
- assert seconds >= 0, "non-negative timestamp expected"
213
- milliseconds = round(seconds * 1000.0)
214
-
215
- hours = milliseconds // 3_600_000
216
- milliseconds -= hours * 3_600_000
217
-
218
- minutes = milliseconds // 60_000
219
- milliseconds -= minutes * 60_000
220
-
221
- seconds = milliseconds // 1_000
222
- milliseconds -= seconds * 1_000
223
-
224
- hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
225
- return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"
226
-
227
- # a help function that helps to convert a specific time written as a string in format `webvtt` into a time in miliseconds
228
- def str2time(strtime):
229
- # strip character " if exists
230
- strtime = strtime.strip('"')
231
- # get hour, minute, second from time string
232
- hrs, mins, seconds = [float(c) for c in strtime.split(':')]
233
- # get the corresponding time as total seconds
234
- total_seconds = hrs * 60**2 + mins * 60 + seconds
235
- total_miliseconds = total_seconds * 1000
236
- return total_miliseconds
237
-
238
- def _processText(text: str, maxLineWidth=None):
239
- if (maxLineWidth is None or maxLineWidth < 0):
240
- return text
241
-
242
- lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
243
- return '\n'.join(lines)
244
-
245
- # Resizes a image and maintains aspect ratio
246
- def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
247
- # Grab the image size and initialize dimensions
248
- dim = None
249
- (h, w) = image.shape[:2]
250
-
251
- # Return original image if no need to resize
252
- if width is None and height is None:
253
- return image
254
-
255
- # We are resizing height if width is none
256
- if width is None:
257
- # Calculate the ratio of the height and construct the dimensions
258
- r = height / float(h)
259
- dim = (int(w * r), height)
260
- # We are resizing width if height is none
261
- else:
262
- # Calculate the ratio of the width and construct the dimensions
263
- r = width / float(w)
264
- dim = (width, int(h * r))
265
-
266
- # Return the resized image
267
- return cv2.resize(image, dim, interpolation=inter)
268
-
269
- # helper function to convert transcripts generated by whisper to .vtt file
270
- def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
271
- print("WEBVTT\n", file=file)
272
- for segment in transcript:
273
- text = _processText(segment['text'], maxLineWidth).replace('-->', '->')
274
-
275
- print(
276
- f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
277
- f"{text}\n",
278
- file=file,
279
- flush=True,
280
- )
281
-
282
- # helper function to convert transcripts generated by whisper to .srt file
283
- def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
284
- """
285
- Write a transcript to a file in SRT format.
286
- Example usage:
287
- from pathlib import Path
288
- from whisper.utils import write_srt
289
- import requests
290
- result = transcribe(model, audio_path, temperature=temperature, **args)
291
- # save SRT
292
- audio_basename = Path(audio_path).stem
293
- with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
294
- write_srt(result["segments"], file=srt)
295
- """
296
- for i, segment in enumerate(transcript, start=1):
297
- text = _processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')
298
-
299
- # write srt lines
300
- print(
301
- f"{i}\n"
302
- f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
303
- f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
304
- f"{text}\n",
305
- file=file,
306
- flush=True,
307
- )
308
-
309
- def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
310
- segmentStream = StringIO()
311
-
312
- if format == 'vtt':
313
- write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
314
- elif format == 'srt':
315
- write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
316
- else:
317
- raise Exception("Unknown format " + format)
318
-
319
- segmentStream.seek(0)
320
- return segmentStream.read()
321
-
322
- # encoding image at given path or PIL Image using base64
323
- def encode_image(image_path_or_PIL_img):
324
- if isinstance(image_path_or_PIL_img, PIL.Image.Image):
325
- # this is a PIL image
326
- buffered = BytesIO()
327
- image_path_or_PIL_img.save(buffered, format="JPEG")
328
- return base64.b64encode(buffered.getvalue()).decode('utf-8')
329
- else:
330
- # this is a image_path
331
- with open(image_path_or_PIL_img, "rb") as image_file:
332
- return base64.b64encode(image_file.read()).decode('utf-8')
333
-
334
- # checking whether the given string is base64 or not
335
- def isBase64(sb):
336
- try:
337
- if isinstance(sb, str):
338
- # If there's any unicode here, an exception will be thrown and the function will return false
339
- sb_bytes = bytes(sb, 'ascii')
340
- elif isinstance(sb, bytes):
341
- sb_bytes = sb
342
- else:
343
- raise ValueError("Argument must be string or bytes")
344
- return base64.b64encode(base64.b64decode(sb_bytes)) == sb_bytes
345
- except Exception:
346
- return False
347
-
348
- def encode_image_from_path_or_url(image_path_or_url):
349
- try:
350
- # try to open the url to check valid url
351
- f = urlopen(image_path_or_url)
352
- # if this is an url
353
- return base64.b64encode(requests.get(image_path_or_url).content).decode('utf-8')
354
- except:
355
- # this is a path to image
356
- with open(image_path_or_url, "rb") as image_file:
357
- return base64.b64encode(image_file.read()).decode('utf-8')
358
-
359
- # helper function to compute the joint embedding of a prompt and a base64-encoded image through PredictionGuard
360
- def bt_embedding_from_prediction_guard(prompt, base64_image):
361
- # get PredictionGuard client
362
- client = _getPredictionGuardClient()
363
- message = {"text": prompt,}
364
- if base64_image is not None and base64_image != "":
365
- if not isBase64(base64_image):
366
- raise TypeError("image input must be in base64 encoding!")
367
- message['image'] = base64_image
368
- response = client.embeddings.create(
369
- model="bridgetower-large-itm-mlm-itc",
370
- input=[message]
371
- )
372
- return response['data'][0]['embedding']
373
-
374
-
375
- def load_json_file(file_path):
376
- # Open the JSON file in read mode
377
- with open(file_path, 'r') as file:
378
- data = json.load(file)
379
- return data
380
-
381
- def display_retrieved_results(results):
382
- print(f'There is/are {len(results)} retrieved result(s)')
383
- print()
384
- for i, res in enumerate(results):
385
- print(f'The caption of the {str(i+1)}-th retrieved result is:\n"{results[i].page_content}"')
386
- print()
387
- print(results[i])
388
- #display(Image.open(results[i].metadata['metadata']['extracted_frame_path']))
389
- print("------------------------------------------------------------")
390
-
391
- class SeparatorStyle(Enum):
392
- """Different separator style."""
393
- SINGLE = auto()
394
-
395
- @dataclasses.dataclass
396
- class Conversation:
397
- """A class that keeps all conversation history"""
398
- system: str
399
- roles: List[str]
400
- messages: List[List[str]]
401
- map_roles: Dict[str, str]
402
- version: str = "Unknown"
403
- sep_style: SeparatorStyle = SeparatorStyle.SINGLE
404
- sep: str = "\n"
405
-
406
- def _get_prompt_role(self, role):
407
- if self.map_roles is not None and role in self.map_roles.keys():
408
- return self.map_roles[role]
409
- else:
410
- return role
411
-
412
- def _build_content_for_first_message_in_conversation(self, first_message: List[str]):
413
- content = []
414
- if len(first_message) != 2:
415
- raise TypeError("First message in Conversation needs to include a prompt and a base64-enconded image!")
416
-
417
- prompt, b64_image = first_message[0], first_message[1]
418
-
419
- # handling prompt
420
- if prompt is None:
421
- raise TypeError("API does not support None prompt yet")
422
- content.append({
423
- "type": "text",
424
- "text": prompt
425
- })
426
- if b64_image is None:
427
- raise TypeError("API does not support text only conversation yet")
428
-
429
- # handling image
430
- if not isBase64(b64_image):
431
- raise TypeError("Image in Conversation's first message must be stored under base64 encoding!")
432
-
433
- content.append({
434
- "type": "image_url",
435
- "image_url": {
436
- "url": b64_image,
437
- }
438
- })
439
- return content
440
-
441
- def _build_content_for_follow_up_messages_in_conversation(self, follow_up_message: List[str]):
442
-
443
- if follow_up_message is not None and len(follow_up_message) > 1:
444
- raise TypeError("Follow-up message in Conversation must not include an image!")
445
-
446
- # handling text prompt
447
- if follow_up_message is None or follow_up_message[0] is None:
448
- raise TypeError("Follow-up message in Conversation must include exactly one text message")
449
-
450
- text = follow_up_message[0]
451
- return text
452
-
453
- def get_message(self):
454
- messages = self.messages
455
- api_messages = []
456
- for i, msg in enumerate(messages):
457
- role, message_content = msg
458
- if i == 0:
459
- # get content for very first message in conversation
460
- content = self._build_content_for_first_message_in_conversation(message_content)
461
- else:
462
- # get content for follow-up message in conversation
463
- content = self._build_content_for_follow_up_messages_in_conversation(message_content)
464
-
465
- api_messages.append({
466
- "role": role,
467
- "content": content,
468
- })
469
- return api_messages
470
-
471
- # this method helps represent a multi-turn chat into as a single turn chat format
472
- def serialize_messages(self):
473
- messages = self.messages
474
- ret = ""
475
- if self.sep_style == SeparatorStyle.SINGLE:
476
- if self.system is not None and self.system != "":
477
- ret = self.system + self.sep
478
- for i, (role, message) in enumerate(messages):
479
- role = self._get_prompt_role(role)
480
- if message:
481
- if isinstance(message, List):
482
- # get prompt only
483
- message = message[0]
484
- if i == 0:
485
- # do not include role at the beginning
486
- ret += message
487
- else:
488
- ret += role + ": " + message
489
- if i < len(messages) - 1:
490
- # avoid including sep at the end of serialized message
491
- ret += self.sep
492
- else:
493
- ret += role + ":"
494
- else:
495
- raise ValueError(f"Invalid style: {self.sep_style}")
496
-
497
- return ret
498
-
499
- def append_message(self, role, message):
500
- if len(self.messages) == 0:
501
- # data verification for the very first message
502
- assert role == self.roles[0], f"the very first message in conversation must be from role {self.roles[0]}"
503
- assert len(message) == 2, f"the very first message in conversation must include both prompt and an image"
504
- prompt, image = message[0], message[1]
505
- assert prompt is not None, f"prompt must be not None"
506
- assert isBase64(image), f"image must be under base64 encoding"
507
- else:
508
- # data verification for follow-up message
509
- assert role in self.roles, f"the follow-up message must be from one of the roles {self.roles}"
510
- assert len(message) == 1, f"the follow-up message must consist of one text message only, no image"
511
-
512
- self.messages.append([role, message])
513
-
514
- def copy(self):
515
- return Conversation(
516
- system=self.system,
517
- roles=self.roles,
518
- messages=[[x,y] for x, y in self.messages],
519
- version=self.version,
520
- map_roles=self.map_roles,
521
- )
522
-
523
- def dict(self):
524
- return {
525
- "system": self.system,
526
- "roles": self.roles,
527
- "messages": [[x, y[0] if len(y) == 1 else y] for x, y in self.messages],
528
- "version": self.version,
529
- }
530
-
531
- prediction_guard_llava_conv = Conversation(
532
- system="",
533
- roles=("user", "assistant"),
534
- messages=[],
535
- version="Prediction Guard LLaVA enpoint Conversation v0",
536
- sep_style=SeparatorStyle.SINGLE,
537
- map_roles={
538
- "user": "USER",
539
- "assistant": "ASSISTANT"
540
- }
541
- )
542
-
543
- # get PredictionGuard Client
544
- def _getPredictionGuardClient():
545
- PREDICTION_GUARD_API_KEY = get_prediction_guard_api_key()
546
- client = PredictionGuard(
547
- api_key=PREDICTION_GUARD_API_KEY,
548
- url=PREDICTION_GUARD_URL_ENDPOINT,
549
- )
550
- return client
551
-
552
- # helper function to call chat completion endpoint of PredictionGuard given a prompt and an image
553
- def lvlm_inference(prompt, image, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10):
554
- # prepare conversation
555
- conversation = prediction_guard_llava_conv.copy()
556
- conversation.append_message(conversation.roles[0], [prompt, image])
557
- return lvlm_inference_with_conversation(conversation, max_tokens=max_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
558
-
559
-
560
-
561
- def lvlm_inference_with_conversation(conversation, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10):
562
- # get PredictionGuard client
563
- client = _getPredictionGuardClient()
564
- # get message from conversation
565
- messages = conversation.get_message()
566
- # call chat completion endpoint at Grediction Guard
567
- response = client.chat.completions.create(
568
- model="llava-1.5-7b-hf",
569
- messages=messages,
570
- max_tokens=max_tokens,
571
- temperature=temperature,
572
- top_p=top_p,
573
- top_k=top_k,
574
- )
575
- return response['choices'][-1]['message']['content']
576
-
577
- def get_token():
578
- load_env()
579
- token = os.getenv("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
580
- if token is None:
581
- raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
582
- return token
583
-
584
-
585
- def lvlm_inference_with_phi(prompt):
586
-
587
-
588
- messages = [{"role": "user", "content": prompt}]
589
- client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=get_token())
590
- response = ''
591
- token = client.chat_completion(messages, max_tokens=256)
592
- response = token['choices'][0]['message']['content']
593
- return response
594
-
595
- def lvlm_inference_with_tiny_model(prompt):
596
- classifier = pipeline(
597
- "text-generation",
598
- model="microsoft/phi-2", # Only ~2.7GB
599
- device_map="auto",
600
- torch_dtype="auto",
601
- )
602
-
603
- response = classifier(
604
- prompt,
605
- max_new_tokens=512, # Remove max_length and use only max_new_tokens
606
- temperature=0.7,
607
- do_sample=True,
608
- num_return_sequences=1,
609
- truncation=True, # Add explicit truncation
610
- pad_token_id=classifier.tokenizer.eos_token_id,
611
- eos_token_id=classifier.tokenizer.eos_token_id,
612
- )[0]['generated_text']
613
-
614
- # Remove the input prompt from the response and clean up
615
- return response.replace(prompt, "").strip()
616
-
617
- # function `extract_and_save_frames_and_metadata``:
618
- # receives as input a video and its transcript
619
- # does extracting and saving frames and their metadatas
620
- # returns the extracted metadatas
621
- def extract_and_save_frames_and_metadata(
622
- path_to_video,
623
- path_to_transcript,
624
- path_to_save_extracted_frames,
625
- path_to_save_metadatas):
626
-
627
- # metadatas will store the metadata of all extracted frames
628
- metadatas = []
629
-
630
- # load video using cv2
631
- print(f"Loading video from {path_to_video}")
632
- video = cv2.VideoCapture(path_to_video)
633
- # load transcript using webvtt
634
- print(f"Loading transcript from {path_to_transcript}")
635
- trans = webvtt.read(path_to_transcript)
636
-
637
- # iterate transcript file
638
- # for each video segment specified in the transcript file
639
- for idx, transcript in enumerate(trans):
640
- # get the start time and end time in seconds
641
- start_time_ms = str2time(transcript.start)
642
- end_time_ms = str2time(transcript.end)
643
- # get the time in ms exactly
644
- # in the middle of start time and end time
645
- mid_time_ms = (end_time_ms + start_time_ms) / 2
646
- # get the transcript, remove the next-line symbol
647
- text = transcript.text.replace("\n", ' ')
648
- # get frame at the middle time
649
- video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
650
- print(f"Extracting frame at {mid_time_ms} ms")
651
- success, frame = video.read()
652
- if success:
653
- # if the frame is extracted successfully, resize it
654
- image = maintain_aspect_ratio_resize(frame, height=350)
655
- # save frame as JPEG file
656
- img_fname = f'frame_{idx}.jpg'
657
- img_fpath = osp.join(
658
- path_to_save_extracted_frames, img_fname
659
- )
660
- cv2.imwrite(img_fpath, image)
661
-
662
- # prepare the metadata
663
- metadata = {
664
- 'extracted_frame_path': img_fpath,
665
- 'transcript': text,
666
- 'video_segment_id': idx,
667
- 'video_path': path_to_video,
668
- 'mid_time_ms': mid_time_ms,
669
- }
670
- metadatas.append(metadata)
671
-
672
- else:
673
- print(f"ERROR! Cannot extract frame: idx = {idx}")
674
-
675
- # save metadata of all extracted frames
676
- fn = osp.join(path_to_save_metadatas, 'metadatas.json')
677
- with open(fn, 'w') as outfile:
678
- json.dump(metadatas, outfile)
679
- return metadatas
680
-
681
- def extract_meta_data(vid_dir, vid_filepath, vid_transcript_filepath):
682
- # output paths to save extracted frames and their metadata
683
- extracted_frames_path = osp.join(vid_dir, 'extracted_frame')
684
- metadatas_path = vid_dir
685
-
686
- # create these output folders if not existing
687
- print(f"Creating folders {extracted_frames_path} and {metadatas_path}")
688
- Path(extracted_frames_path).mkdir(parents=True, exist_ok=True)
689
- Path(metadatas_path).mkdir(parents=True, exist_ok=True)
690
- print("Extracting frames the video path ", vid_filepath)
691
-
692
- # call the function to extract frames and metadatas
693
- metadatas = extract_and_save_frames_and_metadata(
694
- vid_filepath,
695
- vid_transcript_filepath,
696
- extracted_frames_path,
697
- metadatas_path,
698
- )
699
- return metadatas
700
-
701
- # function extract_and_save_frames_and_metadata_with_fps
702
- # receives as input a video
703
- # does extracting and saving frames and their metadatas
704
- # returns the extracted metadatas
705
- def extract_and_save_frames_and_metadata_with_fps(
706
- lvlm_prompt,
707
- path_to_video,
708
- path_to_save_extracted_frames,
709
- path_to_save_metadatas,
710
- num_of_extracted_frames_per_second=1):
711
-
712
- # metadatas will store the metadata of all extracted frames
713
- metadatas = []
714
-
715
- # load video using cv2
716
- video = cv2.VideoCapture(path_to_video)
717
-
718
- # Get the frames per second
719
- fps = video.get(cv2.CAP_PROP_FPS)
720
- # Get hop = the number of frames pass before a frame is extracted
721
- hop = round(fps / num_of_extracted_frames_per_second)
722
- curr_frame = 0
723
- idx = -1
724
- while(True):
725
- # iterate all frames
726
- ret, frame = video.read()
727
- if not ret:
728
- break
729
- if curr_frame % hop == 0:
730
- idx = idx + 1
731
-
732
- # if the frame is extracted successfully, resize it
733
- image = maintain_aspect_ratio_resize(frame, height=350)
734
- # save frame as JPEG file
735
- img_fname = f'frame_{idx}.jpg'
736
- img_fpath = osp.join(
737
- path_to_save_extracted_frames,
738
- img_fname
739
- )
740
- cv2.imwrite(img_fpath, image)
741
-
742
- # generate caption using lvlm_inference
743
- b64_image = encode_image(img_fpath)
744
- caption = lvlm_inference(lvlm_prompt, b64_image)
745
-
746
- # prepare the metadata
747
- metadata = {
748
- 'extracted_frame_path': img_fpath,
749
- 'transcript': caption,
750
- 'video_segment_id': idx,
751
- 'video_path': path_to_video,
752
- }
753
- metadatas.append(metadata)
754
- curr_frame += 1
755
-
756
- # save metadata of all extracted frames
757
- metadatas_path = osp.join(path_to_save_metadatas,'metadatas.json')
758
- with open(metadatas_path, 'w') as outfile:
759
- json.dump(metadatas, outfile)
760
- return metadatas
761
-
762
- if __name__ == "__main__":
763
- res = lvlm_inference_with_phi("Tell me a story")
764
  print(res)
 
1
+ # Add your utilities or helper functions to this file.
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from dotenv import load_dotenv, find_dotenv
6
+ from io import StringIO, BytesIO
7
+ import textwrap
8
+ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
9
+ from enum import auto, Enum
10
+ import base64
11
+ import glob
12
+ from moviepy import VideoFileClip
13
+ import requests
14
+ from tqdm import tqdm
15
+ from pytubefix import YouTube, Stream
16
+ import webvtt
17
+ import whisper
18
+ from youtube_transcript_api import YouTubeTranscriptApi
19
+ from youtube_transcript_api.formatters import WebVTTFormatter
20
+ from predictionguard import PredictionGuard
21
+ import cv2
22
+ import re
23
+ import json
24
+ import PIL
25
+ from ollama import chat
26
+ from ollama import ChatResponse
27
+ from PIL import Image
28
+ import dataclasses
29
+ import random
30
+ from datasets import load_dataset
31
+ from os import path as osp
32
+ from IPython.display import display
33
+ from langchain_core.prompt_values import PromptValue
34
+ from langchain_core.messages import (
35
+ MessageLikeRepresentation,
36
+ )
37
+ from transformers import pipeline
38
+ from huggingface_hub import InferenceClient
39
+
40
+ MultimodalModelInput = Union[PromptValue, str, Sequence[MessageLikeRepresentation], Dict[str, Any]]
41
+
42
+ def get_from_dict_or_env(
43
+ data: Dict[str, Any], key: str, env_key: str, default: Optional[str] = None
44
+ ) -> str:
45
+ """Get a value from a dictionary or an environment variable."""
46
+ if key in data and data[key]:
47
+ return data[key]
48
+ else:
49
+ return get_from_env(key, env_key, default=default)
50
+
51
+ def get_from_env(key: str, env_key: str, default: Optional[str] = None) -> str:
52
+ """Get a value from a dictionary or an environment variable."""
53
+ if env_key in os.environ and os.environ[env_key]:
54
+ return os.environ[env_key]
55
+ else:
56
+ return default
57
+
58
+ def load_env():
59
+ _ = load_dotenv(find_dotenv())
60
+
61
+ def get_openai_api_key():
62
+ load_env()
63
+ openai_api_key = os.getenv("OPENAI_API_KEY")
64
+ return openai_api_key
65
+
66
+ def get_prediction_guard_api_key():
67
+ load_env()
68
+ PREDICTION_GUARD_API_KEY = os.getenv("PREDICTION_GUARD_API_KEY", None)
69
+ if PREDICTION_GUARD_API_KEY is None:
70
+ PREDICTION_GUARD_API_KEY = input("Please enter your Prediction Guard API Key: ")
71
+ return PREDICTION_GUARD_API_KEY
72
+
73
+ PREDICTION_GUARD_URL_ENDPOINT = os.getenv("DLAI_PREDICTION_GUARD_URL_ENDPOINT", "https://dl-itdc.predictionguard.com") ###"https://proxy-dl-itdc.predictionguard.com"
74
+
75
+ # prompt templates
76
+ templates = [
77
+ 'a picture of {}',
78
+ 'an image of {}',
79
+ 'a nice {}',
80
+ 'a beautiful {}',
81
+ ]
82
+
83
+ # helper function to prepare a list of image-text pairs from a [test_size]-sample split of a Hugging Face dataset
84
+ def prepare_dataset_for_umap_visualization(hf_dataset, class_name, templates=templates, test_size=1000):
85
+ # load Huggingface dataset (download if needed)
86
+ dataset = load_dataset(hf_dataset, trust_remote_code=True)
87
+ # split dataset with specific test_size
88
+ train_test_dataset = dataset['train'].train_test_split(test_size=test_size)
89
+ # get the test dataset
90
+ test_dataset = train_test_dataset['test']
91
+ img_txt_pairs = []
92
+ for i in range(len(test_dataset)):
93
+ img_txt_pairs.append({
94
+ 'caption' : templates[random.randint(0, len(templates)-1)].format(class_name),
95
+ 'pil_img' : test_dataset[i]['image']
96
+ })
97
+ return img_txt_pairs
98
+
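A minimal usage sketch for `prepare_dataset_for_umap_visualization`, assuming the helpers above are in scope; the dataset id below is a placeholder and the call assumes a Hugging Face dataset whose samples expose an 'image' column.

# hypothetical example: build 50 image-text pairs for the class "cat"
pairs = prepare_dataset_for_umap_visualization(
    "some-org/some-image-dataset",  # placeholder dataset id, not a real repo
    class_name="cat",
    test_size=50,
)
print(len(pairs), pairs[0]['caption'])
print(pairs[0]['pil_img'].size)  # each entry holds a PIL image and a templated caption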
99
+
100
+ def download_video(video_url, path):
101
+ print(f'Getting video information for {video_url}')
102
+
103
+ def progress_callback(stream: Stream, data_chunk: bytes, bytes_remaining: int) -> None:
104
+ pbar.update(len(data_chunk))
105
+ stream = None
106
+ try:
107
+ yt = YouTube(video_url, on_progress_callback=progress_callback)
108
+ stream = yt.streams.filter(progressive=True, file_extension='mp4', res='480p').desc().first()
109
+ if stream is None:
110
+ stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
111
+ except Exception as e:
112
+ print(f"Youtube Exception Occured.Loading from local resource: {e}")
113
+
114
+ uncleaned_filename = stream.default_filename.replace(' ', '').lower() if stream else "blackholes101nationalgeographic.mp4"
115
+ print(f'Uncleaned filename: {uncleaned_filename}')
116
+ filename = re.sub(r'[^a-zA-Z0-9]', '', uncleaned_filename).replace('mp4', '')
117
+ filename_without_extension = os.path.splitext(filename)[0]
118
+ filename_with_extension = filename+'.mp4'
119
+ folder_path = os.path.join(path, filename_without_extension)
120
+ print(f'Checking the folder path {folder_path}')
121
+ full_file_path = os.path.join(folder_path, filename_with_extension)
122
+
123
+ if not os.path.exists(folder_path):
124
+ os.makedirs(folder_path, exist_ok=True)
125
+
126
+ if os.path.exists(full_file_path):
127
+ print('Video already downloaded at the folder path', full_file_path)
128
+ is_downloaded = False
129
+ return full_file_path, folder_path, is_downloaded
130
+
131
+
132
+ is_downloaded = True
133
+
134
+ print('Downloading video from YouTube...')
135
+ pbar = tqdm(desc='Downloading video from YouTube', total=stream.filesize, unit="bytes")
136
+ stream.download(folder_path, filename=filename_with_extension)
137
+ pbar.close()
138
+ return full_file_path, folder_path, is_downloaded
139
+
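A usage sketch for `download_video`; the URL and output folder are placeholders, and the call needs network access unless a previously downloaded copy already exists under that folder.

video_path, video_dir, downloaded = download_video(
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # placeholder URL
    "./shared_data",                                # placeholder output root
)
print(video_path, video_dir, downloaded)  # full file path, its folder, and whether a download happened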
140
+ def get_video_id_from_url(video_url):
141
+ """
142
+ Examples:
143
+ - http://youtu.be/SA2iWivDJiE
144
+ - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
145
+ - http://www.youtube.com/embed/SA2iWivDJiE
146
+ - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
147
+ """
148
+ import urllib.parse
149
+ url = urllib.parse.urlparse(video_url)
150
+ if url.hostname == 'youtu.be':
151
+ return url.path[1:]
152
+ if url.hostname in ('www.youtube.com', 'youtube.com'):
153
+ if url.path == '/watch':
154
+ p = urllib.parse.parse_qs(url.query)
155
+ return p['v'][0]
156
+ if url.path[:7] == '/embed/':
157
+ return url.path.split('/')[2]
158
+ if url.path[:3] == '/v/':
159
+ return url.path.split('/')[2]
160
+
161
+ return video_url
162
+
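The URL shapes listed in the docstring all resolve to the same video id; a quick sketch with the helpers above in scope:

for url in [
    "http://youtu.be/SA2iWivDJiE",
    "http://www.youtube.com/watch?v=SA2iWivDJiE",
    "http://www.youtube.com/embed/SA2iWivDJiE",
    "http://www.youtube.com/v/SA2iWivDJiE?version=3",
]:
    # every variant should yield "SA2iWivDJiE"
    print(get_video_id_from_url(url))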
163
+ def generate_transcript_vtt(vid_dir, vid_filepath):
164
+ print("Generating transcript for video ", vid_filepath)
165
+ # declare where to save .mp3 audio
166
+ path_to_extracted_audio_file = os.path.join(vid_dir, 'audio.mp3')
167
+
168
+ # extract an mp3 audio file from the mp4 video file
169
+ path_to_video_no_transcript = vid_filepath
170
+ clip = VideoFileClip(path_to_video_no_transcript)
171
+ clip.audio.write_audiofile(path_to_extracted_audio_file)
172
+
173
+ model = whisper.load_model("small")
174
+ options = dict(task="translate", best_of=1, language='en')
175
+ results = model.transcribe(path_to_extracted_audio_file, **options)
176
+
177
+ vtt = getSubs(results["segments"], "vtt")
178
+
179
+ # path to save the generated transcript of the video
180
+ path_to_generated_trans = osp.join(vid_dir, 'captions.vtt')
181
+ # write transcription to file
182
+ with open(path_to_generated_trans, 'w') as f:
183
+ f.write(vtt)
184
+ return path_to_generated_trans
185
+
186
+
187
+ # download the video's transcript if available on YouTube; optionally generate it locally with Whisper instead
188
+ def get_transcript_vtt(path, video_url, vid_file_path, from_gen=False):
189
+ if from_gen:
190
+ return generate_transcript_vtt(path,vid_file_path)
191
+ video_id = get_video_id_from_url(video_url)
192
+ filepath = os.path.join(path,'captions.vtt')
193
+ if os.path.exists(filepath):
194
+ print('Transcript already exists')
195
+ return filepath
196
+
197
+ print('Downloading Transcript...')
198
+
199
+ transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en-GB', 'en'])
200
+ formatter = WebVTTFormatter()
201
+ webvtt_formatted = formatter.format_transcript(transcript)
202
+
203
+ with open(filepath, 'w', encoding='utf-8') as webvtt_file:
204
+ webvtt_file.write(webvtt_formatted)
205
+ webvtt_file.close()
206
+
207
+ return filepath
208
+
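A sketch of how the transcript helpers might be chained, assuming `video_dir` and `video_path` came from `download_video` above; the URL is a placeholder, and passing `from_gen=True` falls back to Whisper-based generation, which downloads the Whisper model and processes the audio track.

vtt_path = get_transcript_vtt(
    path=video_dir,                                             # folder returned by download_video
    video_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",    # placeholder URL
    vid_file_path=video_path,                                   # file returned by download_video
    from_gen=False,
)
print("captions written to", vtt_path)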
209
+
210
+ # helper function to convert a time in seconds into the timestamp format used by .vtt or .srt files
211
+ def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeparator: str = '.'):
212
+ assert seconds >= 0, "non-negative timestamp expected"
213
+ milliseconds = round(seconds * 1000.0)
214
+
215
+ hours = milliseconds // 3_600_000
216
+ milliseconds -= hours * 3_600_000
217
+
218
+ minutes = milliseconds // 60_000
219
+ milliseconds -= minutes * 60_000
220
+
221
+ seconds = milliseconds // 1_000
222
+ milliseconds -= seconds * 1_000
223
+
224
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
225
+ return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"
226
+
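For example, 3661.5 seconds formats as follows (the expected outputs are shown as comments):

print(format_timestamp(3661.5))                           # 01:01:01.500
print(format_timestamp(5.25, always_include_hours=True))  # 00:00:05.250
print(format_timestamp(5.25, False, ','))                 # 00:05,250 (comma separator, as used for .srt)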
227
+ # helper function to convert a webvtt-formatted time string into a time in milliseconds
228
+ def str2time(strtime):
229
+ # strip character " if exists
230
+ strtime = strtime.strip('"')
231
+ # get hour, minute, second from time string
232
+ hrs, mins, seconds = [float(c) for c in strtime.split(':')]
233
+ # get the corresponding time as total seconds
234
+ total_seconds = hrs * 60**2 + mins * 60 + seconds
235
+ total_milliseconds = total_seconds * 1000
236
+ return total_milliseconds
237
+
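For instance, a webvtt timestamp maps to milliseconds like this:

print(str2time("00:01:02.500"))    # 62500.0 (1 minute, 2.5 seconds)
print(str2time('"01:00:00.000"'))  # 3600000.0 (surrounding quotes are stripped)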
238
+ def _processText(text: str, maxLineWidth=None):
239
+ if (maxLineWidth is None or maxLineWidth < 0):
240
+ return text
241
+
242
+ lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
243
+ return '\n'.join(lines)
244
+
245
+ # Resizes an image while maintaining its aspect ratio
246
+ def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
247
+ # Grab the image size and initialize dimensions
248
+ dim = None
249
+ (h, w) = image.shape[:2]
250
+
251
+ # Return original image if no need to resize
252
+ if width is None and height is None:
253
+ return image
254
+
255
+ # We are resizing height if width is none
256
+ if width is None:
257
+ # Calculate the ratio of the height and construct the dimensions
258
+ r = height / float(h)
259
+ dim = (int(w * r), height)
260
+ # We are resizing width if height is none
261
+ else:
262
+ # Calculate the ratio of the width and construct the dimensions
263
+ r = width / float(w)
264
+ dim = (width, int(h * r))
265
+
266
+ # Return the resized image
267
+ return cv2.resize(image, dim, interpolation=inter)
268
+
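A small sketch of the resize helper on a synthetic frame, so it runs without any video on disk:

import numpy as np

frame = np.zeros((720, 1280, 3), dtype=np.uint8)        # fake 1280x720 BGR frame
resized = maintain_aspect_ratio_resize(frame, height=350)
print(resized.shape)  # (350, 622, 3): the width is scaled by the same ratio as the height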
269
+ # helper function to convert transcripts generated by whisper to .vtt file
270
+ def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
271
+ print("WEBVTT\n", file=file)
272
+ for segment in transcript:
273
+ text = _processText(segment['text'], maxLineWidth).replace('-->', '->')
274
+
275
+ print(
276
+ f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
277
+ f"{text}\n",
278
+ file=file,
279
+ flush=True,
280
+ )
281
+
282
+ # helper function to convert transcripts generated by whisper to .srt file
283
+ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
284
+ """
285
+ Write a transcript to a file in SRT format.
286
+ Example usage:
287
+ from pathlib import Path
288
+ from whisper.utils import write_srt
289
+ import requests
290
+ result = transcribe(model, audio_path, temperature=temperature, **args)
291
+ # save SRT
292
+ audio_basename = Path(audio_path).stem
293
+ with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
294
+ write_srt(result["segments"], file=srt)
295
+ """
296
+ for i, segment in enumerate(transcript, start=1):
297
+ text = _processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')
298
+
299
+ # write srt lines
300
+ print(
301
+ f"{i}\n"
302
+ f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
303
+ f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
304
+ f"{text}\n",
305
+ file=file,
306
+ flush=True,
307
+ )
308
+
309
+ def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
310
+ segmentStream = StringIO()
311
+
312
+ if format == 'vtt':
313
+ write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
314
+ elif format == 'srt':
315
+ write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
316
+ else:
317
+ raise Exception("Unknown format " + format)
318
+
319
+ segmentStream.seek(0)
320
+ return segmentStream.read()
321
+
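A sketch of `getSubs` on hand-written segments shaped like Whisper's output (each segment needs 'start', 'end' and 'text' keys):

fake_segments = [
    {"start": 0.0, "end": 2.5, "text": " Hello there."},
    {"start": 2.5, "end": 5.0, "text": " This is a test."},
]
print(getSubs(fake_segments, "vtt"))  # WEBVTT header plus two timed cues
print(getSubs(fake_segments, "srt"))  # numbered SRT cues with comma fractional separators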
322
+ # encode an image (given as a file path or a PIL Image) using base64
323
+ def encode_image(image_path_or_PIL_img):
324
+ if isinstance(image_path_or_PIL_img, PIL.Image.Image):
325
+ # this is a PIL image
326
+ buffered = BytesIO()
327
+ image_path_or_PIL_img.save(buffered, format="JPEG")
328
+ return base64.b64encode(buffered.getvalue()).decode('utf-8')
329
+ else:
330
+ # this is an image path
331
+ with open(image_path_or_PIL_img, "rb") as image_file:
332
+ return base64.b64encode(image_file.read()).decode('utf-8')
333
+
334
+ # checking whether the given string is base64 or not
335
+ def isBase64(sb):
336
+ try:
337
+ if isinstance(sb, str):
338
+ # If there's any unicode here, an exception will be thrown and the function will return false
339
+ sb_bytes = bytes(sb, 'ascii')
340
+ elif isinstance(sb, bytes):
341
+ sb_bytes = sb
342
+ else:
343
+ raise ValueError("Argument must be string or bytes")
344
+ return base64.b64encode(base64.b64decode(sb_bytes)) == sb_bytes
345
+ except Exception:
346
+ return False
347
+
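A quick round-trip showing that `encode_image` output passes the `isBase64` check; the PIL image is synthetic, so nothing is read from disk.

from PIL import Image

tiny = Image.new("RGB", (8, 8), color=(255, 0, 0))
b64 = encode_image(tiny)         # accepts a PIL image (or a file path)
print(isBase64(b64))             # True
print(isBase64("not base64!!"))  # False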
348
+ def encode_image_from_path_or_url(image_path_or_url):
349
+ try:
+ # try to fetch the input as a URL
+ response = requests.get(image_path_or_url)
+ response.raise_for_status()
+ # if this is a URL, encode the downloaded bytes
+ return base64.b64encode(response.content).decode('utf-8')
+ except Exception:
+ # otherwise this is a path to a local image file
+ with open(image_path_or_url, "rb") as image_file:
+ return base64.b64encode(image_file.read()).decode('utf-8')
358
+
359
+ # helper function to compute the joint embedding of a prompt and a base64-encoded image through PredictionGuard
360
+ def bt_embedding_from_prediction_guard(prompt, base64_image):
361
+ # get PredictionGuard client
362
+ client = _getPredictionGuardClient()
363
+ message = {"text": prompt,}
364
+ if base64_image is not None and base64_image != "":
365
+ if not isBase64(base64_image):
366
+ raise TypeError("image input must be in base64 encoding!")
367
+ message['image'] = base64_image
368
+ response = client.embeddings.create(
369
+ model="bridgetower-large-itm-mlm-itc",
370
+ input=[message]
371
+ )
372
+ return response['data'][0]['embedding']
373
+
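A sketch of calling the BridgeTower embedding helper; this performs a network call and assumes a valid Prediction Guard API key is available in the environment. The image is synthetic so nothing is read from disk.

from PIL import Image

b64_img = encode_image(Image.new("RGB", (8, 8), color=(0, 128, 255)))
emb = bt_embedding_from_prediction_guard("a small blue square", b64_img)
print(len(emb))  # dimensionality of the returned joint embedding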
374
+
375
+ def load_json_file(file_path):
376
+ # Open the JSON file in read mode
377
+ with open(file_path, 'r') as file:
378
+ data = json.load(file)
379
+ return data
380
+
381
+ def display_retrieved_results(results):
382
+ print(f'There is/are {len(results)} retrieved result(s)')
383
+ print()
384
+ for i, res in enumerate(results):
385
+ print(f'The caption of the {str(i+1)}-th retrieved result is:\n"{results[i].page_content}"')
386
+ print()
387
+ print(results[i])
388
+ #display(Image.open(results[i].metadata['metadata']['extracted_frame_path']))
389
+ print("------------------------------------------------------------")
390
+
391
+ class SeparatorStyle(Enum):
392
+ """Different separator style."""
393
+ SINGLE = auto()
394
+
395
+ @dataclasses.dataclass
396
+ class Conversation:
397
+ """A class that keeps all conversation history"""
398
+ system: str
399
+ roles: List[str]
400
+ messages: List[List[str]]
401
+ map_roles: Dict[str, str]
402
+ version: str = "Unknown"
403
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
404
+ sep: str = "\n"
405
+
406
+ def _get_prompt_role(self, role):
407
+ if self.map_roles is not None and role in self.map_roles.keys():
408
+ return self.map_roles[role]
409
+ else:
410
+ return role
411
+
412
+ def _build_content_for_first_message_in_conversation(self, first_message: List[str]):
413
+ content = []
414
+ if len(first_message) != 2:
415
+ raise TypeError("First message in Conversation needs to include a prompt and a base64-enconded image!")
416
+
417
+ prompt, b64_image = first_message[0], first_message[1]
418
+
419
+ # handling prompt
420
+ if prompt is None:
421
+ raise TypeError("API does not support None prompt yet")
422
+ content.append({
423
+ "type": "text",
424
+ "text": prompt
425
+ })
426
+ if b64_image is None:
427
+ raise TypeError("API does not support text only conversation yet")
428
+
429
+ # handling image
430
+ if not isBase64(b64_image):
431
+ raise TypeError("Image in Conversation's first message must be stored under base64 encoding!")
432
+
433
+ content.append({
434
+ "type": "image_url",
435
+ "image_url": {
436
+ "url": b64_image,
437
+ }
438
+ })
439
+ return content
440
+
441
+ def _build_content_for_follow_up_messages_in_conversation(self, follow_up_message: List[str]):
442
+
443
+ if follow_up_message is not None and len(follow_up_message) > 1:
444
+ raise TypeError("Follow-up message in Conversation must not include an image!")
445
+
446
+ # handling text prompt
447
+ if follow_up_message is None or follow_up_message[0] is None:
448
+ raise TypeError("Follow-up message in Conversation must include exactly one text message")
449
+
450
+ text = follow_up_message[0]
451
+ return text
452
+
453
+ def get_message(self):
454
+ messages = self.messages
455
+ api_messages = []
456
+ for i, msg in enumerate(messages):
457
+ role, message_content = msg
458
+ if i == 0:
459
+ # get content for very first message in conversation
460
+ content = self._build_content_for_first_message_in_conversation(message_content)
461
+ else:
462
+ # get content for follow-up message in conversation
463
+ content = self._build_content_for_follow_up_messages_in_conversation(message_content)
464
+
465
+ api_messages.append({
466
+ "role": role,
467
+ "content": content,
468
+ })
469
+ return api_messages
470
+
471
+ # this method serializes a multi-turn chat into a single-turn chat format
472
+ def serialize_messages(self):
473
+ messages = self.messages
474
+ ret = ""
475
+ if self.sep_style == SeparatorStyle.SINGLE:
476
+ if self.system is not None and self.system != "":
477
+ ret = self.system + self.sep
478
+ for i, (role, message) in enumerate(messages):
479
+ role = self._get_prompt_role(role)
480
+ if message:
481
+ if isinstance(message, List):
482
+ # get prompt only
483
+ message = message[0]
484
+ if i == 0:
485
+ # do not include role at the beginning
486
+ ret += message
487
+ else:
488
+ ret += role + ": " + message
489
+ if i < len(messages) - 1:
490
+ # avoid including sep at the end of serialized message
491
+ ret += self.sep
492
+ else:
493
+ ret += role + ":"
494
+ else:
495
+ raise ValueError(f"Invalid style: {self.sep_style}")
496
+
497
+ return ret
498
+
499
+ def append_message(self, role, message):
500
+ if len(self.messages) == 0:
501
+ # data verification for the very first message
502
+ assert role == self.roles[0], f"the very first message in conversation must be from role {self.roles[0]}"
503
+ assert len(message) == 2, f"the very first message in conversation must include both prompt and an image"
504
+ prompt, image = message[0], message[1]
505
+ assert prompt is not None, f"prompt must be not None"
506
+ assert isBase64(image), f"image must be under base64 encoding"
507
+ else:
508
+ # data verification for follow-up message
509
+ assert role in self.roles, f"the follow-up message must be from one of the roles {self.roles}"
510
+ assert len(message) == 1, f"the follow-up message must consist of one text message only, no image"
511
+
512
+ self.messages.append([role, message])
513
+
514
+ def copy(self):
515
+ return Conversation(
516
+ system=self.system,
517
+ roles=self.roles,
518
+ messages=[[x,y] for x, y in self.messages],
519
+ version=self.version,
520
+ map_roles=self.map_roles,
521
+ )
522
+
523
+ def dict(self):
524
+ return {
525
+ "system": self.system,
526
+ "roles": self.roles,
527
+ "messages": [[x, y[0] if len(y) == 1 else y] for x, y in self.messages],
528
+ "version": self.version,
529
+ }
530
+
531
+ prediction_guard_llava_conv = Conversation(
532
+ system="",
533
+ roles=("user", "assistant"),
534
+ messages=[],
535
+ version="Prediction Guard LLaVA enpoint Conversation v0",
536
+ sep_style=SeparatorStyle.SINGLE,
537
+ map_roles={
538
+ "user": "USER",
539
+ "assistant": "ASSISTANT"
540
+ }
541
+ )
542
+
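A local sketch of how the Conversation container is used (no API call is made here): the first turn must carry a prompt plus a base64-encoded image, and follow-up turns are text-only.

from PIL import Image

conv = prediction_guard_llava_conv.copy()
img_b64 = encode_image(Image.new("RGB", (8, 8)))
conv.append_message(conv.roles[0], ["What is in this image?", img_b64])
conv.append_message(conv.roles[1], ["A small black square."])
print(conv.get_message())          # messages formatted for the chat completion API
print(conv.serialize_messages())   # the same chat flattened into a single prompt string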
543
+ # get PredictionGuard Client
544
+ def _getPredictionGuardClient():
545
+ PREDICTION_GUARD_API_KEY = get_prediction_guard_api_key()
546
+ client = PredictionGuard(
547
+ api_key=PREDICTION_GUARD_API_KEY,
548
+ url=PREDICTION_GUARD_URL_ENDPOINT,
549
+ )
550
+ return client
551
+
552
+ # helper function to call chat completion endpoint of PredictionGuard given a prompt and an image
553
+ def lvlm_inference(prompt, image, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10):
554
+ # prepare conversation
555
+ conversation = prediction_guard_llava_conv.copy()
556
+ conversation.append_message(conversation.roles[0], [prompt, image])
557
+ return lvlm_inference_with_conversation(conversation, max_tokens=max_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
558
+
559
+
560
+
561
+ def lvlm_inference_with_conversation(conversation, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10):
562
+ # get PredictionGuard client
563
+ client = _getPredictionGuardClient()
564
+ # get message from conversation
565
+ messages = conversation.get_message()
566
+ # call the chat completion endpoint of Prediction Guard
567
+ response = client.chat.completions.create(
568
+ model="llava-1.5-7b-hf",
569
+ messages=messages,
570
+ max_tokens=max_tokens,
571
+ temperature=temperature,
572
+ top_p=top_p,
573
+ top_k=top_k,
574
+ )
575
+ return response['choices'][-1]['message']['content']
576
+
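A sketch of calling the LLaVA endpoint on a single frame; this performs a network call, assumes a valid Prediction Guard API key, and the frame path below is a placeholder for a previously extracted frame.

frame_b64 = encode_image("frames/frame_0.jpg")  # placeholder path to an extracted frame
caption = lvlm_inference("Describe this video frame in one sentence.", frame_b64)
print(caption)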
577
+ def get_token():
578
+ load_env()
579
+ token = os.getenv("HUGGINGFACE_TOKEN")
580
+ if token is None:
581
+ raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
582
+ return token
583
+
584
+
585
+ # despite its name, this helper runs a text-only chat completion against Meta-Llama-3-8B-Instruct via the Hugging Face Inference API
+ def lvlm_inference_with_phi(prompt):
+ messages = [{"role": "user", "content": prompt}]
+ client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=get_token())
+ completion = client.chat_completion(messages, max_tokens=256)
+ response = completion['choices'][0]['message']['content']
+ return response
594
+
595
+ def lvlm_inference_with_tiny_model(prompt):
596
+ classifier = pipeline(
597
+ "text-generation",
598
+ model="microsoft/phi-2", # Only ~2.7GB
599
+ device_map="auto",
600
+ torch_dtype="auto",
601
+ )
602
+
603
+ response = classifier(
604
+ prompt,
605
+ max_new_tokens=512, # cap the number of newly generated tokens
606
+ temperature=0.7,
607
+ do_sample=True,
608
+ num_return_sequences=1,
609
+ truncation=True, # truncate prompts that exceed the model's context length
610
+ pad_token_id=classifier.tokenizer.eos_token_id,
611
+ eos_token_id=classifier.tokenizer.eos_token_id,
612
+ )[0]['generated_text']
613
+
614
+ # Remove the input prompt from the response and clean up
615
+ return response.replace(prompt, "").strip()
616
+
617
+ # function `extract_and_save_frames_and_metadata`:
+ # receives as input a video and its transcript,
+ # extracts and saves frames together with their metadata,
+ # and returns the extracted metadata
621
+ def extract_and_save_frames_and_metadata(
622
+ path_to_video,
623
+ path_to_transcript,
624
+ path_to_save_extracted_frames,
625
+ path_to_save_metadatas):
626
+
627
+ # metadatas will store the metadata of all extracted frames
628
+ metadatas = []
629
+
630
+ # load video using cv2
631
+ print(f"Loading video from {path_to_video}")
632
+ video = cv2.VideoCapture(path_to_video)
633
+ # load transcript using webvtt
634
+ print(f"Loading transcript from {path_to_transcript}")
635
+ trans = webvtt.read(path_to_transcript)
636
+
637
+ # iterate transcript file
638
+ # for each video segment specified in the transcript file
639
+ for idx, transcript in enumerate(trans):
640
+ # get the start time and end time in seconds
641
+ start_time_ms = str2time(transcript.start)
642
+ end_time_ms = str2time(transcript.end)
643
+ # get the time in ms exactly
644
+ # in the middle of start time and end time
645
+ mid_time_ms = (end_time_ms + start_time_ms) / 2
646
+ # get the transcript, remove the next-line symbol
647
+ text = transcript.text.replace("\n", ' ')
648
+ # get frame at the middle time
649
+ video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
650
+ print(f"Extracting frame at {mid_time_ms} ms")
651
+ success, frame = video.read()
652
+ if success:
653
+ # if the frame is extracted successfully, resize it
654
+ image = maintain_aspect_ratio_resize(frame, height=350)
655
+ # save frame as JPEG file
656
+ img_fname = f'frame_{idx}.jpg'
657
+ img_fpath = osp.join(
658
+ path_to_save_extracted_frames, img_fname
659
+ )
660
+ cv2.imwrite(img_fpath, image)
661
+
662
+ # prepare the metadata
663
+ metadata = {
664
+ 'extracted_frame_path': img_fpath,
665
+ 'transcript': text,
666
+ 'video_segment_id': idx,
667
+ 'video_path': path_to_video,
668
+ 'mid_time_ms': mid_time_ms,
669
+ }
670
+ metadatas.append(metadata)
671
+
672
+ else:
673
+ print(f"ERROR! Cannot extract frame: idx = {idx}")
674
+
675
+ # save metadata of all extracted frames
676
+ fn = osp.join(path_to_save_metadatas, 'metadatas.json')
677
+ with open(fn, 'w') as outfile:
678
+ json.dump(metadatas, outfile)
679
+ return metadatas
680
+
681
+ def extract_meta_data(vid_dir, vid_filepath, vid_transcript_filepath):
682
+ # output paths to save extracted frames and their metadata
683
+ extracted_frames_path = osp.join(vid_dir, 'extracted_frame')
684
+ metadatas_path = vid_dir
685
+
686
+ # create these output folders if not existing
687
+ print(f"Creating folders {extracted_frames_path} and {metadatas_path}")
688
+ Path(extracted_frames_path).mkdir(parents=True, exist_ok=True)
689
+ Path(metadatas_path).mkdir(parents=True, exist_ok=True)
690
+ print("Extracting frames the video path ", vid_filepath)
691
+
692
+ # call the function to extract frames and metadatas
693
+ metadatas = extract_and_save_frames_and_metadata(
694
+ vid_filepath,
695
+ vid_transcript_filepath,
696
+ extracted_frames_path,
697
+ metadatas_path,
698
+ )
699
+ return metadatas
700
+
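A sketch of the transcript-based extraction path, assuming a video and its .vtt captions are already on disk; the paths are placeholders.

metadatas = extract_meta_data(
    vid_dir="shared_data/videodir",                             # placeholder folder
    vid_filepath="shared_data/videodir/video.mp4",              # placeholder video file
    vid_transcript_filepath="shared_data/videodir/captions.vtt",
)
print(metadatas[0]['transcript'], metadatas[0]['extracted_frame_path'])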
701
+ # function `extract_and_save_frames_and_metadata_with_fps`:
+ # receives as input a video,
+ # extracts frames at a fixed rate, captions them, and saves them with their metadata,
+ # and returns the extracted metadata
705
+ def extract_and_save_frames_and_metadata_with_fps(
706
+ lvlm_prompt,
707
+ path_to_video,
708
+ path_to_save_extracted_frames,
709
+ path_to_save_metadatas,
710
+ num_of_extracted_frames_per_second=1):
711
+
712
+ # metadatas will store the metadata of all extracted frames
713
+ metadatas = []
714
+
715
+ # load video using cv2
716
+ video = cv2.VideoCapture(path_to_video)
717
+
718
+ # Get the frames per second
719
+ fps = video.get(cv2.CAP_PROP_FPS)
720
+ # Get hop = the number of frames to skip between two extracted frames
721
+ hop = round(fps / num_of_extracted_frames_per_second)
722
+ curr_frame = 0
723
+ idx = -1
724
+ while True:
725
+ # iterate all frames
726
+ ret, frame = video.read()
727
+ if not ret:
728
+ break
729
+ if curr_frame % hop == 0:
730
+ idx = idx + 1
731
+
732
+ # resize the sampled frame
733
+ image = maintain_aspect_ratio_resize(frame, height=350)
734
+ # save frame as JPEG file
735
+ img_fname = f'frame_{idx}.jpg'
736
+ img_fpath = osp.join(
737
+ path_to_save_extracted_frames,
738
+ img_fname
739
+ )
740
+ cv2.imwrite(img_fpath, image)
741
+
742
+ # generate caption using lvlm_inference
743
+ b64_image = encode_image(img_fpath)
744
+ caption = lvlm_inference(lvlm_prompt, b64_image)
745
+
746
+ # prepare the metadata
747
+ metadata = {
748
+ 'extracted_frame_path': img_fpath,
749
+ 'transcript': caption,
750
+ 'video_segment_id': idx,
751
+ 'video_path': path_to_video,
752
+ }
753
+ metadatas.append(metadata)
754
+ curr_frame += 1
755
+
756
+ # save metadata of all extracted frames
757
+ metadatas_path = osp.join(path_to_save_metadatas,'metadatas.json')
758
+ with open(metadatas_path, 'w') as outfile:
759
+ json.dump(metadatas, outfile)
760
+ return metadatas
761
+
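A sketch tying the fps-based extraction together; the paths are placeholders, the output folders are created up front, and each sampled frame triggers one LLaVA call through `lvlm_inference` (so an API key and network access are assumed).

from pathlib import Path

Path("shared_data/frames").mkdir(parents=True, exist_ok=True)
metadatas = extract_and_save_frames_and_metadata_with_fps(
    lvlm_prompt="Describe what is happening in this frame.",
    path_to_video="shared_data/video.mp4",          # placeholder video path
    path_to_save_extracted_frames="shared_data/frames",
    path_to_save_metadatas="shared_data",
    num_of_extracted_frames_per_second=1,
)
print(len(metadatas), "frames captioned")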
762
+ if __name__ == "__main__":
763
+ res = lvlm_inference_with_phi("Tell me a story")
764
  print(res)