Changing embedding from PredictionGuard to Local
lrn_vector_embeddings.py
CHANGED
@@ -102,8 +102,10 @@ def bt_with_masked_input():
 
     print(results)
     return results
-
-
-
-
-
+
+if __name__ == "__main__":
+    #res = bt_embeddingsl()
+    #print((res['text_embeddings']))
+    for img in [img1, img2, img3]:
+        embeddings = bt_embeddings_from_local(img['caption'], Image.open(img['image_path']))
+        print(embeddings['cross_modal_embeddings'][0].shape)
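
Both the new `__main__` block above and the embedder change below call `bt_embeddings_from_local`, whose definition is not part of this changeset. As a rough sketch of what it presumably does, assuming it wraps the open BridgeTower checkpoint from Hugging Face `transformers` (the checkpoint name and the mapping of model outputs to the returned keys are assumptions, not code from this commit):

```python
# Hypothetical sketch of bt_embeddings_from_local, NOT the code from this
# commit; assumes the BridgeTower base checkpoint from Hugging Face.
import torch
from PIL import Image
from transformers import BridgeTowerProcessor, BridgeTowerModel

processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

def bt_embeddings_from_local(text: str, image: Image.Image) -> dict:
    # Tokenize the caption and preprocess the image in one call.
    inputs = processor(image, text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # pooler_output is the fused text+image representation with a leading
    # batch dimension; indexing [0] drops it, matching the .shape printed
    # in the __main__ block above.
    return {
        "text_embeddings": outputs.text_features,
        "image_embeddings": outputs.image_features,
        "cross_modal_embeddings": outputs.pooler_output,
    }
```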
mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc
CHANGED
Binary files a/mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc and b/mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc differ
mm_rag/embeddings/bridgetower_embeddings.py
CHANGED
@@ -3,9 +3,10 @@ from langchain_core.embeddings import Embeddings
 from langchain_core.pydantic_v1 import (
     BaseModel,
 )
+from lrn_vector_embeddings import bt_embeddings_from_local
 from utility import encode_image, bt_embedding_from_prediction_guard
 from tqdm import tqdm
-
+from PIL import Image
 class BridgeTowerEmbeddings(BaseModel, Embeddings):
     """ BridgeTower embedding model """
 
@@ -51,6 +52,6 @@ class BridgeTowerEmbeddings(BaseModel, Embeddings):
 
         embeddings = []
         for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
-            embedding = bt_embedding_from_prediction_guard(text, encode_image(path_to_img))
+            embedding = bt_embeddings_from_local(text, Image.open(path_to_img))
             embeddings.append(embedding)
         return embeddings
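
With this change, each (image, caption) pair is embedded by the local model instead of the PredictionGuard API. A hedged usage sketch; the method name `embed_image` and its parameter names are assumed, since the hunk header only shows the loop body:

```python
# Hypothetical call site: embed_image and its parameter names are assumed
# from the loop in the hunk above, not shown in this diff.
embedder = BridgeTowerEmbeddings()
vectors = embedder.embed_image(
    texts=["a speaker at a podium", "a slide listing agenda items"],
    images=["frames/frame_000.jpg", "frames/frame_001.jpg"],  # placeholder paths
)
print(len(vectors))  # one embedding per (image, caption) pair
```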
requirements.txt
CHANGED
@@ -14,4 +14,6 @@ whisper
 webvtt-py
 tqdm
 lancedb
-
+langchain-core
+langchain-community
+ollama
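
The new `ollama` dependency talks to a locally running Ollama daemon rather than a hosted API, so the model referenced in `utility.py` must be pulled into the local registry before any of this runs. A minimal sanity check, assuming the daemon is listening on its default port:

```python
# Raises if the local Ollama daemon is unreachable or the model has not
# been pulled; the model name mirrors the one used in utility.py below.
import ollama
ollama.show("llava-1.5-7b-hf")
```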
utility.py
CHANGED
@@ -9,6 +9,7 @@ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
 from enum import auto, Enum
 import base64
 import glob
+import requests
 from tqdm import tqdm
 from pytubefix import YouTube, Stream
 import webvtt
@@ -18,6 +19,8 @@ from predictionguard import PredictionGuard
 import cv2
 import json
 import PIL
+from ollama import chat
+from ollama import ChatResponse
 from PIL import Image
 import dataclasses
 import random
@@ -234,6 +237,7 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
     Example usage:
         from pathlib import Path
        from whisper.utils import write_srt
+        import requests
        result = transcribe(model, audio_path, temperature=temperature, **args)
        # save SRT
        audio_basename = Path(audio_path).stem
@@ -520,6 +524,31 @@ def lvlm_inference_with_conversation(conversation, max_tokens: int = 200, temper
     )
     return response['choices'][-1]['message']['content']
 
+def lvlm_inference_with_ollama(conversation, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10):
+    # Send the request to the local Ollama server instead of PredictionGuard
+    #response = requests.post("http://localhost:8000/api/v1/completions", json=payload)
+
+    # ollama.chat() takes sampling parameters via `options`;
+    # `num_predict` is Ollama's equivalent of max_tokens.
+    stream = chat(
+        model="llava-1.5-7b-hf",
+        messages=conversation,
+        stream=True,
+        options={
+            "temperature": temperature,
+            "num_predict": max_tokens,
+            "top_p": top_p,
+            "top_k": top_k,
+        },
+    )
+
+    # Accumulate the streamed chunks into a single response string
+    response_data = ''
+    for chunk in stream:
+        response_data += chunk['message']['content']
+
+    return response_data
+
 # function `extract_and_save_frames_and_metadata``:
 # receives as input a video and its transcript
 # does extracting and saving frames and their metadatas
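
A quick smoke test for the new function, assuming a running Ollama daemon and an already-pulled `llava-1.5-7b-hf` model; the ollama chat API accepts image paths through the `images` key of a message (the frame path here is a placeholder):

```python
# Hypothetical smoke test; assumes a running Ollama daemon and that the
# llava-1.5-7b-hf model has been pulled. The frame path is a placeholder.
conversation = [
    {
        "role": "user",
        "content": "Describe this video frame in one sentence.",
        "images": ["frames/frame_000.jpg"],
    }
]
print(lvlm_inference_with_ollama(conversation, max_tokens=100))
```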