Changing embedding from PredictionGuard to Local
lrn_vector_embeddings.py
CHANGED
@@ -102,8 +102,10 @@ def bt_with_masked_input():
 
     print(results)
     return results
-
-
-
-
-
+
+if __name__ == "__main__":
+    #res = bt_embeddingsl()
+    #print((res['text_embeddings']))
+    for img in [img1, img2, img3]:
+        embeddings = bt_embeddings_from_local(img['caption'], Image.open(img['image_path']))
+        print(embeddings['cross_modal_embeddings'][0].shape)
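
Both the new `__main__` block above and the embedder change below call `bt_embeddings_from_local`, whose definition is not part of this changeset. As a rough sketch of what it presumably does, assuming it wraps the open BridgeTower checkpoint from Hugging Face `transformers` (the checkpoint name and the mapping of model outputs to the returned keys are assumptions, not code from this commit):

```python
# Hypothetical sketch of bt_embeddings_from_local, NOT the code from this
# commit; assumes the BridgeTower base checkpoint from Hugging Face.
import torch
from PIL import Image
from transformers import BridgeTowerProcessor, BridgeTowerModel

processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

def bt_embeddings_from_local(text: str, image: Image.Image) -> dict:
    # Tokenize the caption and preprocess the image in one call.
    inputs = processor(image, text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # pooler_output is the fused text+image representation with a leading
    # batch dimension; indexing [0] drops it, matching the .shape printed
    # in the __main__ block above.
    return {
        "text_embeddings": outputs.text_features,
        "image_embeddings": outputs.image_features,
        "cross_modal_embeddings": outputs.pooler_output,
    }
```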
mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc
CHANGED
Binary files a/mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc and b/mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc differ
mm_rag/embeddings/bridgetower_embeddings.py
CHANGED
@@ -3,9 +3,10 @@ from langchain_core.embeddings import Embeddings
 from langchain_core.pydantic_v1 import (
     BaseModel,
 )
+from lrn_vector_embeddings import bt_embeddings_from_local
 from utility import encode_image, bt_embedding_from_prediction_guard
 from tqdm import tqdm
-
+from PIL import Image
 class BridgeTowerEmbeddings(BaseModel, Embeddings):
     """ BridgeTower embedding model """
 
@@ -51,6 +52,6 @@ class BridgeTowerEmbeddings(BaseModel, Embeddings):
 
         embeddings = []
         for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
-            embedding = bt_embedding_from_prediction_guard(text, encode_image(path_to_img))
+            embedding = bt_embeddings_from_local(text, Image.open(path_to_img))
             embeddings.append(embedding)
         return embeddings
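
With this change, each (image, caption) pair is embedded by the local model instead of the PredictionGuard API. A hedged usage sketch; the method name `embed_image` and its parameter names are assumed, since the hunk header only shows the loop body:

```python
# Hypothetical call site: embed_image and its parameter names are assumed
# from the loop in the hunk above, not shown in this diff.
embedder = BridgeTowerEmbeddings()
vectors = embedder.embed_image(
    texts=["a speaker at a podium", "a slide listing agenda items"],
    images=["frames/frame_000.jpg", "frames/frame_001.jpg"],  # placeholder paths
)
print(len(vectors))  # one embedding per (image, caption) pair
```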
requirements.txt
CHANGED
@@ -14,4 +14,6 @@ whisper
 webvtt-py
 tqdm
 lancedb
-
+langchain-core
+langchain-community
+ollama
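
The new `ollama` dependency talks to a locally running Ollama daemon rather than a hosted API, so the model referenced in `utility.py` must be pulled into the local registry before any of this runs. A minimal sanity check, assuming the daemon is listening on its default port:

```python
# Raises if the local Ollama daemon is unreachable or the model has not
# been pulled; the model name mirrors the one used in utility.py below.
import ollama
ollama.show("llava-1.5-7b-hf")
```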
utility.py
CHANGED
@@ -9,6 +9,7 @@ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
 from enum import auto, Enum
 import base64
 import glob
+import requests
 from tqdm import tqdm
 from pytubefix import YouTube, Stream
 import webvtt
@@ -18,6 +19,8 @@ from predictionguard import PredictionGuard
 import cv2
 import json
 import PIL
+from ollama import chat
+from ollama import ChatResponse
 from PIL import Image
 import dataclasses
 import random
@@ -234,6 +237,7 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
     Example usage:
         from pathlib import Path
        from whisper.utils import write_srt
+        import requests
        result = transcribe(model, audio_path, temperature=temperature, **args)
        # save SRT
        audio_basename = Path(audio_path).stem
@@ -520,6 +524,31 @@ def lvlm_inference_with_conversation(conversation, max_tokens: int = 200, temper
     )
     return response['choices'][-1]['message']['content']
 
+def lvlm_inference_with_ollama(conversation, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10):
+    # Send the request to the local Ollama server instead of PredictionGuard
+    #response = requests.post("http://localhost:8000/api/v1/completions", json=payload)
+
+    # ollama.chat() takes sampling parameters via `options`;
+    # `num_predict` is Ollama's equivalent of max_tokens.
+    stream = chat(
+        model="llava-1.5-7b-hf",
+        messages=conversation,
+        stream=True,
+        options={
+            "temperature": temperature,
+            "num_predict": max_tokens,
+            "top_p": top_p,
+            "top_k": top_k,
+        },
+    )
+
+    # Accumulate the streamed chunks into a single response string
+    response_data = ''
+    for chunk in stream:
+        response_data += chunk['message']['content']
+
+    return response_data
+
 # function `extract_and_save_frames_and_metadata``:
 # receives as input a video and its transcript
 # does extracting and saving frames and their metadatas
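
A quick smoke test for the new function, assuming a running Ollama daemon and an already-pulled `llava-1.5-7b-hf` model; the ollama chat API accepts image paths through the `images` key of a message (the frame path here is a placeholder):

```python
# Hypothetical smoke test; assumes a running Ollama daemon and that the
# llava-1.5-7b-hf model has been pulled. The frame path is a placeholder.
conversation = [
    {
        "role": "user",
        "content": "Describe this video frame in one sentence.",
        "images": ["frames/frame_000.jpg"],
    }
]
print(lvlm_inference_with_ollama(conversation, max_tokens=100))
```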