talk_to_pdf / app /embed_and_store.py
sapatevaibhav
Update README and refactor code structure for image-based PDF RAG system
2376bae
# app/embed_and_store.py
import os
from app.embed import get_image_embedding
from app.chroma_utils import add_embedding
# Anchor the data directories to this file's location rather than the current
# working directory: base_dir is the directory one level above the folder
# that contains this module.
_module_path = os.path.abspath(__file__)
base_dir = os.path.dirname(os.path.dirname(_module_path))

# Directory scanned for images when no explicit path is given.
IMAGE_DIR = os.path.join(base_dir, "data", "images")

# Collection name passed to add_embedding for every stored image.
COLLECTION_NAME = "pdf_images"
# Lower-case file extensions treated as images when scanning IMAGE_DIR.
# Matching is done on the lower-cased file name, so "PAGE.PNG" also qualifies.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")


def _embed_and_store(path):
    """Embed one image and hand it to add_embedding; return True on success.

    A ``None`` result from get_image_embedding is treated as a failure, in
    which case nothing is stored and False is returned.
    """
    emb = get_image_embedding(path)
    if emb is None:
        return False
    add_embedding(COLLECTION_NAME, path, emb)
    return True


def embed_all_images(image_path=None):
    """
    Embed and store images in ChromaDB.

    Progress and failures are reported with ``print``; nothing is returned.

    Args:
        image_path: Optional path to a specific image. If it is falsy
            (``None`` or an empty string), every file in IMAGE_DIR whose
            name ends in .png, .jpg or .jpeg (case-insensitive) is
            processed instead, in sorted name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when IMAGE_DIR cannot be
            listed (for example, it does not exist).
    """
    # Single-image mode: no extension filtering is applied to an explicit path.
    if image_path:
        name = os.path.basename(image_path)
        print(f"🔍 Embedding: {name}")
        if _embed_and_store(image_path):
            print(f"✅ Stored {name} in ChromaDB.")
        else:
            print("❌ Embedding failed.")
        return

    # Directory mode: os.listdir order is arbitrary, so sort for reproducible
    # processing order and log output.
    for fname in sorted(os.listdir(IMAGE_DIR)):
        if not fname.lower().endswith(_IMAGE_EXTENSIONS):
            continue
        path = os.path.join(IMAGE_DIR, fname)
        print(f"🔍 Embedding: {fname}")
        if _embed_and_store(path):
            print(f"✅ Stored {fname} in ChromaDB.")
        else:
            print(f"❌ Failed to embed {fname}")
# Script entry point: with no argument, every image in IMAGE_DIR is embedded.
if __name__ == "__main__":
    embed_all_images()