Update build_rag.py

build_rag.py  +18 -24
@@ -1,10 +1,9 @@
-# build_rag.py
+# build_rag.py
 
 import json
 import os
 import pandas as pd
 import torch
-import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModel
 import chromadb
 import sys
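Note: the dropped import torch.nn.functional as F had a single use, the F.normalize call removed further down. The collection is created with {"hnsw:space": "cosine"}, and cosine similarity is scale-invariant (cos(u, v) = u.v / (||u|| ||v||)), so L2-normalizing the embeddings beforehand cannot change any neighbor ranking; both the call and the import can go.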
@@ -15,10 +14,8 @@ import traceback
 # --- Configuration ---
 CHROMA_PATH = "chroma_db"
 COLLECTION_NAME = "bible_verses"
-# *** CHANGE 1: USE A MODEL WITH NORMALIZED EMBEDDINGS ***
 MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-
-DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet"
+DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet"  # This can remain the same
 STATUS_FILE = "build_status.log"
 JSON_DIRECTORY = 'bible_json'
 CHUNK_SIZE = 3
@@ -46,20 +43,19 @@ def update_status(message):
     with open(STATUS_FILE, "w") as f:
         f.write(message)
 
-# Mean Pooling Function
+# Mean Pooling Function
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0]
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-def process_bible_json_files(directory_path: str, chunk_size: int):
-    # (This function is unchanged)
+def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
     all_verses = []
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
         raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
     for filename in os.listdir(directory_path):
         if filename.endswith('.json'):
-            version_name = filename.split('
+            version_name = os.path.splitext(filename)[0].split('_')[-1].upper()
             file_path = os.path.join(directory_path, filename)
             with open(file_path, 'r') as f: data = json.load(f)
             rows = data.get("resultset", {}).get("row", [])
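A minimal usage sketch for mean_pooling, not part of the commit: it assumes tokenizer and model are already loaded from MODEL_NAME as in main(), and the sample text is hypothetical.

    # Embed one text with the same pooling the build uses.
    encoded = tokenizer(
        ["In the beginning God created the heaven and the earth."],
        padding=True, truncation=True, return_tensors="pt",
    )
    with torch.no_grad():
        output = model(**encoded)
    # Average token embeddings, masking out padding positions.
    embedding = mean_pooling(output, encoded["attention_mask"])  # shape (1, 768)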
@@ -79,7 +75,14 @@ def process_bible_json_files(directory_path: str, chunk_size: int):
             combined_text = " ".join(chunk_df['text'])
             start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
             reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
-
+            # *** CHANGE 1: ADD MORE METADATA TO EACH CHUNK ***
+            all_chunks.append({
+                'text': combined_text,
+                'reference': reference,
+                'version': version,
+                'book_name': book_name,
+                'chapter': chapter
+            })
     return pd.DataFrame(all_chunks)
 
 def main():
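The extra fields stored with each chunk are what make filtered retrieval possible later. A hedged sketch of a query against the finished collection (the book filter and query_embedding are hypothetical; a real query embedding would come from the same model and mean_pooling):

    # Fetch the 5 nearest chunks, restricted to one book via the new metadata.
    results = collection.query(
        query_embeddings=query_embedding.cpu().tolist(),
        n_results=5,
        where={"book_name": "John"},
    )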
@@ -92,16 +95,13 @@ def main():
         shutil.rmtree(CHROMA_PATH)
     client = chromadb.PersistentClient(path=CHROMA_PATH)
 
-    collection = client.create_collection(
-        name=COLLECTION_NAME,
-        metadata={"hnsw:space": "cosine"}
-    )
+    collection = client.create_collection(name=COLLECTION_NAME, metadata={"hnsw:space": "cosine"})
 
     update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
 
-    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings
+    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings...")
     for i in tqdm(range(0, len(bible_chunks_df), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
         batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
         texts = batch_df['text'].tolist()
@@ -112,24 +112,18 @@ def main():
 
         embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
 
-        # *** REMOVED: NO LONGER NEED TO NORMALIZE THE EMBEDDINGS ***
-        # embeddings = F.normalize(embeddings, p=2, dim=1)
-
         collection.add(
             ids=[str(j) for j in range(i, i + len(batch_df))],
             embeddings=embeddings.cpu().tolist(),
             documents=texts,
-
+            # *** CHANGE 2: SAVE THE NEW METADATA FIELDS TO THE DATABASE ***
+            metadatas=batch_df[['reference', 'version', 'book_name', 'chapter']].to_dict('records')
         )
 
     update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
     create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
     api = HfApi()
-    api.upload_folder(
-        folder_path=CHROMA_PATH,
-        repo_id=DATASET_REPO,
-        repo_type="dataset",
-    )
+    api.upload_folder(folder_path=CHROMA_PATH, repo_id=DATASET_REPO, repo_type="dataset")
 
     update_status("SUCCESS: Build complete! The application is ready.")
 
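Once pushed, a consumer can restore the database from the Hub and open it locally. A minimal sketch, assuming the same repo, path, and collection names as the configuration above (snapshot_download is the standard huggingface_hub download helper):

    from huggingface_hub import snapshot_download
    import chromadb

    # Pull the ChromaDB folder from the dataset repo and open the collection.
    snapshot_download(repo_id="broadfield-dev/bible-chromadb-multi-qa-mpnet",
                      repo_type="dataset", local_dir="chroma_db")
    client = chromadb.PersistentClient(path="chroma_db")
    collection = client.get_collection("bible_verses")
    print(collection.count())  # number of embedded chunks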