broadfield-dev committed on
Commit b7595c0 · verified · 1 Parent(s): b861f00

Update build_rag.py

Files changed (1)
  1. build_rag.py +18 -24
build_rag.py CHANGED
@@ -1,10 +1,9 @@
-# build_rag.py (Updated for a model with pre-normalized embeddings)
+# build_rag.py
 
 import json
 import os
 import pandas as pd
 import torch
-import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModel
 import chromadb
 import sys
@@ -15,10 +14,8 @@ import traceback
 # --- Configuration ---
 CHROMA_PATH = "chroma_db"
 COLLECTION_NAME = "bible_verses"
-# *** CHANGE 1: USE A MODEL WITH NORMALIZED EMBEDDINGS ***
 MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-# *** CHANGE 2: USE A NEW REPO FOR THE NEW DATABASE ***
-DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet"
+DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet" # This can remain the same
 STATUS_FILE = "build_status.log"
 JSON_DIRECTORY = 'bible_json'
 CHUNK_SIZE = 3
@@ -46,20 +43,19 @@ def update_status(message):
     with open(STATUS_FILE, "w") as f:
         f.write(message)
 
-# Mean Pooling Function - Crucial for sentence-transformer models
+# Mean Pooling Function
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0]
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-def process_bible_json_files(directory_path: str, chunk_size: int):
-    # (This function is unchanged)
+def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
     all_verses = []
     if not os.path.exists(directory_path) or not os.listdir(directory_path):
         raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
     for filename in os.listdir(directory_path):
         if filename.endswith('.json'):
-            version_name = filename.split('.')[0].upper()
+            version_name = os.path.splitext(filename)[0].split('_')[-1].upper()
             file_path = os.path.join(directory_path, filename)
             with open(file_path, 'r') as f: data = json.load(f)
             rows = data.get("resultset", {}).get("row", [])
@@ -79,7 +75,14 @@ def process_bible_json_files(directory_path: str, chunk_size: int):
                 combined_text = " ".join(chunk_df['text'])
                 start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
                 reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
-                all_chunks.append({'text': combined_text, 'reference': reference, 'version': version})
+                # *** CHANGE 1: ADD MORE METADATA TO EACH CHUNK ***
+                all_chunks.append({
+                    'text': combined_text,
+                    'reference': reference,
+                    'version': version,
+                    'book_name': book_name,
+                    'chapter': chapter
+                })
     return pd.DataFrame(all_chunks)
 
 def main():
@@ -92,16 +95,13 @@ def main():
         shutil.rmtree(CHROMA_PATH)
     client = chromadb.PersistentClient(path=CHROMA_PATH)
 
-    collection = client.create_collection(
-        name=COLLECTION_NAME,
-        metadata={"hnsw:space": "cosine"}
-    )
+    collection = client.create_collection(name=COLLECTION_NAME, metadata={"hnsw:space": "cosine"})
 
     update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")
 
-    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings (no normalization needed)...")
+    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings...")
     for i in tqdm(range(0, len(bible_chunks_df), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
         batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
         texts = batch_df['text'].tolist()
@@ -112,24 +112,18 @@ def main():
 
         embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
 
-        # *** REMOVED: NO LONGER NEED TO NORMALIZE THE EMBEDDINGS ***
-        # embeddings = F.normalize(embeddings, p=2, dim=1)
-
         collection.add(
             ids=[str(j) for j in range(i, i + len(batch_df))],
             embeddings=embeddings.cpu().tolist(),
             documents=texts,
-            metadatas=batch_df[['reference', 'version']].to_dict('records')
+            # *** CHANGE 2: SAVE THE NEW METADATA FIELDS TO THE DATABASE ***
+            metadatas=batch_df[['reference', 'version', 'book_name', 'chapter']].to_dict('records')
        )
 
     update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
     create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
     api = HfApi()
-    api.upload_folder(
-        folder_path=CHROMA_PATH,
-        repo_id=DATASET_REPO,
-        repo_type="dataset",
-    )
+    api.upload_folder(folder_path=CHROMA_PATH, repo_id=DATASET_REPO, repo_type="dataset")
 
     update_status("SUCCESS: Build complete! The application is ready.")
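
The extra book_name and chapter metadata written by this commit makes it possible to filter retrieval by book or chapter when querying the rebuilt collection. The snippet below is a minimal query sketch, not part of this commit: it assumes the local chroma_db folder produced by build_rag.py, reuses the same model and mean pooling so query vectors match the stored embeddings, and the file name query_sketch.py and the filter value "Genesis" are purely illustrative.

# query_sketch.py (hypothetical usage example, not part of this commit)
# Assumes the local "chroma_db" folder produced by build_rag.py exists.
import torch
import chromadb
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def embed(texts):
    # Same mean pooling as build_rag.py, so query vectors match the stored ones.
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)
    token_embeddings = output[0]
    mask = encoded["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return (summed / counts).tolist()

client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_collection("bible_verses")

# The new book_name/chapter metadata enables filtered retrieval; the filter
# value "Genesis" below is illustrative only.
results = collection.query(
    query_embeddings=embed(["the creation of light"]),
    n_results=3,
    where={"book_name": "Genesis"},
)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["reference"], meta["version"], "-", doc)

Querying with precomputed embeddings (rather than query_texts) keeps the query vectors consistent with how the collection was built, since build_rag.py adds raw embeddings without registering an embedding function on the collection.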