# build_rag.py
import json
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb
import sys
from tqdm import tqdm
from huggingface_hub import HfApi, create_repo
import traceback
# --- Configuration ---
CHROMA_PATH = "chroma_db"
COLLECTION_NAME = "bible_verses"
MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet" # This can remain the same
STATUS_FILE = "build_status.log"
JSON_DIRECTORY = 'bible_json'
CHUNK_SIZE = 3
EMBEDDING_BATCH_SIZE = 16
# Map the numeric book IDs used in the source JSON to canonical book names.
BOOK_ID_TO_NAME = {
1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
11: "1 Kings", 12: "2 Kings", 13: "1 Chronicles", 14: "2 Chronicles",
15: "Ezra", 16: "Nehemiah", 17: "Esther", 18: "Job", 19: "Psalms",
20: "Proverbs", 21: "Ecclesiastes", 22: "Song of Solomon", 23: "Isaiah",
24: "Jeremiah", 25: "Lamentations", 26: "Ezekiel", 27: "Daniel", 28: "Hosea",
29: "Joel", 30: "Amos", 31: "Obadiah", 32: "Jonah", 33: "Micah", 34: "Nahum",
35: "Habakkuk", 36: "Zephaniah", 37: "Haggai", 38: "Zechariah", 39: "Malachi",
40: "Matthew", 41: "Mark", 42: "Luke", 43: "John", 44: "Acts",
45: "Romans", 46: "1 Corinthians", 47: "2 Corinthians", 48: "Galatians",
49: "Ephesians", 50: "Philippians", 51: "Colossians", 52: "1 Thessalonians",
53: "2 Thessalonians", 54: "1 Timothy", 55: "2 Timothy", 56: "Titus",
57: "Philemon", 58: "Hebrews", 59: "James", 60: "1 Peter", 61: "2 Peter",
62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
}
def update_status(message):
    """Print a progress message and persist it to STATUS_FILE."""
    print(message)
    with open(STATUS_FILE, "w") as f:
        f.write(message)
# Mean pooling: average the token embeddings, weighted by the attention mask,
# to produce one fixed-size vector per input text.
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element holds the per-token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
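
# Worked example (illustrative only, not executed by the build): with token
# embeddings [[1, 1], [3, 3], [9, 9]] for a single text and attention mask
# [1, 1, 0] (last token is padding), only the first two vectors are averaged:
#
#   toy_output = (torch.tensor([[[1.0, 1.0], [3.0, 3.0], [9.0, 9.0]]]),)
#   toy_mask = torch.tensor([[1, 1, 0]])
#   mean_pooling(toy_output, toy_mask)  # -> tensor([[2., 2.]])
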
def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
    all_verses = []
    if not os.path.exists(directory_path) or not os.listdir(directory_path):
        raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            # e.g. "bible_kjv.json" -> version name "KJV"
            version_name = os.path.splitext(filename)[0].split('_')[-1].upper()
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r') as f:
                data = json.load(f)
            rows = data.get("resultset", {}).get("row", [])
            for row in rows:
                field = row.get("field", [])
                if len(field) == 5:
                    _id, book_id, chapter, verse, text = field
                    book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
                    all_verses.append({'version': version_name, 'book_name': book_name, 'chapter': chapter, 'verse': verse, 'text': text.strip()})
    if not all_verses:
        raise ValueError("No verses were processed.")
    df = pd.DataFrame(all_verses)
    all_chunks = []
    # Group verses by version/book/chapter and merge consecutive verses into chunks of `chunk_size`.
    for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
        group = group.sort_values('verse').reset_index(drop=True)
        for i in range(0, len(group), chunk_size):
            chunk_df = group.iloc[i:i+chunk_size]
            combined_text = " ".join(chunk_df['text'])
            start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
            reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
            # Attach version, book, and chapter metadata to each chunk, not just the human-readable reference.
            all_chunks.append({
                'text': combined_text,
                'reference': reference,
                'version': version,
                'book_name': book_name,
                'chapter': chapter
            })
    return pd.DataFrame(all_chunks)
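
# The parser above expects each JSON file to contain a "resultset" -> "row" list
# whose entries carry a 5-element "field" of [id, book_id, chapter, verse, text].
# A hypothetical minimal input file (e.g. bible_json/bible_kjv.json) might look like:
#
#   {
#     "resultset": {
#       "row": [
#         {"field": [1001001, 1, 1, 1, "In the beginning God created the heaven and the earth."]},
#         {"field": [1001002, 1, 1, 2, "And the earth was without form, and void; ..."]}
#       ]
#     }
#   }
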
def main():
    update_status("IN_PROGRESS: Step 1/5 - Processing JSON files...")
    bible_chunks_df = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)

    update_status("IN_PROGRESS: Step 2/5 - Setting up local ChromaDB...")
    if os.path.exists(CHROMA_PATH):
        import shutil
        shutil.rmtree(CHROMA_PATH)  # start from a clean database on every build
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.create_collection(name=COLLECTION_NAME, metadata={"hnsw:space": "cosine"})

    update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")

    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings...")
    for i in tqdm(range(0, len(bible_chunks_df), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
        batch_df = bible_chunks_df.iloc[i:i+EMBEDDING_BATCH_SIZE]
        texts = batch_df['text'].tolist()
        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(model.device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        collection.add(
            ids=[str(j) for j in range(i, i + len(batch_df))],
            embeddings=embeddings.cpu().tolist(),
            documents=texts,
            # Persist the richer metadata fields alongside each document.
            metadatas=batch_df[['reference', 'version', 'book_name', 'chapter']].to_dict('records')
        )

    update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
    create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
    api = HfApi()
    api.upload_folder(folder_path=CHROMA_PATH, repo_id=DATASET_REPO, repo_type="dataset")
    update_status("SUCCESS: Build complete! The application is ready.")
if __name__ == "__main__":
try:
main()
except Exception as e:
error_message = traceback.format_exc()
if "401" in str(e) or "Unauthorized" in str(e):
update_status("FAILED: Hugging Face authentication error. Ensure your HF_TOKEN secret has WRITE permissions.")
else:
update_status(f"FAILED: An unexpected error occurred. Check Space logs. Error: {e}")
print(error_message, file=sys.stderr) |