File size: 6,671 Bytes
b7595c0
2e63d3b
5a84a4e
 
 
80d005a
08c0e4e
 
80d005a
08c0e4e
 
1dde6a2
5a84a4e
80d005a
08c0e4e
 
2e63d3b
b7595c0
1dde6a2
80d005a
08c0e4e
2e63d3b
9376ac0
5a84a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dde6a2
9376ac0
1dde6a2
 
 
b7595c0
4dc4b99
2e63d3b
4dc4b99
 
 
b7595c0
5a84a4e
80d005a
1dde6a2
5a84a4e
 
b7595c0
5a84a4e
08c0e4e
5a84a4e
 
 
 
 
 
08c0e4e
1dde6a2
5a84a4e
 
 
 
 
 
 
08c0e4e
 
b7595c0
 
 
 
 
 
 
 
1dde6a2
5a84a4e
1dde6a2
 
80d005a
5a84a4e
1dde6a2
08c0e4e
 
 
 
9376ac0
b7595c0
80d005a
1dde6a2
80d005a
 
5a84a4e
b7595c0
9376ac0
08c0e4e
 
9376ac0
4dc4b99
80d005a
4dc4b99
9376ac0
2e63d3b
 
08c0e4e
 
2e63d3b
08c0e4e
b7595c0
 
08c0e4e
80d005a
1dde6a2
 
 
b7595c0
1dde6a2
 
 
 
80d005a
1dde6a2
80d005a
1dde6a2
 
9376ac0
1dde6a2
9376ac0
1dde6a2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# build_rag.py 

import json
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb
import sys
from tqdm import tqdm
from huggingface_hub import HfApi, create_repo
import traceback

# --- Configuration ---
# Local directory where the persistent ChromaDB store is written (and later uploaded).
CHROMA_PATH = "chroma_db"
# Name of the ChromaDB collection holding the verse-chunk embeddings.
COLLECTION_NAME = "bible_verses"
# Hugging Face model id used for both tokenizer and embedding model.
MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
DATASET_REPO = "broadfield-dev/bible-chromadb-multi-qa-mpnet" # This can remain the same
# Single-line status file overwritten at each build step (read by the UI/Space).
STATUS_FILE = "build_status.log"
# Directory of input JSON files, one per bible version.
JSON_DIRECTORY = 'bible_json'
# Number of consecutive verses concatenated into one retrieval chunk.
CHUNK_SIZE = 3
# Number of chunks embedded per forward pass.
EMBEDDING_BATCH_SIZE = 16
# (BOOK_ID_TO_NAME dictionary remains the same)
# Maps the numeric book id used in the source JSON to a human-readable name.
BOOK_ID_TO_NAME = {
    1: "Genesis", 2: "Exodus", 3: "Leviticus", 4: "Numbers", 5: "Deuteronomy",
    6: "Joshua", 7: "Judges", 8: "Ruth", 9: "1 Samuel", 10: "2 Samuel",
    11: "1 Kings", 12: "2 Kings", 13: "1 Chronicles", 14: "2 Chronicles",
    15: "Ezra", 16: "Nehemiah", 17: "Esther", 18: "Job", 19: "Psalms",
    20: "Proverbs", 21: "Ecclesiastes", 22: "Song of Solomon", 23: "Isaiah",
    24: "Jeremiah", 25: "Lamentations", 26: "Ezekiel", 27: "Daniel", 28: "Hosea",
    29: "Joel", 30: "Amos", 31: "Obadiah", 32: "Jonah", 33: "Micah", 34: "Nahum",
    35: "Habakkuk", 36: "Zephaniah", 37: "Haggai", 38: "Zechariah", 39: "Malachi",
    40: "Matthew", 41: "Mark", 42: "Luke", 43: "John", 44: "Acts",
    45: "Romans", 46: "1 Corinthians", 47: "2 Corinthians", 48: "Galatians",
    49: "Ephesians", 50: "Philippians", 51: "Colossians", 52: "1 Thessalonians",
    53: "2 Thessalonians", 54: "1 Timothy", 55: "2 Timothy", 56: "Titus",
    57: "Philemon", 58: "Hebrews", 59: "James", 60: "1 Peter", 61: "2 Peter",
    62: "1 John", 63: "2 John", 64: "3 John", 65: "Jude", 66: "Revelation"
}

def update_status(message):
    """Echo *message* to stdout and persist it to STATUS_FILE (overwritten each call)."""
    print(message)
    with open(STATUS_FILE, "w") as status_file:
        status_file.write(message)

# Mean Pooling Function
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, weighted by the attention mask.

    model_output[0] holds the per-token embeddings; padded positions
    (mask == 0) are excluded from both the sum and the divisor.  The
    divisor is clamped to avoid division by zero for all-padding rows.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

def process_bible_json_files(directory_path: str, chunk_size: int) -> pd.DataFrame:
    """Load every bible JSON file in *directory_path* and return chunked verses.

    Each ``*.json`` file is expected to hold ``{"resultset": {"row": [{"field":
    [id, book_id, chapter, verse, text]}, ...]}}``.  Verses are grouped by
    (version, book, chapter) and concatenated into chunks of *chunk_size*
    consecutive verses.

    Returns:
        DataFrame with columns: text, reference, version, book_name, chapter.

    Raises:
        FileNotFoundError: if *directory_path* is missing or empty.
        ValueError: if no verses could be extracted from any file.
    """
    all_verses = []
    if not os.path.exists(directory_path) or not os.listdir(directory_path):
        raise FileNotFoundError(f"Directory '{directory_path}' is empty or does not exist.")
    for filename in os.listdir(directory_path):
        if not filename.endswith('.json'):
            continue
        # e.g. "bible_kjv.json" -> version "KJV"
        version_name = os.path.splitext(filename)[0].split('_')[-1].upper()
        file_path = os.path.join(directory_path, filename)
        # Fix: pin the encoding so parsing does not depend on the platform's
        # default locale encoding (open() without encoding= is platform-dependent).
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        rows = data.get("resultset", {}).get("row", [])
        for row in rows:
            field = row.get("field", [])
            # Rows with an unexpected shape are skipped (deliberate best-effort).
            if len(field) == 5:
                _id, book_id, chapter, verse, text = field
                book_name = BOOK_ID_TO_NAME.get(book_id, "Unknown Book")
                all_verses.append({'version': version_name, 'book_name': book_name, 'chapter': chapter, 'verse': verse, 'text': text.strip()})
    if not all_verses:
        raise ValueError("No verses were processed.")
    df = pd.DataFrame(all_verses)
    all_chunks = []
    for (version, book_name, chapter), group in df.groupby(['version', 'book_name', 'chapter']):
        # NOTE(review): assumes 'verse' is numeric in the source JSON; string
        # verse numbers would sort lexically ("10" < "2") — confirm upstream.
        group = group.sort_values('verse').reset_index(drop=True)
        for i in range(0, len(group), chunk_size):
            chunk_df = group.iloc[i:i+chunk_size]
            combined_text = " ".join(chunk_df['text'])
            start_verse, end_verse = chunk_df.iloc[0]['verse'], chunk_df.iloc[-1]['verse']
            # Single-verse chunks read "Book C:V"; multi-verse read "Book C:V1-V2".
            reference = f"{book_name} {chapter}:{start_verse}" if start_verse == end_verse else f"{book_name} {chapter}:{start_verse}-{end_verse}"
            all_chunks.append({
                'text': combined_text,
                'reference': reference,
                'version': version,
                'book_name': book_name,
                'chapter': chapter
            })
    return pd.DataFrame(all_chunks)

def main():
    """Run the full build: parse JSON, embed chunks, store in ChromaDB, push to the Hub."""
    update_status("IN_PROGRESS: Step 1/5 - Processing JSON files...")
    chunks = process_bible_json_files(JSON_DIRECTORY, chunk_size=CHUNK_SIZE)

    update_status("IN_PROGRESS: Step 2/5 - Setting up local ChromaDB...")
    # Start from a clean slate: any previous store is removed before rebuilding.
    if os.path.exists(CHROMA_PATH):
        import shutil
        shutil.rmtree(CHROMA_PATH)
    db_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = db_client.create_collection(name=COLLECTION_NAME, metadata={"hnsw:space": "cosine"})

    update_status(f"IN_PROGRESS: Step 3/5 - Loading embedding model '{MODEL_NAME}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")

    update_status("IN_PROGRESS: Step 4/5 - Generating embeddings...")
    for start in tqdm(range(0, len(chunks), EMBEDDING_BATCH_SIZE), desc="Embedding Chunks"):
        batch = chunks.iloc[start:start + EMBEDDING_BATCH_SIZE]
        batch_texts = batch['text'].tolist()

        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(model.device)
        with torch.no_grad():
            outputs = model(**tokens)

        vectors = mean_pooling(outputs, tokens['attention_mask'])

        # IDs are global row positions, so they stay unique across batches.
        collection.add(
            ids=[str(idx) for idx in range(start, start + len(batch))],
            embeddings=vectors.cpu().tolist(),
            documents=batch_texts,
            metadatas=batch[['reference', 'version', 'book_name', 'chapter']].to_dict('records'),
        )

    update_status(f"IN_PROGRESS: Step 5/5 - Pushing database to Hugging Face Hub '{DATASET_REPO}'...")
    create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
    HfApi().upload_folder(folder_path=CHROMA_PATH, repo_id=DATASET_REPO, repo_type="dataset")

    update_status("SUCCESS: Build complete! The application is ready.")

if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        tb_text = traceback.format_exc()
        # Authentication failures get a targeted hint; everything else is generic.
        if "401" in str(exc) or "Unauthorized" in str(exc):
            update_status("FAILED: Hugging Face authentication error. Ensure your HF_TOKEN secret has WRITE permissions.")
        else:
            update_status(f"FAILED: An unexpected error occurred. Check Space logs. Error: {exc}")
        print(tb_text, file=sys.stderr)