#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)
- ingest the document files only (full_ItemID.html files)
- it is required to save the file path in the metadata

In [83]:
import os
from tqdm import tqdm
from langchain_text_splitters import CharacterTextSplitter
from langchain.vectorstores import Chroma
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

In [None]:
# Step 1: Locate the HTML corpus.
# The directory can be overridden through the PHAPDIEN_INPUT_DIR environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset the original Windows path is used unchanged.
input_dir = os.environ.get(
    "PHAPDIEN_INPUT_DIR",
    r"D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl",
)

# NOTE(review): `model` is not referenced by any cell visible in this notebook;
# the embedding wrapper instantiated further down loads the same checkpoint a
# second time. Kept so that any cell relying on it keeps working — confirm it
# is unused and drop it to save load time and memory.
model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')

# Step 2: Clean the HTML files
def load_and_clean_html(file_path):
    """Return the plain text of the HTML file at ``file_path``.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``
    backend, and reduced to its text content via ``get_text()``.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The text extracted from the parsed document, as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as html_file:
        markup = html_file.read()
    # Parsing happens after the file handle is closed; only the markup string is needed.
    return BeautifulSoup(markup, "html.parser").get_text()

# Step 3: Process all files in the directory
# Only the "full_<ItemID>.html" documents are ingested. The names are filtered
# up front so the progress bar counts files that are actually loaded, and they
# are sorted because os.listdir() returns entries in arbitrary order, which
# would otherwise make the document order differ between runs and machines.
html_file_names = sorted(
    name
    for name in os.listdir(input_dir)
    if name.startswith("full_") and name.endswith(".html")
)

documents = []  # extracted text, one entry per ingested file
metadata = []   # {"file_path": ...} dict aligned index-for-index with `documents`
for file_name in tqdm(html_file_names, desc="Loading documents"):
    file_path = os.path.join(input_dir, file_name)
    text = load_and_clean_html(file_path)
    documents.append(text)
    metadata.append({"file_path": file_path})

print(f"Loaded {len(documents)} documents")
# Step 4: Split text into chunks
# The splitter measures length with the "cl100k_base" tiktoken encoding and only
# breaks on newlines.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    separator="\n",
    chunk_size=2000,
    chunk_overlap=20,
)

splitted_docs, splitted_metadata = [], []
for source_text, source_meta in zip(documents, metadata):
    pieces = text_splitter.split_text(source_text)
    splitted_docs.extend(pieces)
    # Every chunk carries the metadata dict of the document it was cut from.
    splitted_metadata.extend([source_meta] * len(pieces))
# Step 5: Naive text cleaning. Each chunk is broken into lines on "\n" and a
# line is kept only when it is longer than MIN_PHRASE_CHARS characters and does
# not contain JUNK_WHITESPACE. Headers, menu items, HTML leftovers and English
# warnings turn into short or whitespace-heavy lines once split on "\n", whereas
# almost all of the text that matters for retrieval is well formatted, so this
# filter drops the noise. Kept lines are stored as-is (they are not trimmed).
MIN_PHRASE_CHARS = 50
JUNK_WHITESPACE = "    "  # four consecutive spaces

# Report a count instead of printing every chunk and metadata dict: dumping the
# whole corpus into the cell output bloats the notebook and buries the results.
print(f"{len(splitted_docs)} chunks before cleaning")

processed_splitted_docs = []
processed_metadata = []
# Each kept line inherits the metadata (file path) of the chunk it came from.
for chunk, chunk_meta in zip(splitted_docs, splitted_metadata):
    for phrase in chunk.split("\n"):
        if len(phrase) > MIN_PHRASE_CHARS and JUNK_WHITESPACE not in phrase:
            processed_splitted_docs.append(phrase)
            processed_metadata.append(chunk_meta)

print(f"{len(processed_splitted_docs)} phrases kept after cleaning")

Loading documents: 100%|██████████| 5101/5101 [52:41<00:00,  1.61it/s]  


Loaded 5101 documents


Created a chunk of size 3623, which is longer than the specified 2000
Created a chunk of size 10118, which is longer than the specified 2000
Created a chunk of size 10168, which is longer than the specified 2000
Created a chunk of size 3836, which is longer than the specified 2000
Created a chunk of size 8935, which is longer than the specified 2000
Created a chunk of size 5101, which is longer than the specified 2000
Created a chunk of size 16204, which is longer than the specified 2000
Created a chunk of size 8374, which is longer than the specified 2000
Created a chunk of size 3134, which is longer than the specified 2000


In [None]:
# Wrapper with embed_documents and embed_query
class SentenceTransformerWrapper:
    """Adapter exposing a SentenceTransformer through ``embed_documents`` / ``embed_query``.

    An instance of this class is what the notebook passes as the ``embedding``
    argument when building the Chroma vector store.
    """

    def __init__(self, model_name):
        """Load the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode ``texts`` with a progress bar and return the encoder output via ``.tolist()``."""
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode the single query ``text`` and return the encoder output via ``.tolist()``."""
        vector = self.model.encode(text)
        return vector.tolist()

# Build the embedding wrapper around the BKAI Vietnamese bi-encoder checkpoint.
embedding_model = SentenceTransformerWrapper(
    model_name='bkai-foundation-models/vietnamese-bi-encoder'
)

In [None]:
# Steps 6-7: Embed and store in ChromaDB.
# There is no separate embedding step in this cell: the cleaned phrases, their
# metadata and the BKAI embedding wrapper are all handed to Chroma.from_texts in
# a single call, with "chroma_db_new" as the directory the database is saved to.
vector_db = Chroma.from_texts(
    processed_splitted_docs,
    embedding_model,
    metadatas=processed_metadata,
    persist_directory="chroma_db_new",
)
print("Database saved successfully!")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches: 100%|██████████| 7/7 [00:16<00:00,  2.36s/it]


Database saved successfully!
