{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)\n",
    "- ingest the document files only (full_ItemID.html files)\n",
    "- it is required to save the file path in the metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "from langchain_text_splitters import CharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma\n",
    "from bs4 import BeautifulSoup\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading documents: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5101/5101 [52:41<00:00,  1.61it/s]  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 5101 documents\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Created a chunk of size 3623, which is longer than the specified 2000\n",
      "Created a chunk of size 10118, which is longer than the specified 2000\n",
      "Created a chunk of size 10168, which is longer than the specified 2000\n",
      "Created a chunk of size 3836, which is longer than the specified 2000\n",
      "Created a chunk of size 8935, which is longer than the specified 2000\n",
      "Created a chunk of size 5101, which is longer than the specified 2000\n",
      "Created a chunk of size 16204, which is longer than the specified 2000\n",
      "Created a chunk of size 8374, which is longer than the specified 2000\n",
      "Created a chunk of size 3134, which is longer than the specified 2000\n"
     ]
    }
   ],
   "source": [
    "# Step 1: HTML dir\n",
    "input_dir = rf\"D:\\PhapDien_semantic_search\\BoPhapDienDienTu\\vbpl\"\n",
    "model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')\n",
    "\n",
    "# Step 2: Clean the HTML files\n",
    "def load_and_clean_html(file_path):\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        html_content = f.read()\n",
    "    soup = BeautifulSoup(html_content, \"html.parser\")\n",
    "    text = soup.get_text()  # Extract plain text from the HTML\n",
    "    return text\n",
    "\n",
    "# Step 3: Process all files in the directory\n",
    "documents = []\n",
    "metadata = []\n",
    "for file_name in tqdm(os.listdir(input_dir), desc=\"Loading documents\"):\n",
    "    if file_name.startswith(\"full_\") and file_name.endswith(\".html\"):\n",
    "        file_path = os.path.join(input_dir, file_name)\n",
    "        text = load_and_clean_html(file_path)\n",
    "        documents.append(text)\n",
    "        metadata.append({\"file_path\": file_path})\n",
    "\n",
    "print(f\"Loaded {len(documents)} documents\")\n",
    "# Step 4: Split text into chunks\n",
    "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
    "    encoding_name=\"cl100k_base\", chunk_size=2000, chunk_overlap=20, separator=\"\\n\"\n",
    ")\n",
    "splitted_docs = []\n",
    "splitted_metadata = []\n",
    "\n",
    "for doc, meta in zip(documents, metadata):\n",
    "    chunks = text_splitter.split_text(doc)\n",
    "    for chunk in chunks:\n",
    "        splitted_docs.append(chunk)\n",
    "        splitted_metadata.append(meta)\n",
    "# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.\n",
    "# Notice that headers , menu text items, html tags, warnings in English contain a lot of \n",
    "# whitespaces when splitted with \\n. Thus, I removed those instances since almost all of\n",
    "# the information for retrieval is conveniently formatted well.\n",
    "print(splitted_docs)\n",
    "print(splitted_metadata)\n",
    "processed_splitted_docs = []\n",
    "processed_metadata = []\n",
    "for i, doc in enumerate(splitted_docs):\n",
    "    processed = doc.split(\"\\n\")\n",
    "    for phrase in processed:\n",
    "        if len(phrase) > 50 and \"    \" not in phrase:\n",
    "            processed_splitted_docs.append(phrase)\n",
    "            processed_metadata.append(splitted_metadata[i])"
   ]
  },
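  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (added for illustration, not part of the original steps): confirm that the cleaned chunks and their metadata are still aligned one-to-one, so every chunk keeps the source file path required by the task."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The two lists must stay aligned: chunk i was cut from the file described by metadata i\n",
    "assert len(processed_splitted_docs) == len(processed_metadata)\n",
    "print(f\"{len(processed_splitted_docs)} chunks after cleaning\")\n",
    "\n",
    "# Spot-check one chunk: its metadata should contain the 'file_path' key\n",
    "print(processed_metadata[0])\n",
    "print(processed_splitted_docs[0][:200])"
   ]
  },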
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrapper with embed_documents and embed_query\n",
    "class SentenceTransformerWrapper:\n",
    "    def __init__(self, model_name):\n",
    "        self.model = SentenceTransformer(model_name)\n",
    "        \n",
    "    def embed_documents(self, texts):\n",
    "        # Convert the list of texts to embeddings\n",
    "        return self.model.encode(texts, show_progress_bar=True).tolist()\n",
    "    \n",
    "    def embed_query(self, text):\n",
    "        # Convert a single query to its embedding\n",
    "        return self.model.encode(text).tolist()\n",
    "\n",
    "# Instantiate wrapper with model\n",
    "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
   ]
  },
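  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick check of the wrapper (added for illustration): embed a single placeholder query and confirm that it comes back as a plain Python list of floats, which is the form Chroma expects from an embedding function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholder query; any short Vietnamese string works with this model\n",
    "sample_vector = embedding_model.embed_query(\"văn bản pháp luật\")\n",
    "print(type(sample_vector), len(sample_vector))  # a list whose length is the model's embedding dimension"
   ]
  },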
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches:   0%|          | 0/7 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7/7 [00:16<00:00,  2.36s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Database saved successfully!\n"
     ]
    }
   ],
   "source": [
    "# Step 6: Generate embeddings using BKAI model\n",
    "\n",
    "# Step 7: Save the vectors to ChromaDB\n",
    "vector_db = Chroma.from_texts(\n",
    "    texts=processed_splitted_docs,\n",
    "    embedding=embedding_model,\n",
    "    metadatas=processed_metadata,\n",
    "    persist_directory=\"chroma_db_new\"  # Directory where the database will be saved\n",
    ")\n",
    "\n",
    "print(\"Database saved successfully!\")\n"
   ]
  }
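  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal retrieval sketch (added for illustration, not part of the original task): reload the persisted database with the same embedding wrapper and run a similarity search. The query string is only a placeholder; each returned document should carry its source file path in the metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the persisted collection from disk using the same embedding wrapper\n",
    "reloaded_db = Chroma(\n",
    "    persist_directory=\"chroma_db_new\",\n",
    "    embedding_function=embedding_model\n",
    ")\n",
    "\n",
    "# Placeholder query (\"regulations on labour contracts\"); retrieve the 3 nearest chunks\n",
    "results = reloaded_db.similarity_search(\"quy định về hợp đồng lao động\", k=3)\n",
    "for doc in results:\n",
    "    print(doc.metadata[\"file_path\"])\n",
    "    print(doc.page_content[:200])\n",
    "    print(\"---\")"
   ]
  }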
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "phapdienvv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}