{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)\n",
    "- ingest the document files only (full_ItemID.html files)\n",
    "- it is required to save the file path in the metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "from langchain_text_splitters import CharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma\n",
    "from bs4 import BeautifulSoup\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading documents: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5101/5101 [52:41<00:00,  1.61it/s]  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 5101 documents\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Created a chunk of size 3623, which is longer than the specified 2000\n",
      "Created a chunk of size 10118, which is longer than the specified 2000\n",
      "Created a chunk of size 10168, which is longer than the specified 2000\n",
      "Created a chunk of size 3836, which is longer than the specified 2000\n",
      "Created a chunk of size 8935, which is longer than the specified 2000\n",
      "Created a chunk of size 5101, which is longer than the specified 2000\n",
      "Created a chunk of size 16204, which is longer than the specified 2000\n",
      "Created a chunk of size 8374, which is longer than the specified 2000\n",
      "Created a chunk of size 3134, which is longer than the specified 2000\n"
     ]
    }
   ],
   "source": [
    "# Step 1: HTML dir\n",
    "input_dir = rf\"D:\\PhapDien_semantic_search\\BoPhapDienDienTu\\vbpl\"\n",
    "model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')\n",
    "\n",
    "# Step 2: Clean the HTML files\n",
    "def load_and_clean_html(file_path):\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        html_content = f.read()\n",
    "    soup = BeautifulSoup(html_content, \"html.parser\")\n",
    "    text = soup.get_text()  # Extract plain text from the HTML\n",
    "    return text\n",
    "\n",
    "# Step 3: Process all files in the directory\n",
    "documents = []\n",
    "metadata = []\n",
    "for file_name in tqdm(os.listdir(input_dir), desc=\"Loading documents\"):\n",
    "    if file_name.startswith(\"full_\") and file_name.endswith(\".html\"):\n",
    "        file_path = os.path.join(input_dir, file_name)\n",
    "        text = load_and_clean_html(file_path)\n",
    "        documents.append(text)\n",
    "        metadata.append({\"file_path\": file_path})\n",
    "\n",
    "print(f\"Loaded {len(documents)} documents\")\n",
    "# Step 4: Split text into chunks\n",
    "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
    "    encoding_name=\"cl100k_base\", chunk_size=2000, chunk_overlap=20, separator=\"\\n\"\n",
    ")\n",
    "splitted_docs = []\n",
    "splitted_metadata = []\n",
    "\n",
    "for doc, meta in zip(documents, metadata):\n",
    "    chunks = text_splitter.split_text(doc)\n",
    "    for chunk in chunks:\n",
    "        splitted_docs.append(chunk)\n",
    "        splitted_metadata.append(meta)\n",
    "# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.\n",
    "# Notice that headers , menu text items, html tags, warnings in English contain a lot of \n",
    "# whitespaces when splitted with \\n. Thus, I removed those instances since almost all of\n",
    "# the information for retrieval is conveniently formatted well.\n",
    "print(splitted_docs)\n",
    "print(splitted_metadata)\n",
    "processed_splitted_docs = []\n",
    "processed_metadata = []\n",
    "for i, doc in enumerate(splitted_docs):\n",
    "    processed = doc.split(\"\\n\")\n",
    "    for phrase in processed:\n",
    "        if len(phrase) > 50 and \"    \" not in phrase:\n",
    "            processed_splitted_docs.append(phrase)\n",
    "            processed_metadata.append(splitted_metadata[i])"
   ]
  },
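  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (added for illustration, not part of the original steps): confirm that the cleaned chunks and their metadata are still aligned one-to-one, so every chunk keeps the source file path required by the task."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The two lists must stay aligned: chunk i was cut from the file described by metadata i\n",
    "assert len(processed_splitted_docs) == len(processed_metadata)\n",
    "print(f\"{len(processed_splitted_docs)} chunks after cleaning\")\n",
    "\n",
    "# Spot-check one chunk: its metadata should contain the 'file_path' key\n",
    "print(processed_metadata[0])\n",
    "print(processed_splitted_docs[0][:200])"
   ]
  },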
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrapper with embed_documents and embed_query\n",
    "class SentenceTransformerWrapper:\n",
    "    def __init__(self, model_name):\n",
    "        self.model = SentenceTransformer(model_name)\n",
    "        \n",
    "    def embed_documents(self, texts):\n",
    "        # Convert the list of texts to embeddings\n",
    "        return self.model.encode(texts, show_progress_bar=True).tolist()\n",
    "    \n",
    "    def embed_query(self, text):\n",
    "        # Convert a single query to its embedding\n",
    "        return self.model.encode(text).tolist()\n",
    "\n",
    "# Instantiate wrapper with model\n",
    "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
   ]
  },
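  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick check of the wrapper (added for illustration): embed a single placeholder query and confirm that it comes back as a plain Python list of floats, which is the form Chroma expects from an embedding function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholder query; any short Vietnamese string works with this model\n",
    "sample_vector = embedding_model.embed_query(\"văn bản pháp luật\")\n",
    "print(type(sample_vector), len(sample_vector))  # a list whose length is the model's embedding dimension"
   ]
  },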
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches:   0%|          | 0/7 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 7/7 [00:16<00:00,  2.36s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Database saved successfully!\n"
     ]
    }
   ],
   "source": [
    "# Step 6: Generate embeddings using BKAI model\n",
    "\n",
    "# Step 7: Save the vectors to ChromaDB\n",
    "vector_db = Chroma.from_texts(\n",
    "    texts=processed_splitted_docs,\n",
    "    embedding=embedding_model,\n",
    "    metadatas=processed_metadata,\n",
    "    persist_directory=\"chroma_db_new\"  # Directory where the database will be saved\n",
    ")\n",
    "\n",
    "print(\"Database saved successfully!\")\n"
   ]
  }
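  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal retrieval sketch (added for illustration, not part of the original task): reload the persisted database with the same embedding wrapper and run a similarity search. The query string is only a placeholder; each returned document should carry its source file path in the metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the persisted collection from disk using the same embedding wrapper\n",
    "reloaded_db = Chroma(\n",
    "    persist_directory=\"chroma_db_new\",\n",
    "    embedding_function=embedding_model\n",
    ")\n",
    "\n",
    "# Placeholder query (\"regulations on labour contracts\"); retrieve the 3 nearest chunks\n",
    "results = reloaded_db.similarity_search(\"quy định về hợp đồng lao động\", k=3)\n",
    "for doc in results:\n",
    "    print(doc.metadata[\"file_path\"])\n",
    "    print(doc.page_content[:200])\n",
    "    print(\"---\")"
   ]
  }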
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "phapdienvv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}