{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)\n",
"- ingest the document files only (full_ItemID.html files)\n",
"- it is required to save the file path in the metadata"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from tqdm import tqdm\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"from langchain.vectorstores import Chroma\n",
"from bs4 import BeautifulSoup\n",
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading documents: 100%|ββββββββββ| 5101/5101 [52:41<00:00, 1.61it/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 5101 documents\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Created a chunk of size 3623, which is longer than the specified 2000\n",
"Created a chunk of size 10118, which is longer than the specified 2000\n",
"Created a chunk of size 10168, which is longer than the specified 2000\n",
"Created a chunk of size 3836, which is longer than the specified 2000\n",
"Created a chunk of size 8935, which is longer than the specified 2000\n",
"Created a chunk of size 5101, which is longer than the specified 2000\n",
"Created a chunk of size 16204, which is longer than the specified 2000\n",
"Created a chunk of size 8374, which is longer than the specified 2000\n",
"Created a chunk of size 3134, which is longer than the specified 2000\n"
]
}
],
"source": [
"# Step 1: HTML dir\n",
"input_dir = rf\"D:\\PhapDien_semantic_search\\BoPhapDienDienTu\\vbpl\"\n",
"model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')\n",
"\n",
"# Step 2: Clean the HTML files\n",
"def load_and_clean_html(file_path):\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
" html_content = f.read()\n",
" soup = BeautifulSoup(html_content, \"html.parser\")\n",
" text = soup.get_text() # Extract plain text from the HTML\n",
" return text\n",
"\n",
"# Step 3: Process all files in the directory\n",
"documents = []\n",
"metadata = []\n",
"for file_name in tqdm(os.listdir(input_dir), desc=\"Loading documents\"):\n",
" if file_name.startswith(\"full_\") and file_name.endswith(\".html\"):\n",
" file_path = os.path.join(input_dir, file_name)\n",
" text = load_and_clean_html(file_path)\n",
" documents.append(text)\n",
" metadata.append({\"file_path\": file_path})\n",
"\n",
"print(f\"Loaded {len(documents)} documents\")\n",
"# Step 4: Split text into chunks\n",
"text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
" encoding_name=\"cl100k_base\", chunk_size=2000, chunk_overlap=20, separator=\"\\n\"\n",
")\n",
"splitted_docs = []\n",
"splitted_metadata = []\n",
"\n",
"for doc, meta in zip(documents, metadata):\n",
" chunks = text_splitter.split_text(doc)\n",
" for chunk in chunks:\n",
" splitted_docs.append(chunk)\n",
" splitted_metadata.append(meta)\n",
"# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.\n",
"# Notice that headers , menu text items, html tags, warnings in English contain a lot of \n",
"# whitespaces when splitted with \\n. Thus, I removed those instances since almost all of\n",
"# the information for retrieval is conveniently formatted well.\n",
"print(splitted_docs)\n",
"print(splitted_metadata)\n",
"processed_splitted_docs = []\n",
"processed_metadata = []\n",
"for i, doc in enumerate(splitted_docs):\n",
" processed = doc.split(\"\\n\")\n",
" for phrase in processed:\n",
" if len(phrase) > 50 and \" \" not in phrase:\n",
" processed_splitted_docs.append(phrase)\n",
" processed_metadata.append(splitted_metadata[i])"
]
},
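{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (a small sketch, not part of the original pipeline): inspect one cleaned chunk and its metadata to confirm that the source file path survived splitting and cleaning. It assumes the cell above has been run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch: relies on processed_splitted_docs / processed_metadata from the cell above\n",
"if processed_splitted_docs:\n",
"    print(f\"Kept {len(processed_splitted_docs)} chunks after cleaning\")\n",
"    print(\"Sample chunk:\", processed_splitted_docs[0][:200])\n",
"    print(\"Sample metadata:\", processed_metadata[0])\n",
"else:\n",
"    print(\"No chunks survived the cleaning step\")"
]
},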
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wrapper with embed_documents and embed_query\n",
"class SentenceTransformerWrapper:\n",
" def __init__(self, model_name):\n",
" self.model = SentenceTransformer(model_name)\n",
" \n",
" def embed_documents(self, texts):\n",
" # Convert the list of texts to embeddings\n",
" return self.model.encode(texts, show_progress_bar=True).tolist()\n",
" \n",
" def embed_query(self, text):\n",
" # Convert a single query to its embedding\n",
" return self.model.encode(text).tolist()\n",
"\n",
"# Instantiate wrapper with model\n",
"embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
]
},
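{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick usage sketch for the wrapper (assumes the cell above has been run): embed a single query and check the vector length, which should match the bi-encoder's embedding dimension. The query string below is only an illustrative example, not taken from the dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch: embed one example query with the wrapper defined above\n",
"sample_query = \"quy định về thuế thu nhập cá nhân\"  # illustrative query, not from the dataset\n",
"query_vector = embedding_model.embed_query(sample_query)\n",
"print(f\"Query embedding has {len(query_vector)} dimensions\")"
]
},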
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|ββββββββββ| 7/7 [00:16<00:00, 2.36s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Database saved successfully!\n"
]
}
],
"source": [
"# Step 6: Generate embeddings using BKAI model\n",
"\n",
"# Step 7: Save the vectors to ChromaDB\n",
"vector_db = Chroma.from_texts(\n",
" texts=processed_splitted_docs,\n",
" embedding=embedding_model,\n",
" metadatas=processed_metadata,\n",
" persist_directory=\"chroma_db_new\" # Directory where the database will be saved\n",
")\n",
"\n",
"print(\"Database saved successfully!\")\n"
]
}
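,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Retrieval check (a sketch, assuming the database was persisted by the cell above): reload the ChromaDB store from `chroma_db_new` and run a similarity search. Each returned chunk should expose its source file path through the metadata, as required. The query string is only an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reload the persisted store and query it (sketch; the query text is illustrative)\n",
"reloaded_db = Chroma(\n",
"    persist_directory=\"chroma_db_new\",\n",
"    embedding_function=embedding_model,\n",
")\n",
"results = reloaded_db.similarity_search(\"quy định về đất đai\", k=3)\n",
"for doc in results:\n",
"    print(doc.metadata[\"file_path\"])\n",
"    print(doc.page_content[:100], \"\\n\")"
]
}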
],
"metadata": {
"kernelspec": {
"display_name": "phapdienvv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}