import os
from dotenv import load_dotenv
import shutil
import numpy as np
from pathlib import Path
import zipfile
from tqdm import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from accelerate import Accelerator
from datasets import Dataset
from huggingface_hub import HfApi
# Load environment variables from the .env file
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN not found in the .env file")
# Wrapper that exposes a LangChain-compatible embedding interface
class SentenceTransformerWrapper:
    def __init__(self, model_name, batch_size=32):
        self.batch_size = batch_size
        self.accelerator = Accelerator()  # Create an accelerator instance
        self.model = SentenceTransformer(model_name)
        # Move the model to the appropriate device
        self.model.to(self.accelerator.device)

    def embed_documents(self, texts):
        # Create a DataLoader for the texts
        dataloader = DataLoader(texts, batch_size=self.batch_size)
        all_embeddings = []
        # Optionally, prepare the DataLoader with the accelerator if needed
        dataloader = self.accelerator.prepare(dataloader)
        for batch in tqdm(dataloader, desc="Embedding documents"):
            # SentenceTransformer.encode already supports batching
            batch_embeddings = self.model.encode(batch, show_progress_bar=False)
            all_embeddings.append(batch_embeddings)
        embeddings = np.concatenate(all_embeddings, axis=0)
        return embeddings.tolist()

    def embed_query(self, text):
        return self.model.encode(text).tolist()
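
# Illustrative sketch (not called by the pipeline): how the wrapper above can be
# exercised on a couple of sample strings. The sample Vietnamese sentences are
# made up for demonstration; the model name is the same bi-encoder used in main.
def _example_embedding_wrapper():
    embedder = SentenceTransformerWrapper("bkai-foundation-models/vietnamese-bi-encoder", batch_size=8)
    vectors = embedder.embed_documents(["Điều 1. Phạm vi điều chỉnh", "Điều 2. Đối tượng áp dụng"])
    query_vector = embedder.embed_query("phạm vi điều chỉnh của nghị định")
    # Each embedding is a plain Python list of floats with the same dimensionality
    print(len(vectors), len(vectors[0]), len(query_vector))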
# Step 1: Unzip the HTML files
def extract_documents(zip_path, extract_dir):
    extract_dir = Path(extract_dir)
    extract_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Documents extracted to {extract_dir}")
    return extract_dir
# Step 2: Load an HTML file and strip its markup
def load_and_clean_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()
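
# Illustrative sketch (not called by the pipeline): what load_and_clean_html does
# to a small inline HTML snippet, without touching the filesystem. The snippet is
# made up for demonstration.
def _example_clean_html_snippet():
    snippet = "<html><body><h1>Nghị định</h1><p>Điều 1. Phạm vi điều chỉnh</p></body></html>"
    text = BeautifulSoup(snippet, "html.parser").get_text()
    print(text)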
# Step 3: Process files in the directory and extract plain text
def process_html_files(directory, file_pattern="full_*.html"):
    directory = Path(directory)
    documents, metadata = [], []
    html_files = list(directory.glob(file_pattern))
    for file_path in tqdm(html_files, desc="Loading and cleaning documents"):
        text = load_and_clean_html(file_path)
        documents.append(text)
        metadata.append({"file_path": str(file_path)})
    print(f"Loaded {len(documents)} documents")
    return documents, metadata
# Step 4: Split text into chunks
def split_documents(documents, metadata, chunk_size=2000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[". ", "\n", "; ", "\t"],
    )
    splitted_docs, splitted_metadata = [], []
    for doc, meta in tqdm(zip(documents, metadata), desc="Splitting documents", total=len(documents)):
        chunks = text_splitter.split_text(doc)
        splitted_docs.extend(chunks)
        splitted_metadata.extend([meta] * len(chunks))
    return splitted_docs, splitted_metadata
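
# Illustrative sketch (not called by the pipeline): splitting a single made-up
# document with the tiktoken-based splitter defined in split_documents, to show
# the shape of the output (parallel lists of chunks and metadata).
def _example_split_single_document():
    sample_doc = "Điều 1. Phạm vi điều chỉnh. " * 200  # repeated filler text
    chunks, metas = split_documents([sample_doc], [{"file_path": "sample.html"}], chunk_size=500, chunk_overlap=10)
    print(f"{len(chunks)} chunks, first chunk has {len(chunks[0])} characters")
    print(metas[0])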
# Step 5: Clean the chunks
def clean_chunks(splitted_docs, splitted_metadata, min_length=50):
    cleaned_docs, cleaned_metadata = [], []
    for doc, meta in tqdm(zip(splitted_docs, splitted_metadata), desc="Cleaning text", total=len(splitted_docs)):
        phrases = doc.split("\n")
        for phrase in phrases:
            # Keep phrases that are long enough and do not still carry
            # non-breaking spaces left over from the HTML layout
            if len(phrase) > min_length and "\xa0" not in phrase:
                cleaned_docs.append(phrase)
                cleaned_metadata.append(meta)
    print(f"Cleaned {len(cleaned_docs)} text chunks.")
    return cleaned_docs, cleaned_metadata
# Step 6: Save to ChromaDB
def save_to_chromadb(
    processed_docs,
    processed_metadata,
    embedding_model,
    persist_directory="./chroma_db",
    batch_size=1024,
):
    """
    Save documents to a Chroma vectorstore in batches.

    processed_docs: List of text chunks.
    processed_metadata: Corresponding metadata list.
    embedding_model: An embedding model with an embed_documents method.
    persist_directory: Where the vectorstore will be saved.
    batch_size: Number of documents to process per batch.
    """
    vector_db = Chroma(
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    )
    # Process documents in batches; Chroma calls embedding_model.embed_documents
    # internally for each batch it receives.
    for i in tqdm(range(0, len(processed_docs), batch_size), desc="Embedding and saving batches"):
        batch_texts = processed_docs[i : i + batch_size]
        batch_metadata = processed_metadata[i : i + batch_size]
        vector_db.add_texts(texts=batch_texts, metadatas=batch_metadata)
    # langchain_chroma persists to persist_directory automatically.
    print(f"Database saved successfully to {persist_directory}")
    return vector_db
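
# Illustrative sketch (not called by the pipeline): re-opening the persisted
# Chroma database and running a similarity search against it. The query string
# is made up; the wrapper's embed_query method supplies the query embedding.
def _example_query_chromadb(persist_directory="./chroma_db"):
    embedder = SentenceTransformerWrapper("bkai-foundation-models/vietnamese-bi-encoder")
    vector_db = Chroma(embedding_function=embedder, persist_directory=persist_directory)
    results = vector_db.similarity_search("thủ tục đăng ký kinh doanh", k=3)
    for doc in results:
        print(doc.metadata.get("file_path"), doc.page_content[:100])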
# Main script
if __name__ == "__main__":
    # Configuration
    zip_path = "./documents.zip"
    extract_dir = "./vbpl"
    model_name = "bkai-foundation-models/vietnamese-bi-encoder"
    # Step 1: Extract files
    extract_dir = extract_documents(zip_path, extract_dir)
    # Step 2: Initialize embedding model
    embedding_model = SentenceTransformerWrapper(model_name, batch_size=32)
    # Step 3: Process files
    documents, metadata = process_html_files(extract_dir)
    # Step 4: Split text into chunks
    splitted_docs, splitted_metadata = split_documents(documents, metadata)
    # Step 5: Clean the text chunks
    processed_docs, processed_metadata = clean_chunks(splitted_docs, splitted_metadata)
    # Step 6: Generate embeddings and save to ChromaDB
    save_to_chromadb(processed_docs, processed_metadata, embedding_model)
    # Archive the persisted database and upload it to the Hugging Face Hub
    shutil.make_archive("chroma_db", "zip", "./chroma_db")
    print("Vector database archived as chroma_db.zip")
    api = HfApi()
    repo_id = "camiellia/phapdien_demo"
    api.upload_file(
        path_or_fileobj="chroma_db.zip",
        path_in_repo="chroma_db.zip",
        repo_id=repo_id,
        repo_type="dataset",
        token=hf_token,
    )
    print("Uploaded chroma_db.zip to Hugging Face Hub")