import os
from dotenv import load_dotenv
import shutil
import numpy as np
from pathlib import Path
import zipfile
from tqdm import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from accelerate import Accelerator
from datasets import Dataset
from huggingface_hub import HfApi

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN not found; set it in the environment or in a .env file")

# Wrapper exposing the LangChain embedding interface (embed_documents/embed_query)
# around a SentenceTransformer model.
class SentenceTransformerWrapper:
    def __init__(self, model_name, batch_size=32):
        self.batch_size = batch_size
        self.accelerator = Accelerator()  # Create an accelerator instance
        self.model = SentenceTransformer(model_name)
        # Move the model to the appropriate device
        self.model.to(self.accelerator.device)

    def embed_documents(self, texts):
        # Create a DataLoader for the texts
        dataloader = DataLoader(texts, batch_size=self.batch_size)
        all_embeddings = []
        # Let accelerate shard the DataLoader across the available devices/processes.
        dataloader = self.accelerator.prepare(dataloader)

        for batch in tqdm(dataloader, desc="Embedding documents"):
            # Encode one batch at a time; encode() returns a numpy array per batch.
            batch_embeddings = self.model.encode(batch, show_progress_bar=False)
            all_embeddings.append(batch_embeddings)
        embeddings = np.concatenate(all_embeddings, axis=0)
        return embeddings.tolist()

    def embed_query(self, text):
        return self.model.encode(text).tolist()
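
# Example (hypothetical, not executed): the two methods above are all LangChain
# needs from an embedding object, so the wrapper can be passed anywhere an
# Embeddings instance is expected. The sample strings below are made up.
#
#   embedder = SentenceTransformerWrapper("bkai-foundation-models/vietnamese-bi-encoder")
#   vectors = embedder.embed_documents(["văn bản thứ nhất", "văn bản thứ hai"])
#   query_vec = embedder.embed_query("tra cứu pháp điển")
#   assert len(vectors) == 2 and len(vectors[0]) == len(query_vec)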
    
# Step 1: Unzip the HTML files
def extract_documents(zip_path, extract_dir):
    extract_dir = Path(extract_dir)
    extract_dir.mkdir(parents=True, exist_ok=True)
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    
    print(f"Documents extracted to {extract_dir}")
    return extract_dir

# Step 2: Clean the HTML files
def load_and_clean_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()


# Step 3: Process files in directory and extract plain text
def process_html_files(directory, file_pattern="full_*.html"):
    directory = Path(directory)
    documents, metadata = [], []
    
    html_files = list(directory.glob(file_pattern))
    for file_path in tqdm(html_files, desc="Loading and cleaning documents"):
        text = load_and_clean_html(file_path)
        documents.append(text)
        metadata.append({"file_path": str(file_path)})
    
    print(f"Loaded {len(documents)} documents")
    return documents, metadata


# Step 4: Split text into chunks
def split_documents(documents, metadata, chunk_size=2000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[". ", "\n", "; ", "\t"]
    )
    
    splitted_docs, splitted_metadata = [], []
    for doc, meta in tqdm(zip(documents, metadata), desc="Splitting documents", total=len(documents)):
        chunks = text_splitter.split_text(doc)
        splitted_docs.extend(chunks)
        splitted_metadata.extend([meta] * len(chunks))
    
    return splitted_docs, splitted_metadata


# Step 5: Clean the chunks
def clean_chunks(splitted_docs, splitted_metadata, min_length=50):
    cleaned_docs, cleaned_metadata = [], []
    
    for doc, meta in tqdm(zip(splitted_docs, splitted_metadata), desc="Cleaning text", total=len(splitted_docs)):
        phrases = doc.split("\n")
        for phrase in phrases:
            # Keep only reasonably long lines and drop lines containing runs of
            # spaces, which are usually table/layout residue from the HTML.
            if len(phrase) > min_length and "    " not in phrase:
                cleaned_docs.append(phrase)
                cleaned_metadata.append(meta)
    
    print(f"Cleaned {len(cleaned_docs)} text chunks.")
    return cleaned_docs, cleaned_metadata


# Step 6: Save to ChromaDB
def save_to_chromadb(
    processed_docs, 
    processed_metadata, 
    embedding_model, 
    persist_directory="./chroma_db", 
    batch_size=1024
):
    """
    Save documents to a Chroma vectorstore in batches.
    
    processed_docs: List of text chunks.
    processed_metadata: Corresponding metadata list.
    embedding_model: An embedding object exposing embed_documents/embed_query.
    persist_directory: Where the vectorstore will be saved.
    batch_size: Number of documents to embed and add per batch.
    """

    # The embedding function is bound to the store once; add_texts reuses it
    # for every batch, so no embeddings have to be passed in explicitly.
    vector_db = Chroma(
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    )

    # Embed and add the documents in batches to keep memory usage bounded.
    for i in tqdm(range(0, len(processed_docs), batch_size), desc="Embedding and saving batches"):
        batch_texts = processed_docs[i : i + batch_size]
        batch_metadata = processed_metadata[i : i + batch_size]
        vector_db.add_texts(texts=batch_texts, metadatas=batch_metadata)

    # langchain_chroma persists automatically when persist_directory is set.
    print(f"Database saved successfully to {persist_directory}")

    return vector_db
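
# Example (hypothetical, not executed): once the script has run, the persisted
# store can be reloaded and queried with the same embedding wrapper. The query
# string below is made up.
#
#   db = Chroma(
#       embedding_function=SentenceTransformerWrapper("bkai-foundation-models/vietnamese-bi-encoder"),
#       persist_directory="./chroma_db",
#   )
#   hits = db.similarity_search("quy định về hợp đồng lao động", k=5)
#   for doc in hits:
#       print(doc.metadata["file_path"], doc.page_content[:100])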


# Main script
if __name__ == "__main__":
    # Configuration
    zip_path = "./documents.zip"
    extract_dir = "./vbpl"
    model_name = "bkai-foundation-models/vietnamese-bi-encoder"
    
    # Step 1: Extract files
    extract_dir = extract_documents(zip_path, extract_dir)
    
    # Step 2: Initialize embedding model
    embedding_model = SentenceTransformerWrapper(model_name, batch_size=32)
    
    # Step 3: Process files
    documents, metadata = process_html_files(extract_dir)
    
    # Step 4: Split text into chunks
    splitted_docs, splitted_metadata = split_documents(documents, metadata)
    
    # Step 5: Clean the text chunks
    processed_docs, processed_metadata = clean_chunks(splitted_docs, splitted_metadata)
    
    # Step 6: Generate embeddings and save to ChromaDB
    save_to_chromadb(processed_docs, processed_metadata, embedding_model)

    shutil.make_archive("chroma_db", "zip", "./chroma_db")
    print("Vector database archived as chroma_db.zip")

    api = HfApi()
    repo_id = "camiellia/phapdien_demo"
    api.upload_file(
        path_or_fileobj="chroma_db.zip",
        path_in_repo="chroma_db.zip",
        repo_id=repo_id,
        repo_type="dataset",
        token=hf_token,
    )
    print("Uploaded chroma_db.zip to Hugging Face Hub")