Spaces:
Build error
Build error
Update hf_to_chroma_ds.py
Browse files
- hf_to_chroma_ds.py +4 -1
hf_to_chroma_ds.py
CHANGED
@@ -120,13 +120,15 @@ def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_f
  120 |
  121 |     # Retrieve the mapped data
  122 |     mapped_data = dataset.to_chroma()
- 123 |
  124 |     # Split the data into batches and add them to the collection
  125 |     def chunk_data(data, size):
  126 |         """Helper function to split data into batches."""
  127 |         for i in range(0, len(data), size):
  128 |             yield data[i:i+size]
  129 |
  130 |     ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
  131 |     metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
  132 |     documents_batches = list(chunk_data(mapped_data["documents"], batch_size))
@@ -134,6 +136,7 @@ def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_f
  134 |
  135 |     total_docs = len(mapped_data["ids"])
  136 |
  137 |     for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
  138 |         collection.add(
  139 |             ids=ids,
|
|
|
  120 |
  121 |     # Retrieve the mapped data
  122 |     mapped_data = dataset.to_chroma()
+ 123 |     del dataset
+ 124 |
  125 |     # Split the data into batches and add them to the collection
  126 |     def chunk_data(data, size):
  127 |         """Helper function to split data into batches."""
  128 |         for i in range(0, len(data), size):
  129 |             yield data[i:i+size]
  130 |
+ 131 |     print("########### Chunking ###########")
  132 |     ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
  133 |     metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
  134 |     documents_batches = list(chunk_data(mapped_data["documents"], batch_size))

  136 |
  137 |     total_docs = len(mapped_data["ids"])
  138 |
+ 139 |     print("########### Iterating batches ###########")
  140 |     for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
  141 |         collection.add(
  142 |             ids=ids,