Richard Guo committed
Commit a2483b1 · Parent(s): de5bc26

use sample instead of head

build_map.py: +16 -14
build_map.py
CHANGED
@@ -18,6 +18,7 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
     sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
     features = dataset.features
 
+    indexable_field = None
     numeric_fields = []
     string_fields = []
     bool_fields = []
@@ -71,6 +72,16 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
 
         else:
             uncategorized_fields.append(field)
+
+    longest_length = 0
+    for field in string_fields:
+        length = 0
+        for i in range(len(sample)):
+            if sample[field][i]:
+                length += len(str(sample[field][i]).split())
+        if length > longest_length:
+            longest_length = length
+            indexable_field = field
 
     return features, \
         numeric_fields, \
@@ -80,7 +91,8 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
         label_fields, \
         categorical_fields, \
         datetime_fields, \
-        uncategorized_fields
+        uncategorized_fields, \
+        indexable_field
 
 
 def load_dataset_and_metadata(dataset_name,
@@ -139,21 +151,11 @@ def upload_dataset_to_atlas(dataset_dict,
         label_fields, \
         categorical_fields, \
         datetime_fields, \
-        uncategorized_fields
-
+        uncategorized_fields, \
+        indexable_field = get_datum_fields(dataset_dict)
 
-    # return longest string field from 5 samples
-    head = dataset_dict["head"]
     if indexed_field is None:
-
-        for field in string_fields:
-            length = 0
-            for i in range(len(head)):
-                if head[field][i]:
-                    length += len(str(head[field][i]).split())
-            if length > longest_length:
-                longest_length = length
-                indexed_field = field
+        indexed_field = indexable_field
 
     topic_label_field = None
     if modality == "embedding":
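For reference, a minimal sketch of the field-selection heuristic this commit moves into get_datum_fields, run against a small hand-built pandas DataFrame rather than the real shuffled dataset sample. The column names and data below are hypothetical; only the word-count selection logic mirrors the new code.

import pandas as pd

# Hypothetical stand-in for the shuffled n_samples slice built at the top of
# get_datum_fields; real data would come from dataset.shuffle(...).take(...).
sample = pd.DataFrame({
    "title": ["short", "tiny", None],
    "body": ["a much longer passage of text", "another fairly long passage", "more words here"],
})
string_fields = ["title", "body"]

indexable_field = None
longest_length = 0
for field in string_fields:
    length = 0
    for i in range(len(sample)):
        if sample[field][i]:                                   # skip empty / missing values
            length += len(str(sample[field][i]).split())       # count words, not characters
    if length > longest_length:
        longest_length = length
        indexable_field = field

print(indexable_field)  # "body": the string field with the most words across the sample

upload_dataset_to_atlas then falls back to this indexable_field whenever the caller does not pass an indexed_field explicitly.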