push json file to the dataset using a pr

This commit replaces the flatten-and-concatenate ingestion in src/services/huggingface.py with a direct upload of the submitted JSON report, opened as a pull request on the dataset repo, and pins DATASET_NAME to the open_data_boamps dataset.

Files changed:
- src/services/huggingface.py  +33 -221
- src/services/util.py  +1 -1
src/services/huggingface.py
CHANGED
@@ -1,7 +1,9 @@
-from huggingface_hub import login
-from datasets import load_dataset, Dataset, concatenate_datasets
+from huggingface_hub import HfApi, login
 import json
 from src.services.util import HF_TOKEN, DATASET_NAME
+import tempfile
+import os
+import json
 
 
 def init_huggingface():
@@ -14,6 +16,7 @@ def init_huggingface():
 
 def update_dataset(json_data):
     """Update the Hugging Face dataset with new data."""
+
     if json_data is None or json_data.startswith("The following fields are required"):
        return json_data or "No data to submit. Please fill in all required fields."
 
@@ -23,222 +26,31 @@ def update_dataset(json_data):
        return "Invalid JSON data. Please ensure all required fields are filled correctly."
 
    try:
- ... (old lines 26-53 are illegible in the rendered diff and are not reproduced here)
-    algorithms_data = {field: "| ".join(str(algo.get(
-        field)) for algo in algorithms if algo.get(field)) or "" for field in fields}
-    trainingType_str = algorithms_data["trainingType"]
-    algorithmType_str = algorithms_data["algorithmType"]
-    algorithmName_str = algorithms_data["algorithmName"]
-    algorithmUri_str = algorithms_data["algorithmUri"]
-    foundationModelName_str = algorithms_data["foundationModelName"]
-    foundationModelUri_str = algorithms_data["foundationModelUri"]
-    parametersNumber_str = algorithms_data["parametersNumber"]
-    framework_str = algorithms_data["framework"]
-    frameworkVersion_str = algorithms_data["frameworkVersion"]
-    classPath_str = algorithms_data["classPath"]
-    layersNumber_str = algorithms_data["layersNumber"]
-    epochsNumber_str = algorithms_data["epochsNumber"]
-    optimizer_str = algorithms_data["optimizer"]
-    quantization_str = algorithms_data["quantization"]
-
-    """Create a flattened data structure for the dataset."""
-    # Handle dataset
-    dataset = data.get("task", {}).get("dataset", [])
-    fields = ["dataUsage", "dataType", "dataFormat", "dataSize",
-              "dataQuantity", "shape", "source", "sourceUri", "owner"]
-    """Create a flattened data structure for the dataset."""
-    dataset_data = {field: "| ".join(
-        str(d.get(field)) for d in dataset if d.get(field)) or "" for field in fields}
-    dataUsage_str = dataset_data["dataUsage"]
-    dataType_str = dataset_data["dataType"]
-    dataFormat_str = dataset_data["dataFormat"]
-    dataSize_str = dataset_data["dataSize"]
-    dataQuantity_str = dataset_data["dataQuantity"]
-    shape_str = dataset_data["shape"]
-    source_str = dataset_data["source"]
-    sourceUri_str = dataset_data["sourceUri"]
-    owner_str = dataset_data["owner"]
-
-    """Create a flattened data structure for the measures."""
-    # Handle measures
-    measures = data.get("measures", [])
-    fields = ["measurementMethod", "manufacturer", "version", "cpuTrackingMode", "gpuTrackingMode", "averageUtilizationCpu", "averageUtilizationGpu",
-              "powerCalibrationMeasurement", "durationCalibrationMeasurement", "powerConsumption", "measurementDuration", "measurementDateTime"]
-    """Create a flattened data structure for the measures."""
-    measures_data = {field: "| ".join(str(measure.get(
-        field)) for measure in measures if measure.get(field)) or "" for field in fields}
-    measurementMethod_str = measures_data["measurementMethod"]
-    manufacturer_str = measures_data["manufacturer"]
-    version_str = measures_data["version"]
-    cpuTrackingMode_str = measures_data["cpuTrackingMode"]
-    gpuTrackingMode_str = measures_data["gpuTrackingMode"]
-    averageUtilizationCpu_str = measures_data["averageUtilizationCpu"]
-    averageUtilizationGpu_str = measures_data["averageUtilizationGpu"]
-    powerCalibrationMeasurement_str = measures_data["powerCalibrationMeasurement"]
-    durationCalibrationMeasurement_str = measures_data["durationCalibrationMeasurement"]
-    powerConsumption_str = measures_data["powerConsumption"]
-    measurementDuration_str = measures_data["measurementDuration"]
-    measurementDateTime_str = measures_data["measurementDateTime"]
-
-    # Handle components
-    components = data.get("infrastructure", {}).get("components", [])
-    fields = ["componentName", "componentType", "nbComponent", "memorySize",
-              "manufacturer", "family", "series", "share"]
-
-    # Generate concatenated strings for each field
-    component_data = {field: "| ".join(str(comp.get(
-        field)) for comp in components if comp.get(field)) or "" for field in fields}
-
-    componentName_str = component_data["componentName"]
-    componentType_str = component_data["componentType"]
-    nbComponent_str = component_data["nbComponent"]
-    memorySize_str = component_data["memorySize"]
-    manufacturer_infra_str = component_data["manufacturer"]
-    family_str = component_data["family"]
-    series_str = component_data["series"]
-    share_str = component_data["share"]
-
-    return {
-        # Header
-        "licensing": [data.get("header", {}).get("licensing", "")],
-        "formatVersion": [data.get("header", {}).get("formatVersion", "")],
-        "formatVersionSpecificationUri": [data.get("header", {}).get("formatVersionSpecificationUri", "")],
-        "reportId": [data.get("header", {}).get("reportId", "")],
-        "reportDatetime": [data.get("header", {}).get("reportDatetime", "")],
-        "reportStatus": [data.get("header", {}).get("reportStatus", "")],
-        "publisher_name": [data.get("header", {}).get("publisher", {}).get("name", "")],
-        "publisher_division": [data.get("header", {}).get("publisher", {}).get("division", "")],
-        "publisher_projectName": [data.get("header", {}).get("publisher", {}).get("projectName", "")],
-        "publisher_confidentialityLevel": [data.get("header", {}).get("publisher", {}).get("confidentialityLevel", "")],
-        "publisher_publicKey": [data.get("header", {}).get("publisher", {}).get("publicKey", "")],
-
-        # Task
-        "taskStage": [data.get("task", {}).get("taskStage", "")],
-        "taskFamily": [data.get("task", {}).get("taskFamily", "")],
-        "nbRequest": [data.get("task", {}).get("nbRequest", "")],
-        # Algorithms
-        "trainingType": [trainingType_str],
-        "algorithmType": [algorithmType_str],
-        "algorithmName": [algorithmName_str],
-        "algorithmUri": [algorithmUri_str],
-        "foundationModelName": [foundationModelName_str],
-        "foundationModelUri": [foundationModelUri_str],
-        "parametersNumber": [parametersNumber_str],
-        "framework": [framework_str],
-        "frameworkVersion": [frameworkVersion_str],
-        "classPath": [classPath_str],
-        "layersNumber": [layersNumber_str],
-        "epochsNumber": [epochsNumber_str],
-        "optimizer": [optimizer_str],
-        "quantization": [quantization_str],
-        # Dataset
-        "dataUsage": [dataUsage_str],
-        "dataType": [dataType_str],
-        "dataFormat": [dataFormat_str],
-        "dataSize": [dataSize_str],
-        "dataQuantity": [dataQuantity_str],
-        "shape": [shape_str],
-        "source": [source_str],
-        "sourceUri": [sourceUri_str],
-        "owner": [owner_str],
-        "measuredAccuracy": [data.get("task", {}).get("measuredAccuracy", "")],
-        "estimatedAccuracy": [data.get("task", {}).get("estimatedAccuracy", "")],
-        "taskDescription": [data.get("task", {}).get("taskDescription", "")],
-
-        # Measures
-        "measurementMethod": [measurementMethod_str],
-        "manufacturer": [manufacturer_str],
-        "version": [version_str],
-        "cpuTrackingMode": [cpuTrackingMode_str],
-        "gpuTrackingMode": [gpuTrackingMode_str],
-        "averageUtilizationCpu": [averageUtilizationCpu_str],
-        "averageUtilizationGpu": [averageUtilizationGpu_str],
-        "powerCalibrationMeasurement": [powerCalibrationMeasurement_str],
-        "durationCalibrationMeasurement": [durationCalibrationMeasurement_str],
-        "powerConsumption": [powerConsumption_str],
-        "measurementDuration": [measurementDuration_str],
-        "measurementDateTime": [measurementDateTime_str],
-
-        # System
-        "os": [data.get("system", {}).get("os", "")],
-        "distribution": [data.get("system", {}).get("distribution", "")],
-        "distributionVersion": [data.get("system", {}).get("distributionVersion", "")],
-
-        # Software
-        "language": [data.get("software", {}).get("language", "")],
-        "version_software": [data.get("software", {}).get("version_software", "")],
-
-        # Infrastructure
-        "infraType": [data.get("infrastructure", {}).get("infra_type", "")],
-        "cloudProvider": [data.get("infrastructure", {}).get("cloudProvider", "")],
-        "cloudInstance": [data.get("infrastructure", {}).get("cloudInstance", "")],
-        "cloudService": [data.get("infrastructure", {}).get("cloudService", "")],
-        "componentName": [componentName_str],
-        "componentType": [componentType_str],
-        "nbComponent": [nbComponent_str],
-        "memorySize": [memorySize_str],
-        "manufacturer_infra": [manufacturer_infra_str],
-        "family": [family_str],
-        "series": [series_str],
-        "share": [share_str],
-
-        # Environment
-        "country": [data.get("environment", {}).get("country", "")],
-        "latitude": [data.get("environment", {}).get("latitude", "")],
-        "longitude": [data.get("environment", {}).get("longitude", "")],
-        "location": [data.get("environment", {}).get("location", "")],
-        "powerSupplierType": [data.get("environment", {}).get("powerSupplierType", "")],
-        "powerSource": [data.get("environment", {}).get("powerSource", "")],
-        "powerSourceCarbonIntensity": [data.get("environment", {}).get("powerSourceCarbonIntensity", "")],
-
-        # Quality
-        "quality": [data.get("quality", "")],
-    }
-
-
-"""
-def create_flattened_data(data):
-    out = {}
-
-    def flatten(x, name=''):
-        if type(x) is dict:
-            for a in x:
-                flatten(x[a], name + a + '_')
-        elif type(x) is list:
-            i = 0
-            for a in x:
-                flatten(a, name + str(i) + '_')
-                i += 1
-        else:
-            out[name[:-1]] = x
-
-    flatten(data)
-    return out
-"""
+        # Initialize Hugging Face authentication
+        init_huggingface()
+        api = HfApi()
+        # Write JSON to a temporary file
+        # If json_data is a string, first convert it to a Python dict
+        json_dic = json.loads(json_data)
+        json_f = json.dumps(json_dic, indent=2, ensure_ascii=False)
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as tmp:
+            tmp.write(json_f)  # json_data must be a string
+            tmp_path = tmp.name
+
+        # Generate a unique filename for the repo
+        filename = os.path.basename(tmp_path)
+
+        # Push the file to hub with a pull request
+        api.upload_file(
+            path_or_fileobj=tmp_path,
+            repo_id=DATASET_NAME,
+            path_in_repo=f"data/{filename}",
+            repo_type="dataset",
+            commit_message=f"Add new BoAmps report data - {filename}",
+            create_pr=True,
+        )
+        os.unlink(tmp_path)  # Clean up
+
+    except Exception as e:
+        return f"Error updating dataset: {str(e)}"
+    return "Data submitted successfully and dataset updated! Consult the data here: https://huggingface.co/datasets/boavizta/open_data_boamps"
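For context, a minimal sketch of how the new update_dataset entry point might be exercised end to end; the report fields below are illustrative placeholders rather than the full BoAmps schema, and HF_TOKEN must be set in the environment with write access to the dataset:

    import json

    from src.services.huggingface import update_dataset

    # Illustrative payload; real reports carry the full BoAmps structure.
    sample_report = {
        "header": {"reportId": "demo-0001", "reportStatus": "draft"},
        "task": {"taskFamily": "textGeneration"},
    }

    # update_dataset expects a JSON string (it calls json.loads internally)
    # and returns a human-readable status message in both success and error cases.
    print(update_dataset(json.dumps(sample_report)))
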
src/services/util.py
CHANGED
@@ -2,7 +2,7 @@ import os
 
 # Hugging Face Configuration
 HF_TOKEN = os.environ.get("HF_TOKEN")
-DATASET_NAME = "boavizta/
+DATASET_NAME = "boavizta/open_data_boamps"
 
 # Form Field Configurations
 # not used and verified for now