soury committed on
Commit 43a2b78 · Parent: 58f078c

Push JSON file to the dataset using a PR

Files changed (2)
  1. src/services/huggingface.py +33 -221
  2. src/services/util.py +1 -1
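
This commit replaces the row-flattening flow (load_dataset, concatenate_datasets, push_to_hub) with a simpler one: the submitted report is written to a temporary JSON file and uploaded to the dataset repo as a pull request. A minimal sketch of how the reworked update_dataset would be exercised; the payload below is an abbreviated, hypothetical BoAmps report, not taken from the repo:

import json
from src.services.huggingface import update_dataset

# Abbreviated, hypothetical BoAmps report; real reports carry many more fields.
sample_report = {
    "header": {"reportId": "demo-001", "reportStatus": "draft"},
    "task": {"taskFamily": "textGeneration"},
}

# update_dataset() expects a JSON string, not a dict.
print(update_dataset(json.dumps(sample_report)))
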
src/services/huggingface.py CHANGED
@@ -1,7 +1,9 @@
-from huggingface_hub import login
-from datasets import load_dataset, Dataset, concatenate_datasets
+from huggingface_hub import HfApi, login
 import json
 from src.services.util import HF_TOKEN, DATASET_NAME
+import tempfile
+import os
+import json
 
 
 def init_huggingface():
@@ -14,6 +16,7 @@ def init_huggingface():
 
 
 def update_dataset(json_data):
     """Update the Hugging Face dataset with new data."""
+
     if json_data is None or json_data.startswith("The following fields are required"):
         return json_data or "No data to submit. Please fill in all required fields."
@@ -23,222 +26,31 @@ def update_dataset(json_data):
         return "Invalid JSON data. Please ensure all required fields are filled correctly."
 
     try:
-        dataset = load_dataset(DATASET_NAME, split="train")
-        print(dataset)
-    except:
-        dataset = Dataset.from_dict({})
-
-    new_data = create_flattened_data(data)
-    new_dataset = Dataset.from_dict(new_data)
-
-    if len(dataset) > 0:
-        print("dataset intitial")
-        print(dataset)
-        print("data to add ")
-        print(new_dataset)
-        updated_dataset = concatenate_datasets([dataset, new_dataset])
-    else:
-        updated_dataset = new_dataset
-
-    updated_dataset.push_to_hub(DATASET_NAME)
-    return "Data submitted successfully and dataset updated! Consult the data [here](https://huggingface.co/datasets/boavizta/BoAmps_data)"
-
-
-def create_flattened_data(data):
-    """Create a flattened data structure for the algorithms."""
-    # Handle algorithms
-    algorithms = data.get("task", {}).get("algorithms", [])
-    fields = ["trainingType", "algorithmType", "algorithmName", "algorithmUri", "foundationModelName", "foundationModelUri",
-              "parametersNumber", "framework", "frameworkVersion", "classPath", "layersNumber", "epochsNumber", "optimizer", "quantization"]
-    """Create a flattened data structure for the algorithms."""
-    algorithms_data = {field: "| ".join(str(algo.get(
-        field)) for algo in algorithms if algo.get(field)) or "" for field in fields}
-    trainingType_str = algorithms_data["trainingType"]
-    algorithmType_str = algorithms_data["algorithmType"]
-    algorithmName_str = algorithms_data["algorithmName"]
-    algorithmUri_str = algorithms_data["algorithmUri"]
-    foundationModelName_str = algorithms_data["foundationModelName"]
-    foundationModelUri_str = algorithms_data["foundationModelUri"]
-    parametersNumber_str = algorithms_data["parametersNumber"]
-    framework_str = algorithms_data["framework"]
-    frameworkVersion_str = algorithms_data["frameworkVersion"]
-    classPath_str = algorithms_data["classPath"]
-    layersNumber_str = algorithms_data["layersNumber"]
-    epochsNumber_str = algorithms_data["epochsNumber"]
-    optimizer_str = algorithms_data["optimizer"]
-    quantization_str = algorithms_data["quantization"]
-
-    """Create a flattened data structure for the dataset."""
-    # Handle dataset
-    dataset = data.get("task", {}).get("dataset", [])
-    fields = ["dataUsage", "dataType", "dataFormat", "dataSize",
-              "dataQuantity", "shape", "source", "sourceUri", "owner"]
-    """Create a flattened data structure for the dataset."""
-    dataset_data = {field: "| ".join(
-        str(d.get(field)) for d in dataset if d.get(field)) or "" for field in fields}
-    dataUsage_str = dataset_data["dataUsage"]
-    dataType_str = dataset_data["dataType"]
-    dataFormat_str = dataset_data["dataFormat"]
-    dataSize_str = dataset_data["dataSize"]
-    dataQuantity_str = dataset_data["dataQuantity"]
-    shape_str = dataset_data["shape"]
-    source_str = dataset_data["source"]
-    sourceUri_str = dataset_data["sourceUri"]
-    owner_str = dataset_data["owner"]
-
-    """Create a flattened data structure for the measures."""
-    # Handle measures
-    measures = data.get("measures", [])
-    fields = ["measurementMethod", "manufacturer", "version", "cpuTrackingMode", "gpuTrackingMode", "averageUtilizationCpu", "averageUtilizationGpu",
-              "powerCalibrationMeasurement", "durationCalibrationMeasurement", "powerConsumption", "measurementDuration", "measurementDateTime"]
-    """Create a flattened data structure for the measures."""
-    measures_data = {field: "| ".join(str(measure.get(
-        field)) for measure in measures if measure.get(field)) or "" for field in fields}
-    measurementMethod_str = measures_data["measurementMethod"]
-    manufacturer_str = measures_data["manufacturer"]
-    version_str = measures_data["version"]
-    cpuTrackingMode_str = measures_data["cpuTrackingMode"]
-    gpuTrackingMode_str = measures_data["gpuTrackingMode"]
-    averageUtilizationCpu_str = measures_data["averageUtilizationCpu"]
-    averageUtilizationGpu_str = measures_data["averageUtilizationGpu"]
-    powerCalibrationMeasurement_str = measures_data["powerCalibrationMeasurement"]
-    durationCalibrationMeasurement_str = measures_data["durationCalibrationMeasurement"]
-    powerConsumption_str = measures_data["powerConsumption"]
-    measurementDuration_str = measures_data["measurementDuration"]
-    measurementDateTime_str = measures_data["measurementDateTime"]
-
-    # Handle components
-    components = data.get("infrastructure", {}).get("components", [])
-    fields = ["componentName", "componentType", "nbComponent", "memorySize",
-              "manufacturer", "family", "series", "share"]
-
-    # Generate concatenated strings for each field
-    component_data = {field: "| ".join(str(comp.get(
-        field)) for comp in components if comp.get(field)) or "" for field in fields}
-
-    componentName_str = component_data["componentName"]
-    componentType_str = component_data["componentType"]
-    nbComponent_str = component_data["nbComponent"]
-    memorySize_str = component_data["memorySize"]
-    manufacturer_infra_str = component_data["manufacturer"]
-    family_str = component_data["family"]
-    series_str = component_data["series"]
-    share_str = component_data["share"]
-
-    return {
-        # Header
-        "licensing": [data.get("header", {}).get("licensing", "")],
-        "formatVersion": [data.get("header", {}).get("formatVersion", "")],
-        "formatVersionSpecificationUri": [data.get("header", {}).get("formatVersionSpecificationUri", "")],
-        "reportId": [data.get("header", {}).get("reportId", "")],
-        "reportDatetime": [data.get("header", {}).get("reportDatetime", "")],
-        "reportStatus": [data.get("header", {}).get("reportStatus", "")],
-        "publisher_name": [data.get("header", {}).get("publisher", {}).get("name", "")],
-        "publisher_division": [data.get("header", {}).get("publisher", {}).get("division", "")],
-        "publisher_projectName": [data.get("header", {}).get("publisher", {}).get("projectName", "")],
-        "publisher_confidentialityLevel": [data.get("header", {}).get("publisher", {}).get("confidentialityLevel", "")],
-        "publisher_publicKey": [data.get("header", {}).get("publisher", {}).get("publicKey", "")],
-
-        # Task
-        "taskStage": [data.get("task", {}).get("taskStage", "")],
-        "taskFamily": [data.get("task", {}).get("taskFamily", "")],
-        "nbRequest": [data.get("task", {}).get("nbRequest", "")],
-        # Algorithms
-        "trainingType": [trainingType_str],
-        "algorithmType": [algorithmType_str],
-        "algorithmName": [algorithmName_str],
-        "algorithmUri": [algorithmUri_str],
-        "foundationModelName": [foundationModelName_str],
-        "foundationModelUri": [foundationModelUri_str],
-        "parametersNumber": [parametersNumber_str],
-        "framework": [framework_str],
-        "frameworkVersion": [frameworkVersion_str],
-        "classPath": [classPath_str],
-        "layersNumber": [layersNumber_str],
-        "epochsNumber": [epochsNumber_str],
-        "optimizer": [optimizer_str],
-        "quantization": [quantization_str],
-        # Dataset
-        "dataUsage": [dataUsage_str],
-        "dataType": [dataType_str],
-        "dataFormat": [dataFormat_str],
-        "dataSize": [dataSize_str],
-        "dataQuantity": [dataQuantity_str],
-        "shape": [shape_str],
-        "source": [source_str],
-        "sourceUri": [sourceUri_str],
-        "owner": [owner_str],
-        "measuredAccuracy": [data.get("task", {}).get("measuredAccuracy", "")],
-        "estimatedAccuracy": [data.get("task", {}).get("estimatedAccuracy", "")],
-        "taskDescription": [data.get("task", {}).get("taskDescription", "")],
-
-        # Measures
-        "measurementMethod": [measurementMethod_str],
-        "manufacturer": [manufacturer_str],
-        "version": [version_str],
-        "cpuTrackingMode": [cpuTrackingMode_str],
-        "gpuTrackingMode": [gpuTrackingMode_str],
-        "averageUtilizationCpu": [averageUtilizationCpu_str],
-        "averageUtilizationGpu": [averageUtilizationGpu_str],
-        "powerCalibrationMeasurement": [powerCalibrationMeasurement_str],
-        "durationCalibrationMeasurement": [durationCalibrationMeasurement_str],
-        "powerConsumption": [powerConsumption_str],
-        "measurementDuration": [measurementDuration_str],
-        "measurementDateTime": [measurementDateTime_str],
-
-        # System
-        "os": [data.get("system", {}).get("os", "")],
-        "distribution": [data.get("system", {}).get("distribution", "")],
-        "distributionVersion": [data.get("system", {}).get("distributionVersion", "")],
-
-        # Software
-        "language": [data.get("software", {}).get("language", "")],
-        "version_software": [data.get("software", {}).get("version_software", "")],
-
-        # Infrastructure
-        "infraType": [data.get("infrastructure", {}).get("infra_type", "")],
-        "cloudProvider": [data.get("infrastructure", {}).get("cloudProvider", "")],
-        "cloudInstance": [data.get("infrastructure", {}).get("cloudInstance", "")],
-        "cloudService": [data.get("infrastructure", {}).get("cloudService", "")],
-        "componentName": [componentName_str],
-        "componentType": [componentType_str],
-        "nbComponent": [nbComponent_str],
-        "memorySize": [memorySize_str],
-        "manufacturer_infra": [manufacturer_infra_str],
-        "family": [family_str],
-        "series": [series_str],
-        "share": [share_str],
-
-        # Environment
-        "country": [data.get("environment", {}).get("country", "")],
-        "latitude": [data.get("environment", {}).get("latitude", "")],
-        "longitude": [data.get("environment", {}).get("longitude", "")],
-        "location": [data.get("environment", {}).get("location", "")],
-        "powerSupplierType": [data.get("environment", {}).get("powerSupplierType", "")],
-        "powerSource": [data.get("environment", {}).get("powerSource", "")],
-        "powerSourceCarbonIntensity": [data.get("environment", {}).get("powerSourceCarbonIntensity", "")],
-
-        # Quality
-        "quality": [data.get("quality", "")],
-    }
-
-
-"""
-def create_flattened_data(data):
-    out = {}
-
-    def flatten(x, name=''):
-        if type(x) is dict:
-            for a in x:
-                flatten(x[a], name + a + '_')
-        elif type(x) is list:
-            i = 0
-            for a in x:
-                flatten(a, name + str(i) + '_')
-                i += 1
-        else:
-            out[name[:-1]] = x
-
-    flatten(data)
-    return out
-"""
+        # Initialize Hugging Face authentication
+        init_huggingface()
+        api = HfApi()
+        # Write JSON to a temporary file
+        # If json_data is a string, first convert it to a Python dict
+        json_dic = json.loads(json_data)
+        json_f = json.dumps(json_dic, indent=2, ensure_ascii=False)
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as tmp:
+            tmp.write(json_f)  # json_f is the serialized JSON string
+            tmp_path = tmp.name
+
+        # Generate a unique filename for the repo
+        filename = os.path.basename(tmp_path)
+
+        # Push the file to the Hub with a pull request
+        api.upload_file(
+            path_or_fileobj=tmp_path,
+            repo_id=DATASET_NAME,
+            path_in_repo=f"data/{filename}",
+            repo_type="dataset",
+            commit_message=f"Add new BoAmps report data - {filename}",
+            create_pr=True,
+        )
+        os.unlink(tmp_path)  # Clean up
+
+    except Exception as e:
+        return f"Error updating dataset: {str(e)}"
+    return "Data submitted successfully and dataset updated! Consult the data here: https://huggingface.co/datasets/boavizta/open_data_boamps"
src/services/util.py CHANGED
@@ -2,7 +2,7 @@ import os
 
 # Hugging Face Configuration
 HF_TOKEN = os.environ.get("HF_TOKEN")
-DATASET_NAME = "boavizta/BoAmps_data"
+DATASET_NAME = "boavizta/open_data_boamps"
 
 # Form Field Configurations
 # not used and verified for now
  # not used and verified for now