JoaquinVanschoren committed on
Commit
6ec1943
·
1 Parent(s): 71ddcd2

better reporting of generation issues

Browse files
__pycache__/validation.cpython-312.pyc CHANGED
Binary files a/__pycache__/validation.cpython-312.pyc and b/__pycache__/validation.cpython-312.pyc differ
 
app.py CHANGED
@@ -9,35 +9,30 @@ def process_file(file):
9
  results = []
10
  json_data = None
11
 
12
- # Use just the filename instead of full path
13
  filename = file.name.split("/")[-1]
14
-
15
  # Check 1: JSON validation
16
  json_valid, json_message, json_data = validate_json(file.name)
17
- # Remove empty checkmarks from messages
18
  json_message = json_message.replace("\n✓\n", "\n")
19
- results.append(("JSON Format Validation", json_valid, json_message))
20
-
21
  if not json_valid:
22
  return results, None
23
-
24
  # Check 2: Croissant validation
25
- croissant_valid, croissant_message = validate_croissant(json_data)
26
- # Remove empty checkmarks from messages
27
  croissant_message = croissant_message.replace("\n✓\n", "\n")
28
- results.append(("Croissant Schema Validation", croissant_valid, croissant_message))
29
-
30
  if not croissant_valid:
31
  return results, None
32
-
33
- # Check 3: Records validation
34
- records_valid, records_message = validate_records(json_data)
35
- # Remove empty checkmarks from messages
36
  records_message = records_message.replace("\n✓\n", "\n")
37
- results.append(("Records Generation Test", records_valid, records_message))
38
-
39
 
40
- # Generate detailed report with just filename
41
  report = generate_validation_report(filename, json_data, results)
42
 
43
  return results, report
@@ -160,6 +155,10 @@ def create_ui():
160
  .status-error {
161
  background-color: #f44336 !important;
162
  }
 
 
 
 
163
 
164
  .step-details {
165
  padding: 12px 15px;
@@ -365,9 +364,9 @@ def create_ui():
365
  None
366
  ]
367
 
368
- records_valid, records_message = validate_records(json_data)
369
- results.append(("Records Generation Test", records_valid, records_message))
370
-
371
  # Generate report
372
  report = generate_validation_report(url.split("/")[-1], json_data, results)
373
  report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
@@ -411,17 +410,32 @@ def create_ui():
411
  None,
412
  None
413
  ]
414
-
415
  def build_results_html(results):
416
- # Build validation results HTML
417
  html = '<div class="validation-results">'
418
-
419
- for i, (test_name, passed, message) in enumerate(results):
420
- status_class = "status-success" if passed else "status-error"
421
- status_icon = "✓" if passed else "✗"
422
- # Add emoji to message
423
- message_with_emoji = ("✅ " if passed else "❌ ") + message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
 
 
425
  html += f'''
426
  <div class="validation-step" id="step-{i}">
427
  <div class="step-header" onclick="
@@ -445,10 +459,10 @@ def create_ui():
445
  </div>
446
  </div>
447
  '''
448
-
449
  html += '</div>'
450
  return gr.update(value=html, visible=True)
451
-
452
  def on_validate(file):
453
  if file is None:
454
  return [
 
9
  results = []
10
  json_data = None
11
 
 
12
  filename = file.name.split("/")[-1]
13
+
14
  # Check 1: JSON validation
15
  json_valid, json_message, json_data = validate_json(file.name)
 
16
  json_message = json_message.replace("\n✓\n", "\n")
17
+ results.append(("JSON Format Validation", json_valid, json_message, "pass" if json_valid else "error"))
18
+
19
  if not json_valid:
20
  return results, None
21
+
22
  # Check 2: Croissant validation
23
+ croissant_valid, croissant_message = validate_croissant(json_data)
 
24
  croissant_message = croissant_message.replace("\n✓\n", "\n")
25
+ results.append(("Croissant Schema Validation", croissant_valid, croissant_message, "pass" if croissant_valid else "error"))
26
+
27
  if not croissant_valid:
28
  return results, None
29
+
30
+ # Check 3: Records validation (with timeout-safe and error-specific logic)
31
+ records_valid, records_message, records_status = validate_records(json_data)
 
32
  records_message = records_message.replace("\n✓\n", "\n")
33
+ results.append(("Records Generation Test", records_valid, records_message, records_status))
 
34
 
35
+ # Generate final report
36
  report = generate_validation_report(filename, json_data, results)
37
 
38
  return results, report
 
155
  .status-error {
156
  background-color: #f44336 !important;
157
  }
158
+
159
+ .status-warning {
160
+ background-color: #ff9800 !important; /* Amber for warnings */
161
+ }
162
 
163
  .step-details {
164
  padding: 12px 15px;
 
364
  None
365
  ]
366
 
367
+ records_valid, records_message, records_status = validate_records(json_data)
368
+ results.append(("Records Generation Test (Optional)", records_valid, records_message, records_status))
369
+
370
  # Generate report
371
  report = generate_validation_report(url.split("/")[-1], json_data, results)
372
  report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
 
410
  None,
411
  None
412
  ]
413
+
414
  def build_results_html(results):
 
415
  html = '<div class="validation-results">'
416
+
417
+ for i, result in enumerate(results):
418
+ if len(result) == 4:
419
+ test_name, passed, message, status = result
420
+ else:
421
+ test_name, passed, message = result
422
+ status = "pass" if passed else "error"
423
+
424
+ if status == "pass":
425
+ status_class = "status-success"
426
+ status_icon = "✓"
427
+ message_with_emoji = "✅ " + message
428
+ elif status == "warning":
429
+ status_class = "status-warning"
430
+ status_icon = "?"
431
+ message_with_emoji = "⚠️ Could not automatically generate records. This is oftentimes not an issue (e.g. datasets could be too large or too complex), and it's not required to pass this test to submit to NeurIPS.\n\n" + message
432
+ else: # error
433
+ status_class = "status-error"
434
+ status_icon = "✗"
435
+ message_with_emoji = "❌ " + message
436
 
437
+ message_with_emoji = message_with_emoji.replace("\n", "<br>")
438
+
439
  html += f'''
440
  <div class="validation-step" id="step-{i}">
441
  <div class="step-header" onclick="
 
459
  </div>
460
  </div>
461
  '''
462
+
463
  html += '</div>'
464
  return gr.update(value=html, visible=True)
465
+
466
  def on_validate(file):
467
  if file is None:
468
  return [
apt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ git-lfs
report_croissant-validation_Student Performance on an Entrance Examination.md ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CROISSANT VALIDATION REPORT
2
+ ================================================================================
3
+ ## VALIDATION RESULTS
4
+ --------------------------------------------------------------------------------
5
+ Starting validation for file: download
6
+ ### JSON Format Validation
7
+
8
+ The URL returned valid JSON.
9
+ ### Croissant Schema Validation
10
+
11
+ The dataset passes Croissant validation.
12
+ ### Records Generation Test
13
+
14
+ Record set 'Student_Performance_on_an_Entrance_Examination.csv' passed validation.
15
+ ## JSON-LD REFERENCE
16
+ ================================================================================
17
+ ```json
18
+ {
19
+ "@context": {
20
+ "@language": "en",
21
+ "@vocab": "https://schema.org/",
22
+ "citeAs": "cr:citeAs",
23
+ "column": "cr:column",
24
+ "conformsTo": "dct:conformsTo",
25
+ "cr": "http://mlcommons.org/croissant/",
26
+ "data": {
27
+ "@id": "cr:data",
28
+ "@type": "@json"
29
+ },
30
+ "dataBiases": "cr:dataBiases",
31
+ "dataCollection": "cr:dataCollection",
32
+ "dataType": {
33
+ "@id": "cr:dataType",
34
+ "@type": "@vocab"
35
+ },
36
+ "dct": "http://purl.org/dc/terms/",
37
+ "extract": "cr:extract",
38
+ "field": "cr:field",
39
+ "fileProperty": "cr:fileProperty",
40
+ "fileObject": "cr:fileObject",
41
+ "fileSet": "cr:fileSet",
42
+ "format": "cr:format",
43
+ "includes": "cr:includes",
44
+ "isEnumeration": "cr:isEnumeration",
45
+ "isLiveDataset": "cr:isLiveDataset",
46
+ "jsonPath": "cr:jsonPath",
47
+ "key": "cr:key",
48
+ "md5": "cr:md5",
49
+ "parentField": "cr:parentField",
50
+ "path": "cr:path",
51
+ "personalSensitiveInformation": "cr:personalSensitiveInformation",
52
+ "recordSet": "cr:recordSet",
53
+ "references": "cr:references",
54
+ "regex": "cr:regex",
55
+ "repeated": "cr:repeated",
56
+ "replace": "cr:replace",
57
+ "sc": "https://schema.org/",
58
+ "separator": "cr:separator",
59
+ "source": "cr:source",
60
+ "subField": "cr:subField",
61
+ "transform": "cr:transform",
62
+ "wd": "https://www.wikidata.org/wiki/",
63
+ "@base": "cr_base_iri/"
64
+ },
65
+ "alternateName": " Examining Demographic, Academic, and Socioeconomic Factors",
66
+ "conformsTo": "http://mlcommons.org/croissant/1.0",
67
+ "license": {
68
+ "@type": "sc:CreativeWork",
69
+ "name": "Other (specified in description)"
70
+ },
71
+ "distribution": [
72
+ {
73
+ "contentUrl": "https://www.kaggle.com/api/v1/datasets/download/adilshamim8/student-performance-on-an-entrance-examination?datasetVersionNumber=1",
74
+ "contentSize": "4.299 KB",
75
+ "md5": "c8RSY3Vq8U4A+IMWxNtpMQ==",
76
+ "encodingFormat": "application/zip",
77
+ "@id": "archive.zip",
78
+ "@type": "cr:FileObject",
79
+ "name": "archive.zip",
80
+ "description": "Archive containing all the contents of the Student Performance on an Entrance Examination dataset"
81
+ },
82
+ {
83
+ "contentUrl": "Student_Performance_on_an_Entrance_Examination.csv",
84
+ "containedIn": {
85
+ "@id": "archive.zip"
86
+ },
87
+ "encodingFormat": "text/csv",
88
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject",
89
+ "@type": "cr:FileObject",
90
+ "name": "Student_Performance_on_an_Entrance_Examination.csv",
91
+ "description": "- **Gender** \n *Description:* Indicates the candidate\u2019s gender (e.g., Male, Female). This field helps in analyzing performance trends and demographic differences based on gender.\n\n- **Caste** \n *Description:* Specifies the caste category of the candidate (such as General, OBC, SC, ST, etc.). This information can be used to explore socio-cultural factors and their influence on academic performance.\n\n- **coaching** \n *Description:* Denotes whether the candidate attended any coaching classes prior to the examination. It typically categorizes candidates into those who attended coaching within Assam, outside Assam, or not at all, providing insights into the role of supplementary education.\n\n- **Class_ten_education** \n *Description:* Records the board or institution where the candidate completed their Class X education. This can be useful for assessing the impact of the quality of secondary education on subsequent exam performance.\n\n- **twelve_education** \n *Description:* Indicates the board or institution where the candidate completed their Class XII education. Analyzing this field can reveal differences in educational standards and curricula that may affect entrance exam outcomes.\n\n- **medium** \n *Description:* Specifies the medium of instruction used during the candidate\u2019s Class XII education (e.g., English, Assamese, etc.). The medium of instruction might influence comprehension and performance in the exam.\n\n- **Class_X_Percentage** \n *Description:* Represents the percentage marks secured by the candidate in their Class X examinations. 
This score serves as a baseline indicator of academic ability and prior educational attainment.\n\n- **Class_XII_Percentage** \n *Description:* Denotes the percentage marks achieved by the candidate in their Class XII examinations, providing further insight into their academic consistency and preparation for the entrance exam.\n\n- **Father_occupation** \n *Description:* Captures the occupation of the candidate\u2019s father. This socioeconomic indicator can help in understanding how parental employment and associated factors might influence educational opportunities and performance.\n\n- **Mother_occupation** \n *Description:* Captures the occupation of the candidate\u2019s mother. Like the father's occupation, this field contributes to a broader view of the candidate's socioeconomic background and its potential impact on academic success.\n\n- **time** \n *Description:* Records the time or session related to the exam or data collection. This field can be useful for tracking trends over time or correlating performance with specific examination sessions.\n\n- **Performance** \n *Description:* Represents the candidate\u2019s performance in the Common Entrance Examination (CEE). This could be presented as a numeric score, grade, or categorical outcome (e.g., pass/fail), and serves as the primary variable for assessing academic achievement in the dataset."
92
+ }
93
+ ],
94
+ "recordSet": [
95
+ {
96
+ "field": [
97
+ {
98
+ "dataType": [
99
+ "sc:Text"
100
+ ],
101
+ "source": {
102
+ "fileObject": {
103
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
104
+ },
105
+ "extract": {
106
+ "column": "Gender"
107
+ }
108
+ },
109
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Gender",
110
+ "@type": "cr:Field",
111
+ "name": "Gender",
112
+ "description": "Indicates the candidate\u2019s gender (e.g., Male, Female). This field helps in analyzing performance trends and demographic differences based on gender."
113
+ },
114
+ {
115
+ "dataType": [
116
+ "sc:Text"
117
+ ],
118
+ "source": {
119
+ "fileObject": {
120
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
121
+ },
122
+ "extract": {
123
+ "column": "Caste"
124
+ }
125
+ },
126
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Caste",
127
+ "@type": "cr:Field",
128
+ "name": "Caste",
129
+ "description": "Specifies the caste category of the candidate (such as General, OBC, SC, ST, etc.). This information can be used to explore socio-cultural factors and their influence on academic performance."
130
+ },
131
+ {
132
+ "dataType": [
133
+ "sc:Text"
134
+ ],
135
+ "source": {
136
+ "fileObject": {
137
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
138
+ },
139
+ "extract": {
140
+ "column": "coaching"
141
+ }
142
+ },
143
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/coaching",
144
+ "@type": "cr:Field",
145
+ "name": "coaching",
146
+ "description": " Denotes whether the candidate attended any coaching classes prior to the examination. It typically categorizes candidates into those who attended coaching within Assam, outside Assam, or not at all, providing insights into the role of supplementary education."
147
+ },
148
+ {
149
+ "dataType": [
150
+ "sc:Text"
151
+ ],
152
+ "source": {
153
+ "fileObject": {
154
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
155
+ },
156
+ "extract": {
157
+ "column": "Class_ten_education"
158
+ }
159
+ },
160
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Class_ten_education",
161
+ "@type": "cr:Field",
162
+ "name": "Class_ten_education",
163
+ "description": " Records the board or institution where the candidate completed their Class X education. This can be useful for assessing the impact of the quality of secondary education on subsequent exam performance."
164
+ },
165
+ {
166
+ "dataType": [
167
+ "sc:Text"
168
+ ],
169
+ "source": {
170
+ "fileObject": {
171
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
172
+ },
173
+ "extract": {
174
+ "column": "twelve_education"
175
+ }
176
+ },
177
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/twelve_education",
178
+ "@type": "cr:Field",
179
+ "name": "twelve_education",
180
+ "description": " Indicates the board or institution where the candidate completed their Class XII education. Analyzing this field can reveal differences in educational standards and curricula that may affect entrance exam outcomes."
181
+ },
182
+ {
183
+ "dataType": [
184
+ "sc:Text"
185
+ ],
186
+ "source": {
187
+ "fileObject": {
188
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
189
+ },
190
+ "extract": {
191
+ "column": "medium"
192
+ }
193
+ },
194
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/medium",
195
+ "@type": "cr:Field",
196
+ "name": "medium",
197
+ "description": "Specifies the medium of instruction used during the candidate\u2019s Class XII education (e.g., English, Assamese, etc.). The medium of instruction might influence comprehension and performance in the exam."
198
+ },
199
+ {
200
+ "dataType": [
201
+ "sc:Text"
202
+ ],
203
+ "source": {
204
+ "fileObject": {
205
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
206
+ },
207
+ "extract": {
208
+ "column": "Class_X_Percentage"
209
+ }
210
+ },
211
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Class_X_Percentage",
212
+ "@type": "cr:Field",
213
+ "name": "Class_X_Percentage",
214
+ "description": " Represents the percentage marks secured by the candidate in their Class X examinations. This score serves as a baseline indicator of academic ability and prior educational attainment."
215
+ },
216
+ {
217
+ "dataType": [
218
+ "sc:Text"
219
+ ],
220
+ "source": {
221
+ "fileObject": {
222
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
223
+ },
224
+ "extract": {
225
+ "column": "Class_XII_Percentage"
226
+ }
227
+ },
228
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Class_XII_Percentage",
229
+ "@type": "cr:Field",
230
+ "name": "Class_XII_Percentage",
231
+ "description": " Denotes the percentage marks achieved by the candidate in their Class XII examinations, providing further insight into their academic consistency and preparation for the entrance exam."
232
+ },
233
+ {
234
+ "dataType": [
235
+ "sc:Text"
236
+ ],
237
+ "source": {
238
+ "fileObject": {
239
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
240
+ },
241
+ "extract": {
242
+ "column": "Father_occupation"
243
+ }
244
+ },
245
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Father_occupation",
246
+ "@type": "cr:Field",
247
+ "name": "Father_occupation",
248
+ "description": "Captures the occupation of the candidate\u2019s father. This socioeconomic indicator can help in understanding how parental employment and associated factors might influence educational opportunities and performance."
249
+ },
250
+ {
251
+ "dataType": [
252
+ "sc:Text"
253
+ ],
254
+ "source": {
255
+ "fileObject": {
256
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
257
+ },
258
+ "extract": {
259
+ "column": "Mother_occupation"
260
+ }
261
+ },
262
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Mother_occupation",
263
+ "@type": "cr:Field",
264
+ "name": "Mother_occupation",
265
+ "description": " Captures the occupation of the candidate\u2019s mother. Like the father's occupation, this field contributes to a broader view of the candidate's socioeconomic background and its potential impact on academic success."
266
+ },
267
+ {
268
+ "dataType": [
269
+ "sc:Text"
270
+ ],
271
+ "source": {
272
+ "fileObject": {
273
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
274
+ },
275
+ "extract": {
276
+ "column": "time"
277
+ }
278
+ },
279
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/time",
280
+ "@type": "cr:Field",
281
+ "name": "time",
282
+ "description": " Records the time or session related to the exam or data collection. This field can be useful for tracking trends over time or correlating performance with specific examination sessions."
283
+ },
284
+ {
285
+ "dataType": [
286
+ "sc:Text"
287
+ ],
288
+ "source": {
289
+ "fileObject": {
290
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
291
+ },
292
+ "extract": {
293
+ "column": "Performance"
294
+ }
295
+ },
296
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv/Performance",
297
+ "@type": "cr:Field",
298
+ "name": "Performance",
299
+ "description": " Represents the candidate\u2019s performance in the Common Entrance Examination (CEE). This could be presented as a numeric score, grade, or categorical outcome (e.g., pass/fail), and serves as the primary variable for assessing academic achievement in the dataset."
300
+ }
301
+ ],
302
+ "@id": "Student_Performance_on_an_Entrance_Examination.csv",
303
+ "@type": "cr:RecordSet",
304
+ "name": "Student_Performance_on_an_Entrance_Examination.csv",
305
+ "description": "- **Gender** \n *Description:* Indicates the candidate\u2019s gender (e.g., Male, Female). This field helps in analyzing performance trends and demographic differences based on gender.\n\n- **Caste** \n *Description:* Specifies the caste category of the candidate (such as General, OBC, SC, ST, etc.). This information can be used to explore socio-cultural factors and their influence on academic performance.\n\n- **coaching** \n *Description:* Denotes whether the candidate attended any coaching classes prior to the examination. It typically categorizes candidates into those who attended coaching within Assam, outside Assam, or not at all, providing insights into the role of supplementary education.\n\n- **Class_ten_education** \n *Description:* Records the board or institution where the candidate completed their Class X education. This can be useful for assessing the impact of the quality of secondary education on subsequent exam performance.\n\n- **twelve_education** \n *Description:* Indicates the board or institution where the candidate completed their Class XII education. Analyzing this field can reveal differences in educational standards and curricula that may affect entrance exam outcomes.\n\n- **medium** \n *Description:* Specifies the medium of instruction used during the candidate\u2019s Class XII education (e.g., English, Assamese, etc.). The medium of instruction might influence comprehension and performance in the exam.\n\n- **Class_X_Percentage** \n *Description:* Represents the percentage marks secured by the candidate in their Class X examinations. 
This score serves as a baseline indicator of academic ability and prior educational attainment.\n\n- **Class_XII_Percentage** \n *Description:* Denotes the percentage marks achieved by the candidate in their Class XII examinations, providing further insight into their academic consistency and preparation for the entrance exam.\n\n- **Father_occupation** \n *Description:* Captures the occupation of the candidate\u2019s father. This socioeconomic indicator can help in understanding how parental employment and associated factors might influence educational opportunities and performance.\n\n- **Mother_occupation** \n *Description:* Captures the occupation of the candidate\u2019s mother. Like the father's occupation, this field contributes to a broader view of the candidate's socioeconomic background and its potential impact on academic success.\n\n- **time** \n *Description:* Records the time or session related to the exam or data collection. This field can be useful for tracking trends over time or correlating performance with specific examination sessions.\n\n- **Performance** \n *Description:* Represents the candidate\u2019s performance in the Common Entrance Examination (CEE). This could be presented as a numeric score, grade, or categorical outcome (e.g., pass/fail), and serves as the primary variable for assessing academic achievement in the dataset."
306
+ }
307
+ ],
308
+ "version": 1,
309
+ "keywords": [
310
+ "subject > people and society > education",
311
+ "technique > data visualization",
312
+ "technique > exploratory data analysis",
313
+ "subject > people and society > education > universities and colleges",
314
+ "subject > people and society > social science",
315
+ "subject > people and society > education > standardized testing"
316
+ ],
317
+ "isAccessibleForFree": true,
318
+ "includedInDataCatalog": {
319
+ "@type": "sc:DataCatalog",
320
+ "name": "Kaggle",
321
+ "url": "https://www.kaggle.com"
322
+ },
323
+ "creator": {
324
+ "@type": "sc:Person",
325
+ "name": "Adil Shamim",
326
+ "url": "/adilshamim8",
327
+ "image": "https://storage.googleapis.com/kaggle-avatars/thumbnails/22146488-kg.jpg?t=2025-02-08-13-40-43"
328
+ },
329
+ "publisher": {
330
+ "@type": "sc:Organization",
331
+ "name": "Kaggle",
332
+ "url": "https://www.kaggle.com/organizations/kaggle",
333
+ "image": "https://storage.googleapis.com/kaggle-organizations/4/thumbnail.png"
334
+ },
335
+ "thumbnailUrl": "https://storage.googleapis.com/kaggle-datasets-images/6783385/10912302/0f54936fde1351d0247218871f9c6336/dataset-card.jpg?t=2025-03-04-00-24-52",
336
+ "dateModified": "2025-03-04T00:09:21.697",
337
+ "@type": "sc:Dataset",
338
+ "name": "Student Performance on an Entrance Examination",
339
+ "url": "https://www.kaggle.com/datasets/adilshamim8/student-performance-on-an-entrance-examination/versions/1",
340
+ "description": "\n\nThis dataset contains comprehensive information regarding candidates' performance in a common entrance examination, alongside various demographic and academic indicators. It is designed to support analysis into the factors influencing success in competitive exams and can serve as a valuable resource for educational researchers and data scientists.\n\n#### Key Features:\n- **Examination Performance:** Data reflecting the candidate\u2019s results in the entrance examination.\n- **Candidate Demographics:** \n - **Sex:** Gender of the candidate.\n - **Caste:** Caste classification of the candidate.\n- **Coaching Details:**\n - Information on whether the candidate attended coaching classes within Assam, outside Assam, or did not attend any coaching.\n- **Educational Background:**\n - **Board Details:** Names of the boards where the candidate studied during Class X and Class XII.\n - **Medium of Instruction:** The medium used for teaching during Class XII.\n- **Academic Performance:**\n - **Class X Percentage:** Marks secured at the Class X level.\n - **Class XII Percentage:** Marks secured at the Class XII level.\n- **Parental Occupation:**\n - Occupation details for both the candidate's father and mother, which can help analyze socioeconomic influences on performance.\n\n#### Use Cases:\n- **Performance Analysis:** Examine correlations between educational background, coaching, and exam performance.\n- **Predictive Modeling:** Develop models to predict exam outcomes based on prior academic results and demographic factors.\n- **Educational Research:** Explore the impact of socio-economic and educational variables on academic success.\n\nThis dataset is ideal for conducting in-depth studies into the determinants of academic achievement and for designing interventions to improve student performance in competitive exams."
341
+ }
342
+ ```
report_croissant-validation_credit-g.md ADDED
@@ -0,0 +1,982 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CROISSANT VALIDATION REPORT
2
+ ================================================================================
3
+ ## VALIDATION RESULTS
4
+ --------------------------------------------------------------------------------
5
+ Starting validation for file: dataset_31_croissant 14.json
6
+ ### JSON Format Validation
7
+
8
+ The file is valid JSON.
9
+ ### Croissant Schema Validation
10
+
11
+ The dataset passes Croissant validation.
12
+ ### Records Generation Test
13
+
14
+ Record set '_:Ne3c47f5599c9458993fb484e2e59014e' failed: An error occured during the sequential generation of the dataset, more specifically during the operation Join(_:Ne3c47f5599c9458993fb484e2e59014e)
15
+
16
+ Traceback (most recent call last):
17
+ File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in validate_records
18
+ _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
19
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/dafunc.py", line 108, in func_timeout
21
+ raise_exception(exception)
22
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/py3_raise.py", line 7, in raise_exception
23
+ raise exception[0] from None
24
+ File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in <lambda>
25
+ _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
26
+ ^^^^^^^^^^^^^^^^^^^
27
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__
28
+ yield from execute_operations_sequentially(
29
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially
30
+ raise GenerationError(
31
+ mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Join(_:Ne3c47f5599c9458993fb484e2e59014e)
32
+ ## JSON-LD REFERENCE
33
+ ================================================================================
34
+ ```json
35
+ {
36
+ "@context": {
37
+ "@language": "en",
38
+ "@vocab": "https://schema.org/",
39
+ "citeAs": "cr:citeAs",
40
+ "column": "cr:column",
41
+ "conformsTo": "dct:conformsTo",
42
+ "cr": "http://mlcommons.org/croissant/",
43
+ "rai": "http://mlcommons.org/croissant/RAI/",
44
+ "data": {
45
+ "@id": "cr:data",
46
+ "@type": "@json"
47
+ },
48
+ "dataType": {
49
+ "@id": "cr:dataType",
50
+ "@type": "@vocab"
51
+ },
52
+ "dct": "http://purl.org/dc/terms/",
53
+ "examples": {
54
+ "@id": "cr:examples",
55
+ "@type": "@json"
56
+ },
57
+ "extract": "cr:extract",
58
+ "field": "cr:field",
59
+ "fileProperty": "cr:fileProperty",
60
+ "fileObject": "cr:fileObject",
61
+ "fileSet": "cr:fileSet",
62
+ "format": "cr:format",
63
+ "includes": "cr:includes",
64
+ "isLiveDataset": "cr:isLiveDataset",
65
+ "jsonPath": "cr:jsonPath",
66
+ "key": "cr:key",
67
+ "md5": "cr:md5",
68
+ "parentField": "cr:parentField",
69
+ "path": "cr:path",
70
+ "recordSet": "cr:recordSet",
71
+ "references": "cr:references",
72
+ "regex": "cr:regex",
73
+ "repeated": "cr:repeated",
74
+ "replace": "cr:replace",
75
+ "sc": "https://schema.org/",
76
+ "separator": "cr:separator",
77
+ "source": "cr:source",
78
+ "subField": "cr:subField",
79
+ "transform": "cr:transform",
80
+ "@base": "cr_base_iri/"
81
+ },
82
+ "@type": "sc:Dataset",
83
+ "citeAs": "https://dl.acm.org/doi/abs/10.1145/967900.968104",
84
+ "conformsTo": "http://mlcommons.org/croissant/1.0",
85
+ "creator": [
86
+ {
87
+ "@type": "sc:Person",
88
+ "name": "Dr. Hans Hofmann"
89
+ }
90
+ ],
91
+ "dateCreated": "2014-04-06T23:21:47",
92
+ "datePublished": "1994-11-17T00:00:00",
93
+ "description": "**Author**: Dr. Hans Hofmann \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994 \n**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n**German Credit dataset** \nThis dataset classifies people described by a set of attributes as good or bad credit risks.\n\nThis dataset comes with a cost matrix: \n``` \nGood Bad (predicted) \nGood 0 1 (actual) \nBad 5 0 \n```\n\nIt is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1). \n\n### Attribute description \n\n1. Status of existing checking account, in Deutsche Mark. \n2. Duration in months \n3. Credit history (credits taken, paid back duly, delays, critical accounts) \n4. Purpose of the credit (car, television,...) \n5. Credit amount \n6. Status of savings account/bonds, in Deutsche Mark. \n7. Present employment, in number of years. \n8. Installment rate in percentage of disposable income \n9. Personal status (married, single,...) and sex \n10. Other debtors / guarantors \n11. Present residence since X years \n12. Property (e.g. real estate) \n13. Age in years \n14. Other installment plans (banks, stores) \n15. Housing (rent, own,...) \n16. Number of existing credits at this bank \n17. Job \n18. Number of people being liable to provide maintenance for \n19. Telephone (yes,no) \n20. Foreign worker (yes,no)",
94
+ "inLanguage": "en",
95
+ "isAccessibleForFree": true,
96
+ "keywords": [
97
+ "credit_scoring",
98
+ "Data Science",
99
+ "Economics",
100
+ "finance_problem",
101
+ "mythbusting_1",
102
+ "OpenML-CC18",
103
+ "OpenML100",
104
+ "Statistics",
105
+ "study_1",
106
+ "study_123",
107
+ "study_14",
108
+ "study_144",
109
+ "study_15",
110
+ "study_20",
111
+ "study_218",
112
+ "study_241",
113
+ "study_34",
114
+ "study_37",
115
+ "study_41",
116
+ "study_50",
117
+ "study_52",
118
+ "study_7",
119
+ "study_70",
120
+ "study_98",
121
+ "study_99",
122
+ "uci"
123
+ ],
124
+ "license": "Public",
125
+ "name": "credit-g",
126
+ "sameAs": "https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)",
127
+ "url": "https://www.openml.org/search?type=data&id=31",
128
+ "version": 1,
129
+ "distribution": [
130
+ {
131
+ "@type": "cr:FileObject",
132
+ "@id": "data-file",
133
+ "name": "data-file",
134
+ "description": "Data file belonging to the dataset.",
135
+ "contentUrl": "https://api.openml.org/data/v1/download/31/credit-g.arff",
136
+ "encodingFormat": "text/plain",
137
+ "md5": "9a475053fed0c26ee95cd4525e50074c"
138
+ }
139
+ ],
140
+ "recordSet": [
141
+ {
142
+ "@type": "cr:RecordSet",
143
+ "@id": "enumerations/checking_status",
144
+ "name": "checking_status",
145
+ "description": "Possible values for checking_status",
146
+ "dataType": "sc:Enumeration",
147
+ "field": [
148
+ {
149
+ "@type": "cr:Field",
150
+ "@id": "enumerations/checking_status/value",
151
+ "name": "value",
152
+ "description": "The value of checking_status.",
153
+ "dataType": "sc:Text"
154
+ }
155
+ ],
156
+ "data": [
157
+ {
158
+ "enumerations/checking_status/value": "0<=X<200"
159
+ },
160
+ {
161
+ "enumerations/checking_status/value": "<0"
162
+ },
163
+ {
164
+ "enumerations/checking_status/value": ">=200"
165
+ },
166
+ {
167
+ "enumerations/checking_status/value": "no checking"
168
+ }
169
+ ]
170
+ },
171
+ {
172
+ "@type": "cr:RecordSet",
173
+ "@id": "enumerations/credit_history",
174
+ "name": "credit_history",
175
+ "description": "Possible values for credit_history",
176
+ "dataType": "sc:Enumeration",
177
+ "field": [
178
+ {
179
+ "@type": "cr:Field",
180
+ "@id": "enumerations/credit_history/value",
181
+ "name": "value",
182
+ "description": "The value of credit_history.",
183
+ "dataType": "sc:Text"
184
+ }
185
+ ],
186
+ "data": [
187
+ {
188
+ "enumerations/credit_history/value": "all paid"
189
+ },
190
+ {
191
+ "enumerations/credit_history/value": "critical/other existing credit"
192
+ },
193
+ {
194
+ "enumerations/credit_history/value": "delayed previously"
195
+ },
196
+ {
197
+ "enumerations/credit_history/value": "existing paid"
198
+ },
199
+ {
200
+ "enumerations/credit_history/value": "no credits/all paid"
201
+ }
202
+ ]
203
+ },
204
+ {
205
+ "@type": "cr:RecordSet",
206
+ "@id": "enumerations/purpose",
207
+ "name": "purpose",
208
+ "description": "Possible values for purpose",
209
+ "dataType": "sc:Enumeration",
210
+ "field": [
211
+ {
212
+ "@type": "cr:Field",
213
+ "@id": "enumerations/purpose/value",
214
+ "name": "value",
215
+ "description": "The value of purpose.",
216
+ "dataType": "sc:Text"
217
+ }
218
+ ],
219
+ "data": [
220
+ {
221
+ "enumerations/purpose/value": "business"
222
+ },
223
+ {
224
+ "enumerations/purpose/value": "domestic appliance"
225
+ },
226
+ {
227
+ "enumerations/purpose/value": "education"
228
+ },
229
+ {
230
+ "enumerations/purpose/value": "furniture/equipment"
231
+ },
232
+ {
233
+ "enumerations/purpose/value": "new car"
234
+ },
235
+ {
236
+ "enumerations/purpose/value": "other"
237
+ },
238
+ {
239
+ "enumerations/purpose/value": "radio/tv"
240
+ },
241
+ {
242
+ "enumerations/purpose/value": "repairs"
243
+ },
244
+ {
245
+ "enumerations/purpose/value": "retraining"
246
+ },
247
+ {
248
+ "enumerations/purpose/value": "used car"
249
+ },
250
+ {
251
+ "enumerations/purpose/value": "vacation"
252
+ }
253
+ ]
254
+ },
255
+ {
256
+ "@type": "cr:RecordSet",
257
+ "@id": "enumerations/savings_status",
258
+ "name": "savings_status",
259
+ "description": "Possible values for savings_status",
260
+ "dataType": "sc:Enumeration",
261
+ "field": [
262
+ {
263
+ "@type": "cr:Field",
264
+ "@id": "enumerations/savings_status/value",
265
+ "name": "value",
266
+ "description": "The value of savings_status.",
267
+ "dataType": "sc:Text"
268
+ }
269
+ ],
270
+ "data": [
271
+ {
272
+ "enumerations/savings_status/value": "100<=X<500"
273
+ },
274
+ {
275
+ "enumerations/savings_status/value": "500<=X<1000"
276
+ },
277
+ {
278
+ "enumerations/savings_status/value": "<100"
279
+ },
280
+ {
281
+ "enumerations/savings_status/value": ">=1000"
282
+ },
283
+ {
284
+ "enumerations/savings_status/value": "no known savings"
285
+ }
286
+ ]
287
+ },
288
+ {
289
+ "@type": "cr:RecordSet",
290
+ "@id": "enumerations/employment",
291
+ "name": "employment",
292
+ "description": "Possible values for employment",
293
+ "dataType": "sc:Enumeration",
294
+ "field": [
295
+ {
296
+ "@type": "cr:Field",
297
+ "@id": "enumerations/employment/value",
298
+ "name": "value",
299
+ "description": "The value of employment.",
300
+ "dataType": "sc:Text"
301
+ }
302
+ ],
303
+ "data": [
304
+ {
305
+ "enumerations/employment/value": "1<=X<4"
306
+ },
307
+ {
308
+ "enumerations/employment/value": "4<=X<7"
309
+ },
310
+ {
311
+ "enumerations/employment/value": "<1"
312
+ },
313
+ {
314
+ "enumerations/employment/value": ">=7"
315
+ },
316
+ {
317
+ "enumerations/employment/value": "unemployed"
318
+ }
319
+ ]
320
+ },
321
+ {
322
+ "@type": "cr:RecordSet",
323
+ "@id": "enumerations/personal_status",
324
+ "name": "personal_status",
325
+ "description": "Possible values for personal_status",
326
+ "dataType": "sc:Enumeration",
327
+ "field": [
328
+ {
329
+ "@type": "cr:Field",
330
+ "@id": "enumerations/personal_status/value",
331
+ "name": "value",
332
+ "description": "The value of personal_status.",
333
+ "dataType": "sc:Text"
334
+ }
335
+ ],
336
+ "data": [
337
+ {
338
+ "enumerations/personal_status/value": "female div/dep/mar"
339
+ },
340
+ {
341
+ "enumerations/personal_status/value": "female single"
342
+ },
343
+ {
344
+ "enumerations/personal_status/value": "male div/sep"
345
+ },
346
+ {
347
+ "enumerations/personal_status/value": "male mar/wid"
348
+ },
349
+ {
350
+ "enumerations/personal_status/value": "male single"
351
+ }
352
+ ]
353
+ },
354
+ {
355
+ "@type": "cr:RecordSet",
356
+ "@id": "enumerations/other_parties",
357
+ "name": "other_parties",
358
+ "description": "Possible values for other_parties",
359
+ "dataType": "sc:Enumeration",
360
+ "field": [
361
+ {
362
+ "@type": "cr:Field",
363
+ "@id": "enumerations/other_parties/value",
364
+ "name": "value",
365
+ "description": "The value of other_parties.",
366
+ "dataType": "sc:Text"
367
+ }
368
+ ],
369
+ "data": [
370
+ {
371
+ "enumerations/other_parties/value": "co applicant"
372
+ },
373
+ {
374
+ "enumerations/other_parties/value": "guarantor"
375
+ },
376
+ {
377
+ "enumerations/other_parties/value": "none"
378
+ }
379
+ ]
380
+ },
381
+ {
382
+ "@type": "cr:RecordSet",
383
+ "@id": "enumerations/property_magnitude",
384
+ "name": "property_magnitude",
385
+ "description": "Possible values for property_magnitude",
386
+ "dataType": "sc:Enumeration",
387
+ "field": [
388
+ {
389
+ "@type": "cr:Field",
390
+ "@id": "enumerations/property_magnitude/value",
391
+ "name": "value",
392
+ "description": "The value of property_magnitude.",
393
+ "dataType": "sc:Text"
394
+ }
395
+ ],
396
+ "data": [
397
+ {
398
+ "enumerations/property_magnitude/value": "car"
399
+ },
400
+ {
401
+ "enumerations/property_magnitude/value": "life insurance"
402
+ },
403
+ {
404
+ "enumerations/property_magnitude/value": "no known property"
405
+ },
406
+ {
407
+ "enumerations/property_magnitude/value": "real estate"
408
+ }
409
+ ]
410
+ },
411
+ {
412
+ "@type": "cr:RecordSet",
413
+ "@id": "enumerations/other_payment_plans",
414
+ "name": "other_payment_plans",
415
+ "description": "Possible values for other_payment_plans",
416
+ "dataType": "sc:Enumeration",
417
+ "field": [
418
+ {
419
+ "@type": "cr:Field",
420
+ "@id": "enumerations/other_payment_plans/value",
421
+ "name": "value",
422
+ "description": "The value of other_payment_plans.",
423
+ "dataType": "sc:Text"
424
+ }
425
+ ],
426
+ "data": [
427
+ {
428
+ "enumerations/other_payment_plans/value": "bank"
429
+ },
430
+ {
431
+ "enumerations/other_payment_plans/value": "none"
432
+ },
433
+ {
434
+ "enumerations/other_payment_plans/value": "stores"
435
+ }
436
+ ]
437
+ },
438
+ {
439
+ "@type": "cr:RecordSet",
440
+ "@id": "enumerations/housing",
441
+ "name": "housing",
442
+ "description": "Possible values for housing",
443
+ "dataType": "sc:Enumeration",
444
+ "field": [
445
+ {
446
+ "@type": "cr:Field",
447
+ "@id": "enumerations/housing/value",
448
+ "name": "value",
449
+ "description": "The value of housing.",
450
+ "dataType": "sc:Text"
451
+ }
452
+ ],
453
+ "data": [
454
+ {
455
+ "enumerations/housing/value": "for free"
456
+ },
457
+ {
458
+ "enumerations/housing/value": "own"
459
+ },
460
+ {
461
+ "enumerations/housing/value": "rent"
462
+ }
463
+ ]
464
+ },
465
+ {
466
+ "@type": "cr:RecordSet",
467
+ "@id": "enumerations/job",
468
+ "name": "job",
469
+ "description": "Possible values for job",
470
+ "dataType": "sc:Enumeration",
471
+ "field": [
472
+ {
473
+ "@type": "cr:Field",
474
+ "@id": "enumerations/job/value",
475
+ "name": "value",
476
+ "description": "The value of job.",
477
+ "dataType": "sc:Text"
478
+ }
479
+ ],
480
+ "data": [
481
+ {
482
+ "enumerations/job/value": "high qualif/self emp/mgmt"
483
+ },
484
+ {
485
+ "enumerations/job/value": "skilled"
486
+ },
487
+ {
488
+ "enumerations/job/value": "unemp/unskilled non res"
489
+ },
490
+ {
491
+ "enumerations/job/value": "unskilled resident"
492
+ }
493
+ ]
494
+ },
495
+ {
496
+ "@type": "cr:RecordSet",
497
+ "@id": "enumerations/own_telephone",
498
+ "name": "own_telephone",
499
+ "description": "Possible values for own_telephone",
500
+ "dataType": "sc:Enumeration",
501
+ "field": [
502
+ {
503
+ "@type": "cr:Field",
504
+ "@id": "enumerations/own_telephone/value",
505
+ "name": "value",
506
+ "description": "The value of own_telephone.",
507
+ "dataType": "sc:Text"
508
+ }
509
+ ],
510
+ "data": [
511
+ {
512
+ "enumerations/own_telephone/value": "none"
513
+ },
514
+ {
515
+ "enumerations/own_telephone/value": "yes"
516
+ }
517
+ ]
518
+ },
519
+ {
520
+ "@type": "cr:RecordSet",
521
+ "@id": "enumerations/foreign_worker",
522
+ "name": "foreign_worker",
523
+ "description": "Possible values for foreign_worker",
524
+ "dataType": "sc:Enumeration",
525
+ "field": [
526
+ {
527
+ "@type": "cr:Field",
528
+ "@id": "enumerations/foreign_worker/value",
529
+ "name": "value",
530
+ "description": "The value of foreign_worker.",
531
+ "dataType": "sc:Text"
532
+ }
533
+ ],
534
+ "data": [
535
+ {
536
+ "enumerations/foreign_worker/value": "no"
537
+ },
538
+ {
539
+ "enumerations/foreign_worker/value": "yes"
540
+ }
541
+ ]
542
+ },
543
+ {
544
+ "@type": "cr:RecordSet",
545
+ "@id": "enumerations/class",
546
+ "name": "class",
547
+ "description": "Possible values for class",
548
+ "dataType": "sc:Enumeration",
549
+ "field": [
550
+ {
551
+ "@type": "cr:Field",
552
+ "@id": "enumerations/class/value",
553
+ "name": "value",
554
+ "description": "The value of class.",
555
+ "dataType": "sc:Text"
556
+ }
557
+ ],
558
+ "data": [
559
+ {
560
+ "enumerations/class/value": "bad"
561
+ },
562
+ {
563
+ "enumerations/class/value": "good"
564
+ }
565
+ ]
566
+ },
567
+ {
568
+ "@type": "cr:RecordSet",
569
+ "name": "data-file-description",
570
+ "description": "Listing the fields of the data.",
571
+ "field": [
572
+ {
573
+ "@type": "cr:Field",
574
+ "@id": "features/0-checking_status",
575
+ "name": "checking_status",
576
+ "description": "checking_status - a field.",
577
+ "dataType": "sc:Text",
578
+ "references": {
579
+ "field": {
580
+ "@id": "enumerations/checking_status/value"
581
+ }
582
+ },
583
+ "source": {
584
+ "fileObject": {
585
+ "@id": "data-file"
586
+ },
587
+ "extract": {
588
+ "column": "checking_status"
589
+ }
590
+ }
591
+ },
592
+ {
593
+ "@type": "cr:Field",
594
+ "@id": "features/1-duration",
595
+ "name": "duration",
596
+ "description": "duration - a field.",
597
+ "dataType": [
598
+ "sc:Float",
599
+ "sc:Integer"
600
+ ],
601
+ "source": {
602
+ "fileObject": {
603
+ "@id": "data-file"
604
+ },
605
+ "extract": {
606
+ "column": "duration"
607
+ }
608
+ }
609
+ },
610
+ {
611
+ "@type": "cr:Field",
612
+ "@id": "features/2-credit_history",
613
+ "name": "credit_history",
614
+ "description": "credit_history - a field.",
615
+ "dataType": "sc:Text",
616
+ "references": {
617
+ "field": {
618
+ "@id": "enumerations/credit_history/value"
619
+ }
620
+ },
621
+ "source": {
622
+ "fileObject": {
623
+ "@id": "data-file"
624
+ },
625
+ "extract": {
626
+ "column": "credit_history"
627
+ }
628
+ }
629
+ },
630
+ {
631
+ "@type": "cr:Field",
632
+ "@id": "features/3-purpose",
633
+ "name": "purpose",
634
+ "description": "purpose - a field.",
635
+ "dataType": "sc:Text",
636
+ "references": {
637
+ "field": {
638
+ "@id": "enumerations/purpose/value"
639
+ }
640
+ },
641
+ "source": {
642
+ "fileObject": {
643
+ "@id": "data-file"
644
+ },
645
+ "extract": {
646
+ "column": "purpose"
647
+ }
648
+ }
649
+ },
650
+ {
651
+ "@type": "cr:Field",
652
+ "@id": "features/4-credit_amount",
653
+ "name": "credit_amount",
654
+ "description": "credit_amount - a field.",
655
+ "dataType": [
656
+ "sc:Float",
657
+ "sc:Integer"
658
+ ],
659
+ "source": {
660
+ "fileObject": {
661
+ "@id": "data-file"
662
+ },
663
+ "extract": {
664
+ "column": "credit_amount"
665
+ }
666
+ }
667
+ },
668
+ {
669
+ "@type": "cr:Field",
670
+ "@id": "features/5-savings_status",
671
+ "name": "savings_status",
672
+ "description": "savings_status - a field.",
673
+ "dataType": "sc:Text",
674
+ "references": {
675
+ "field": {
676
+ "@id": "enumerations/savings_status/value"
677
+ }
678
+ },
679
+ "source": {
680
+ "fileObject": {
681
+ "@id": "data-file"
682
+ },
683
+ "extract": {
684
+ "column": "savings_status"
685
+ }
686
+ }
687
+ },
688
+ {
689
+ "@type": "cr:Field",
690
+ "@id": "features/6-employment",
691
+ "name": "employment",
692
+ "description": "employment - a field.",
693
+ "dataType": "sc:Text",
694
+ "references": {
695
+ "field": {
696
+ "@id": "enumerations/employment/value"
697
+ }
698
+ },
699
+ "source": {
700
+ "fileObject": {
701
+ "@id": "data-file"
702
+ },
703
+ "extract": {
704
+ "column": "employment"
705
+ }
706
+ }
707
+ },
708
+ {
709
+ "@type": "cr:Field",
710
+ "@id": "features/7-installment_commitment",
711
+ "name": "installment_commitment",
712
+ "description": "installment_commitment - a field.",
713
+ "dataType": [
714
+ "sc:Float",
715
+ "sc:Integer"
716
+ ],
717
+ "source": {
718
+ "fileObject": {
719
+ "@id": "data-file"
720
+ },
721
+ "extract": {
722
+ "column": "installment_commitment"
723
+ }
724
+ }
725
+ },
726
+ {
727
+ "@type": "cr:Field",
728
+ "@id": "features/8-personal_status",
729
+ "name": "personal_status",
730
+ "description": "personal_status - a field.",
731
+ "dataType": "sc:Text",
732
+ "references": {
733
+ "field": {
734
+ "@id": "enumerations/personal_status/value"
735
+ }
736
+ },
737
+ "source": {
738
+ "fileObject": {
739
+ "@id": "data-file"
740
+ },
741
+ "extract": {
742
+ "column": "personal_status"
743
+ }
744
+ }
745
+ },
746
+ {
747
+ "@type": "cr:Field",
748
+ "@id": "features/9-other_parties",
749
+ "name": "other_parties",
750
+ "description": "other_parties - a field.",
751
+ "dataType": "sc:Text",
752
+ "references": {
753
+ "field": {
754
+ "@id": "enumerations/other_parties/value"
755
+ }
756
+ },
757
+ "source": {
758
+ "fileObject": {
759
+ "@id": "data-file"
760
+ },
761
+ "extract": {
762
+ "column": "other_parties"
763
+ }
764
+ }
765
+ },
766
+ {
767
+ "@type": "cr:Field",
768
+ "@id": "features/10-residence_since",
769
+ "name": "residence_since",
770
+ "description": "residence_since - a field.",
771
+ "dataType": [
772
+ "sc:Float",
773
+ "sc:Integer"
774
+ ],
775
+ "source": {
776
+ "fileObject": {
777
+ "@id": "data-file"
778
+ },
779
+ "extract": {
780
+ "column": "residence_since"
781
+ }
782
+ }
783
+ },
784
+ {
785
+ "@type": "cr:Field",
786
+ "@id": "features/11-property_magnitude",
787
+ "name": "property_magnitude",
788
+ "description": "property_magnitude - a field.",
789
+ "dataType": "sc:Text",
790
+ "references": {
791
+ "field": {
792
+ "@id": "enumerations/property_magnitude/value"
793
+ }
794
+ },
795
+ "source": {
796
+ "fileObject": {
797
+ "@id": "data-file"
798
+ },
799
+ "extract": {
800
+ "column": "property_magnitude"
801
+ }
802
+ }
803
+ },
804
+ {
805
+ "@type": "cr:Field",
806
+ "@id": "features/12-age",
807
+ "name": "age",
808
+ "description": "age - a field.",
809
+ "dataType": [
810
+ "sc:Float",
811
+ "sc:Integer"
812
+ ],
813
+ "source": {
814
+ "fileObject": {
815
+ "@id": "data-file"
816
+ },
817
+ "extract": {
818
+ "column": "age"
819
+ }
820
+ }
821
+ },
822
+ {
823
+ "@type": "cr:Field",
824
+ "@id": "features/13-other_payment_plans",
825
+ "name": "other_payment_plans",
826
+ "description": "other_payment_plans - a field.",
827
+ "dataType": "sc:Text",
828
+ "references": {
829
+ "field": {
830
+ "@id": "enumerations/other_payment_plans/value"
831
+ }
832
+ },
833
+ "source": {
834
+ "fileObject": {
835
+ "@id": "data-file"
836
+ },
837
+ "extract": {
838
+ "column": "other_payment_plans"
839
+ }
840
+ }
841
+ },
842
+ {
843
+ "@type": "cr:Field",
844
+ "@id": "features/14-housing",
845
+ "name": "housing",
846
+ "description": "housing - a field.",
847
+ "dataType": "sc:Text",
848
+ "references": {
849
+ "field": {
850
+ "@id": "enumerations/housing/value"
851
+ }
852
+ },
853
+ "source": {
854
+ "fileObject": {
855
+ "@id": "data-file"
856
+ },
857
+ "extract": {
858
+ "column": "housing"
859
+ }
860
+ }
861
+ },
862
+ {
863
+ "@type": "cr:Field",
864
+ "@id": "features/15-existing_credits",
865
+ "name": "existing_credits",
866
+ "description": "existing_credits - a field.",
867
+ "dataType": [
868
+ "sc:Float",
869
+ "sc:Integer"
870
+ ],
871
+ "source": {
872
+ "fileObject": {
873
+ "@id": "data-file"
874
+ },
875
+ "extract": {
876
+ "column": "existing_credits"
877
+ }
878
+ }
879
+ },
880
+ {
881
+ "@type": "cr:Field",
882
+ "@id": "features/16-job",
883
+ "name": "job",
884
+ "description": "job - a field.",
885
+ "dataType": "sc:Text",
886
+ "references": {
887
+ "field": {
888
+ "@id": "enumerations/job/value"
889
+ }
890
+ },
891
+ "source": {
892
+ "fileObject": {
893
+ "@id": "data-file"
894
+ },
895
+ "extract": {
896
+ "column": "job"
897
+ }
898
+ }
899
+ },
900
+ {
901
+ "@type": "cr:Field",
902
+ "@id": "features/17-num_dependents",
903
+ "name": "num_dependents",
904
+ "description": "num_dependents - a field.",
905
+ "dataType": [
906
+ "sc:Float",
907
+ "sc:Integer"
908
+ ],
909
+ "source": {
910
+ "fileObject": {
911
+ "@id": "data-file"
912
+ },
913
+ "extract": {
914
+ "column": "num_dependents"
915
+ }
916
+ }
917
+ },
918
+ {
919
+ "@type": "cr:Field",
920
+ "@id": "features/18-own_telephone",
921
+ "name": "own_telephone",
922
+ "description": "own_telephone - a field.",
923
+ "dataType": "sc:Text",
924
+ "references": {
925
+ "field": {
926
+ "@id": "enumerations/own_telephone/value"
927
+ }
928
+ },
929
+ "source": {
930
+ "fileObject": {
931
+ "@id": "data-file"
932
+ },
933
+ "extract": {
934
+ "column": "own_telephone"
935
+ }
936
+ }
937
+ },
938
+ {
939
+ "@type": "cr:Field",
940
+ "@id": "features/19-foreign_worker",
941
+ "name": "foreign_worker",
942
+ "description": "foreign_worker - a field.",
943
+ "dataType": "sc:Text",
944
+ "references": {
945
+ "field": {
946
+ "@id": "enumerations/foreign_worker/value"
947
+ }
948
+ },
949
+ "source": {
950
+ "fileObject": {
951
+ "@id": "data-file"
952
+ },
953
+ "extract": {
954
+ "column": "foreign_worker"
955
+ }
956
+ }
957
+ },
958
+ {
959
+ "@type": "cr:Field",
960
+ "@id": "features/20-class",
961
+ "name": "class",
962
+ "description": "class - the default target field.",
963
+ "dataType": "sc:Text",
964
+ "references": {
965
+ "field": {
966
+ "@id": "enumerations/class/value"
967
+ }
968
+ },
969
+ "source": {
970
+ "fileObject": {
971
+ "@id": "data-file"
972
+ },
973
+ "extract": {
974
+ "column": "class"
975
+ }
976
+ }
977
+ }
978
+ ]
979
+ }
980
+ ]
981
+ }
982
+ ```
report_croissant-validation_natural_reasoning.md ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CROISSANT VALIDATION REPORT
2
+ ================================================================================
3
+ ## VALIDATION RESULTS
4
+ --------------------------------------------------------------------------------
5
+ Starting validation for file: croissant
6
+ ### JSON Format Validation
7
+
8
+ The URL returned valid JSON.
9
+ ### Croissant Schema Validation
10
+
11
+ The dataset passes Croissant validation.
12
+ ### Records Generation Test
13
+
14
+ Record set 'default' failed: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
15
+
16
+ Traceback (most recent call last):
17
+ File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in validate_records
18
+ _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
19
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/dafunc.py", line 108, in func_timeout
21
+ raise_exception(exception)
22
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/py3_raise.py", line 7, in raise_exception
23
+ raise exception[0] from None
24
+ File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in <lambda>
25
+ _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
26
+ ^^^^^^^^^^^^^^^^^^^
27
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__
28
+ yield from execute_operations_sequentially(
29
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially
30
+ raise GenerationError(
31
+ mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
32
+ ## JSON-LD REFERENCE
33
+ ================================================================================
34
+ ```json
35
+ {
36
+ "@context": {
37
+ "@language": "en",
38
+ "@vocab": "https://schema.org/",
39
+ "citeAs": "cr:citeAs",
40
+ "column": "cr:column",
41
+ "conformsTo": "dct:conformsTo",
42
+ "cr": "http://mlcommons.org/croissant/",
43
+ "data": {
44
+ "@id": "cr:data",
45
+ "@type": "@json"
46
+ },
47
+ "dataBiases": "cr:dataBiases",
48
+ "dataCollection": "cr:dataCollection",
49
+ "dataType": {
50
+ "@id": "cr:dataType",
51
+ "@type": "@vocab"
52
+ },
53
+ "dct": "http://purl.org/dc/terms/",
54
+ "extract": "cr:extract",
55
+ "field": "cr:field",
56
+ "fileProperty": "cr:fileProperty",
57
+ "fileObject": "cr:fileObject",
58
+ "fileSet": "cr:fileSet",
59
+ "format": "cr:format",
60
+ "includes": "cr:includes",
61
+ "isLiveDataset": "cr:isLiveDataset",
62
+ "jsonPath": "cr:jsonPath",
63
+ "key": "cr:key",
64
+ "md5": "cr:md5",
65
+ "parentField": "cr:parentField",
66
+ "path": "cr:path",
67
+ "personalSensitiveInformation": "cr:personalSensitiveInformation",
68
+ "recordSet": "cr:recordSet",
69
+ "references": "cr:references",
70
+ "regex": "cr:regex",
71
+ "repeated": "cr:repeated",
72
+ "replace": "cr:replace",
73
+ "sc": "https://schema.org/",
74
+ "separator": "cr:separator",
75
+ "source": "cr:source",
76
+ "subField": "cr:subField",
77
+ "transform": "cr:transform",
78
+ "@base": "cr_base_iri/"
79
+ },
80
+ "@type": "sc:Dataset",
81
+ "distribution": [
82
+ {
83
+ "@type": "cr:FileObject",
84
+ "@id": "repo",
85
+ "name": "repo",
86
+ "description": "The Hugging Face git repository.",
87
+ "contentUrl": "https://huggingface.co/datasets/facebook/natural_reasoning/tree/refs%2Fconvert%2Fparquet",
88
+ "encodingFormat": "git+https",
89
+ "sha256": "https://github.com/mlcommons/croissant/issues/80"
90
+ },
91
+ {
92
+ "@type": "cr:FileSet",
93
+ "@id": "parquet-files-for-config-default",
94
+ "containedIn": {
95
+ "@id": "repo"
96
+ },
97
+ "encodingFormat": "application/x-parquet",
98
+ "includes": "default/*/*.parquet"
99
+ }
100
+ ],
101
+ "recordSet": [
102
+ {
103
+ "@type": "cr:RecordSet",
104
+ "dataType": "cr:Split",
105
+ "key": {
106
+ "@id": "default_splits/split_name"
107
+ },
108
+ "@id": "default_splits",
109
+ "name": "default_splits",
110
+ "description": "Splits for the default config.",
111
+ "field": [
112
+ {
113
+ "@type": "cr:Field",
114
+ "@id": "default_splits/split_name",
115
+ "dataType": "sc:Text"
116
+ }
117
+ ],
118
+ "data": [
119
+ {
120
+ "default_splits/split_name": "train"
121
+ }
122
+ ]
123
+ },
124
+ {
125
+ "@type": "cr:RecordSet",
126
+ "@id": "default",
127
+ "description": "facebook/natural_reasoning - 'default' subset",
128
+ "field": [
129
+ {
130
+ "@type": "cr:Field",
131
+ "@id": "default/split",
132
+ "dataType": "sc:Text",
133
+ "source": {
134
+ "fileSet": {
135
+ "@id": "parquet-files-for-config-default"
136
+ },
137
+ "extract": {
138
+ "fileProperty": "fullpath"
139
+ },
140
+ "transform": {
141
+ "regex": "default/(?:partial-)?(train)/.+parquet$"
142
+ }
143
+ },
144
+ "references": {
145
+ "field": {
146
+ "@id": "default_splits/split_name"
147
+ }
148
+ }
149
+ },
150
+ {
151
+ "@type": "cr:Field",
152
+ "@id": "default/question",
153
+ "dataType": "sc:Text",
154
+ "source": {
155
+ "fileSet": {
156
+ "@id": "parquet-files-for-config-default"
157
+ },
158
+ "extract": {
159
+ "column": "question"
160
+ }
161
+ }
162
+ },
163
+ {
164
+ "@type": "cr:Field",
165
+ "@id": "default/reference_answer",
166
+ "dataType": "sc:Text",
167
+ "source": {
168
+ "fileSet": {
169
+ "@id": "parquet-files-for-config-default"
170
+ },
171
+ "extract": {
172
+ "column": "reference_answer"
173
+ }
174
+ }
175
+ },
176
+ {
177
+ "@type": "cr:Field",
178
+ "@id": "default/responses",
179
+ "subField": [
180
+ {
181
+ "@type": "cr:Field",
182
+ "@id": "default/responses/response_model",
183
+ "dataType": "sc:Text",
184
+ "source": {
185
+ "fileSet": {
186
+ "@id": "parquet-files-for-config-default"
187
+ },
188
+ "extract": {
189
+ "column": "responses"
190
+ },
191
+ "transform": {
192
+ "jsonPath": "response_model"
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "@type": "cr:Field",
198
+ "@id": "default/responses/response",
199
+ "dataType": "sc:Text",
200
+ "source": {
201
+ "fileSet": {
202
+ "@id": "parquet-files-for-config-default"
203
+ },
204
+ "extract": {
205
+ "column": "responses"
206
+ },
207
+ "transform": {
208
+ "jsonPath": "response"
209
+ }
210
+ }
211
+ }
212
+ ],
213
+ "repeated": true
214
+ }
215
+ ]
216
+ }
217
+ ],
218
+ "conformsTo": "http://mlcommons.org/croissant/1.0",
219
+ "name": "natural_reasoning",
220
+ "description": "NaturalReasoning is a large-scale dataset for general reasoning tasks. It consists of high-quality challenging reasoning questions backtranslated from pretraining corpora DCLM and FineMath. The questions have been deduplicated and decontaminated from popular reasoning benchmarks including MATH, GPQA, MMLU-Pro, MMLU-STEM. For each question, we extract the reference final answer from the original document from the pretraining corpora if possible. We also provide a model-generated response from\u2026 See the full description on the dataset page: https://huggingface.co/datasets/facebook/natural_reasoning.",
221
+ "alternateName": [
222
+ "facebook/natural_reasoning",
223
+ "Natural Reasoning"
224
+ ],
225
+ "creator": {
226
+ "@type": "Organization",
227
+ "name": "AI at Meta",
228
+ "url": "https://huggingface.co/facebook"
229
+ },
230
+ "keywords": [
231
+ "text-generation",
232
+ "English",
233
+ "cc-by-nc-4.0",
234
+ "1M - 10M",
235
+ "json",
236
+ "Text",
237
+ "Datasets",
238
+ "pandas",
239
+ "Croissant",
240
+ "Polars",
241
+ "arxiv:2502.13124",
242
+ "\ud83c\uddfa\ud83c\uddf8 Region: US"
243
+ ],
244
+ "license": "https://choosealicense.com/licenses/cc-by-nc-4.0/",
245
+ "url": "https://huggingface.co/datasets/facebook/natural_reasoning"
246
+ }
247
+ ```
report_croissant-validation_shitspotter.md ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CROISSANT VALIDATION REPORT
2
+ ================================================================================
3
+ ## VALIDATION RESULTS
4
+ --------------------------------------------------------------------------------
5
+ Starting validation for file: croissant
6
+ ### JSON Format Validation
7
+
8
+ The URL returned valid JSON.
9
+ ### Croissant Schema Validation
10
+
11
+ The dataset passes Croissant validation.
12
+ ### Records Generation Test (Optional)
13
+ ?
14
+ Record set 'default' failed due to generation error:
15
+
16
+ ```text
17
+ An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
18
+
19
+ Traceback (most recent call last):
20
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 70, in execute_operations_sequentially
21
+ operation(set_output_in_memory=True)
22
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/base_operation.py", line 121, in __call__
23
+ output = self.call() if inputs is None else self.call(*inputs)
24
+ ^^^^^^^^^^^^^^^^^^
25
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/operations/read.py", line 196, in call
26
+ file_content = self._read_file_content(self.node.encoding_formats, file)
27
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
28
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/operations/read.py", line 135, in _read_file_content
29
+ df = pd.read_parquet(file)
30
+ ^^^^^^^^^^^^^^^^^^^^^
31
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pandas/io/parquet.py", line 670, in read_parquet
32
+ return impl.read(
33
+ ^^^^^^^^^^
34
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pandas/io/parquet.py", line 272, in read
35
+ pa_table = self.api.parquet.read_table(
36
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
37
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pyarrow/parquet/core.py", line 1793, in read_table
38
+ dataset = ParquetDataset(
39
+ ^^^^^^^^^^^^^^^
40
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pyarrow/parquet/core.py", line 1360, in __init__
41
+ [fragment], schema=schema or fragment.physical_schema,
42
+ ^^^^^^^^^^^^^^^^^^^^^^^^
43
+ File "pyarrow/_dataset.pyx", line 1431, in pyarrow._dataset.Fragment.physical_schema.__get__
44
+ File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
45
+ File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
46
+ pyarrow.lib.ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.
47
+
48
+ The above exception was the direct cause of the following exception:
49
+
50
+ Traceback (most recent call last):
51
+ File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 61, in validate_records
52
+ raise result # re-raise actual error outside timeout
53
+ ^^^^^^^^^^^^
54
+ File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 37, in try_generate_record
55
+ next(record_iterator)
56
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__
57
+ yield from execute_operations_sequentially(
58
+ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially
59
+ raise GenerationError(
60
+ mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
61
+ ```
62
+ ## JSON-LD REFERENCE
63
+ ================================================================================
64
+ ```json
65
+ {
66
+ "@context": {
67
+ "@language": "en",
68
+ "@vocab": "https://schema.org/",
69
+ "arrayShape": "cr:arrayShape",
70
+ "citeAs": "cr:citeAs",
71
+ "column": "cr:column",
72
+ "conformsTo": "dct:conformsTo",
73
+ "cr": "http://mlcommons.org/croissant/",
74
+ "data": {
75
+ "@id": "cr:data",
76
+ "@type": "@json"
77
+ },
78
+ "dataBiases": "cr:dataBiases",
79
+ "dataCollection": "cr:dataCollection",
80
+ "dataType": {
81
+ "@id": "cr:dataType",
82
+ "@type": "@vocab"
83
+ },
84
+ "dct": "http://purl.org/dc/terms/",
85
+ "extract": "cr:extract",
86
+ "field": "cr:field",
87
+ "fileProperty": "cr:fileProperty",
88
+ "fileObject": "cr:fileObject",
89
+ "fileSet": "cr:fileSet",
90
+ "format": "cr:format",
91
+ "includes": "cr:includes",
92
+ "isArray": "cr:isArray",
93
+ "isLiveDataset": "cr:isLiveDataset",
94
+ "jsonPath": "cr:jsonPath",
95
+ "key": "cr:key",
96
+ "md5": "cr:md5",
97
+ "parentField": "cr:parentField",
98
+ "path": "cr:path",
99
+ "personalSensitiveInformation": "cr:personalSensitiveInformation",
100
+ "recordSet": "cr:recordSet",
101
+ "references": "cr:references",
102
+ "regex": "cr:regex",
103
+ "repeated": "cr:repeated",
104
+ "replace": "cr:replace",
105
+ "sc": "https://schema.org/",
106
+ "separator": "cr:separator",
107
+ "source": "cr:source",
108
+ "subField": "cr:subField",
109
+ "transform": "cr:transform",
110
+ "@base": "cr_base_iri/"
111
+ },
112
+ "@type": "sc:Dataset",
113
+ "distribution": [
114
+ {
115
+ "@type": "cr:FileObject",
116
+ "@id": "repo",
117
+ "name": "repo",
118
+ "description": "The Hugging Face git repository.",
119
+ "contentUrl": "https://huggingface.co/datasets/erotemic/shitspotter/tree/refs%2Fconvert%2Fparquet",
120
+ "encodingFormat": "git+https",
121
+ "sha256": "https://github.com/mlcommons/croissant/issues/80"
122
+ },
123
+ {
124
+ "@type": "cr:FileSet",
125
+ "@id": "parquet-files-for-config-default",
126
+ "containedIn": {
127
+ "@id": "repo"
128
+ },
129
+ "encodingFormat": "application/x-parquet",
130
+ "includes": "default/*/*.parquet"
131
+ }
132
+ ],
133
+ "recordSet": [
134
+ {
135
+ "@type": "cr:RecordSet",
136
+ "dataType": "cr:Split",
137
+ "key": {
138
+ "@id": "default_splits/split_name"
139
+ },
140
+ "@id": "default_splits",
141
+ "name": "default_splits",
142
+ "description": "Splits for the default config.",
143
+ "field": [
144
+ {
145
+ "@type": "cr:Field",
146
+ "@id": "default_splits/split_name",
147
+ "dataType": "sc:Text"
148
+ }
149
+ ],
150
+ "data": [
151
+ {
152
+ "default_splits/split_name": "train"
153
+ },
154
+ {
155
+ "default_splits/split_name": "validation"
156
+ },
157
+ {
158
+ "default_splits/split_name": "test"
159
+ }
160
+ ]
161
+ },
162
+ {
163
+ "@type": "cr:RecordSet",
164
+ "@id": "default",
165
+ "description": "erotemic/shitspotter - 'default' subset (first 5GB)\n\nAdditional information:\n- 3 splits: train, validation, test",
166
+ "field": [
167
+ {
168
+ "@type": "cr:Field",
169
+ "@id": "default/split",
170
+ "dataType": "sc:Text",
171
+ "source": {
172
+ "fileSet": {
173
+ "@id": "parquet-files-for-config-default"
174
+ },
175
+ "extract": {
176
+ "fileProperty": "fullpath"
177
+ },
178
+ "transform": {
179
+ "regex": "default/(?:partial-)?(train|validation|test)/.+parquet$"
180
+ }
181
+ },
182
+ "references": {
183
+ "field": {
184
+ "@id": "default_splits/split_name"
185
+ }
186
+ }
187
+ },
188
+ {
189
+ "@type": "cr:Field",
190
+ "@id": "default/jpg",
191
+ "dataType": "sc:ImageObject",
192
+ "source": {
193
+ "fileSet": {
194
+ "@id": "parquet-files-for-config-default"
195
+ },
196
+ "extract": {
197
+ "column": "jpg"
198
+ },
199
+ "transform": {
200
+ "jsonPath": "bytes"
201
+ }
202
+ }
203
+ },
204
+ {
205
+ "@type": "cr:Field",
206
+ "@id": "default/json",
207
+ "subField": [
208
+ {
209
+ "@type": "cr:Field",
210
+ "@id": "default/json/annotations",
211
+ "subField": [
212
+ {
213
+ "@type": "cr:Field",
214
+ "@id": "default/json/annotations/bbox",
215
+ "dataType": "cr:Int64",
216
+ "source": {
217
+ "fileSet": {
218
+ "@id": "parquet-files-for-config-default"
219
+ },
220
+ "extract": {
221
+ "column": "json"
222
+ }
223
+ },
224
+ "isArray": true,
225
+ "arrayShape": "-1"
226
+ },
227
+ {
228
+ "@type": "cr:Field",
229
+ "@id": "default/json/annotations/category_id",
230
+ "dataType": "cr:Int64",
231
+ "source": {
232
+ "fileSet": {
233
+ "@id": "parquet-files-for-config-default"
234
+ },
235
+ "extract": {
236
+ "column": "json"
237
+ },
238
+ "transform": {
239
+ "jsonPath": "category_id"
240
+ }
241
+ }
242
+ },
243
+ {
244
+ "@type": "cr:Field",
245
+ "@id": "default/json/annotations/iscrowd",
246
+ "dataType": "cr:Int64",
247
+ "source": {
248
+ "fileSet": {
249
+ "@id": "parquet-files-for-config-default"
250
+ },
251
+ "extract": {
252
+ "column": "json"
253
+ },
254
+ "transform": {
255
+ "jsonPath": "iscrowd"
256
+ }
257
+ }
258
+ },
259
+ {
260
+ "@type": "cr:Field",
261
+ "@id": "default/json/annotations/segmentation",
262
+ "subField": [
263
+ {
264
+ "@type": "cr:Field",
265
+ "@id": "default/json/annotations/segmentation/exterior",
266
+ "dataType": "cr:Float64",
267
+ "source": {
268
+ "fileSet": {
269
+ "@id": "parquet-files-for-config-default"
270
+ },
271
+ "extract": {
272
+ "column": "json"
273
+ }
274
+ },
275
+ "isArray": true,
276
+ "arrayShape": "-1,-1"
277
+ },
278
+ null
279
+ ]
280
+ }
281
+ ],
282
+ "isArray": true,
283
+ "arrayShape": "-1"
284
+ },
285
+ {
286
+ "@type": "cr:Field",
287
+ "@id": "default/json/file_name",
288
+ "dataType": "sc:Text",
289
+ "source": {
290
+ "fileSet": {
291
+ "@id": "parquet-files-for-config-default"
292
+ },
293
+ "extract": {
294
+ "column": "json"
295
+ },
296
+ "transform": {
297
+ "jsonPath": "file_name"
298
+ }
299
+ }
300
+ },
301
+ {
302
+ "@type": "cr:Field",
303
+ "@id": "default/json/height",
304
+ "dataType": "cr:Int64",
305
+ "source": {
306
+ "fileSet": {
307
+ "@id": "parquet-files-for-config-default"
308
+ },
309
+ "extract": {
310
+ "column": "json"
311
+ },
312
+ "transform": {
313
+ "jsonPath": "height"
314
+ }
315
+ }
316
+ },
317
+ {
318
+ "@type": "cr:Field",
319
+ "@id": "default/json/id",
320
+ "dataType": "cr:Int64",
321
+ "source": {
322
+ "fileSet": {
323
+ "@id": "parquet-files-for-config-default"
324
+ },
325
+ "extract": {
326
+ "column": "json"
327
+ },
328
+ "transform": {
329
+ "jsonPath": "id"
330
+ }
331
+ }
332
+ },
333
+ {
334
+ "@type": "cr:Field",
335
+ "@id": "default/json/width",
336
+ "dataType": "cr:Int64",
337
+ "source": {
338
+ "fileSet": {
339
+ "@id": "parquet-files-for-config-default"
340
+ },
341
+ "extract": {
342
+ "column": "json"
343
+ },
344
+ "transform": {
345
+ "jsonPath": "width"
346
+ }
347
+ }
348
+ }
349
+ ]
350
+ },
351
+ {
352
+ "@type": "cr:Field",
353
+ "@id": "default/__key__",
354
+ "dataType": "sc:Text",
355
+ "source": {
356
+ "fileSet": {
357
+ "@id": "parquet-files-for-config-default"
358
+ },
359
+ "extract": {
360
+ "column": "__key__"
361
+ }
362
+ }
363
+ },
364
+ {
365
+ "@type": "cr:Field",
366
+ "@id": "default/__url__",
367
+ "dataType": "sc:Text",
368
+ "source": {
369
+ "fileSet": {
370
+ "@id": "parquet-files-for-config-default"
371
+ },
372
+ "extract": {
373
+ "column": "__url__"
374
+ }
375
+ }
376
+ }
377
+ ]
378
+ }
379
+ ],
380
+ "conformsTo": "http://mlcommons.org/croissant/1.1",
381
+ "name": "shitspotter",
382
+ "description": "\n\t\n\t\t\n\t\tDataset Card for ShitSpotter (\"ScatSpotter\")\n\t\n\nShitSpotter (or \"ScatSpotter\" in formal settings) is an open dataset of images containing dog feces. \nThis dataset contains full-resolution smartphone images of dog feces (\"poop\") collected in urban outdoor \nenvironments taken using a \"before/after/negative\" protocol. \nIt includes thousands of polygon annotations of feces in varied lighting, seasonal, and terrain conditions. \nThe dataset is designed for training and evaluating object\u2026 See the full description on the dataset page: https://huggingface.co/datasets/erotemic/shitspotter.",
383
+ "alternateName": [
384
+ "erotemic/shitspotter",
385
+ "ShitSpotter"
386
+ ],
387
+ "creator": {
388
+ "@type": "Person",
389
+ "name": "Jonathan P Crall",
390
+ "url": "https://huggingface.co/erotemic"
391
+ },
392
+ "keywords": [
393
+ "object-detection",
394
+ "image-segmentation",
395
+ "English",
396
+ "cc-by-4.0",
397
+ "1K - 10K",
398
+ "webdataset",
399
+ "Image",
400
+ "Text",
401
+ "Datasets",
402
+ "WebDataset",
403
+ "Croissant",
404
+ "arxiv:2412.16473",
405
+ "\ud83c\uddfa\ud83c\uddf8 Region: US"
406
+ ],
407
+ "license": "https://choosealicense.com/licenses/cc-by-4.0/",
408
+ "url": "https://huggingface.co/datasets/erotemic/shitspotter"
409
+ }
410
+ ```
validation.py CHANGED
@@ -31,6 +31,14 @@ def validate_croissant(json_data):
31
  error_details = traceback.format_exc()
32
  error_message = f"Unexpected error during validation: {str(e)}\n\n{error_details}"
33
  return False, error_message
 
 
 
 
 
 
 
 
34
 
35
  def validate_records(json_data):
36
  """Validate that records can be generated within the time limit."""
@@ -46,15 +54,25 @@ def validate_records(json_data):
46
  for record_set in record_sets:
47
  try:
48
  records = dataset.records(record_set=record_set.uuid)
49
- _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
 
 
 
 
 
50
  results.append(f"Record set '{record_set.uuid}' passed validation.")
 
51
  except func_timeout.exceptions.FunctionTimedOut:
52
- error_message = f"Record set '{record_set.uuid}' generation took too long (>60s)"
53
- return False, error_message
 
54
  except Exception as e:
55
  error_details = traceback.format_exc()
56
- error_message = f"Record set '{record_set.uuid}' failed: {str(e)}\n\n{error_details}"
57
- return False, error_message
 
 
 
58
 
59
  return True, "\n".join(results)
60
  except Exception as e:
@@ -72,9 +90,20 @@ def generate_validation_report(filename, json_data, results):
72
  report.append(f"Starting validation for file: {filename}")
73
 
74
  # Add validation results
75
- for test_name, passed, message in results:
 
 
 
 
 
 
76
  report.append(f"### {test_name}")
77
- report.append("✓" if passed else "")
 
 
 
 
 
78
  report.append(message.strip()) # Remove any trailing newlines
79
 
80
  # Add JSON-LD reference
@@ -84,4 +113,4 @@ def generate_validation_report(filename, json_data, results):
84
  report.append(json.dumps(json_data, indent=2))
85
  report.append("```")
86
 
87
- return "\n".join(report)
 
31
  error_details = traceback.format_exc()
32
  error_message = f"Unexpected error during validation: {str(e)}\n\n{error_details}"
33
  return False, error_message
34
+
35
+ def try_generate_record(record_iterator):
36
+ try:
37
+ next(record_iterator)
38
+ return "success"
39
+ except Exception as e:
40
+ # Return the exception object to the outer function
41
+ return e
42
 
43
  def validate_records(json_data):
44
  """Validate that records can be generated within the time limit."""
 
54
  for record_set in record_sets:
55
  try:
56
  records = dataset.records(record_set=record_set.uuid)
57
+
58
+ result = func_timeout.func_timeout(WAIT_TIME, try_generate_record, args=(iter(records),))
59
+
60
+ if isinstance(result, Exception):
61
+ raise result # re-raise actual error outside timeout
62
+
63
  results.append(f"Record set '{record_set.uuid}' passed validation.")
64
+
65
  except func_timeout.exceptions.FunctionTimedOut:
66
+ error_message = f"Record set '{record_set.uuid}' generation took too long (>10 minutes)."
67
+ return False, error_message, "warning"
68
+
69
  except Exception as e:
70
  error_details = traceback.format_exc()
71
+ error_message = (
72
+ f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
73
+ f"```text\n{str(e)}\n\n{error_details}```"
74
+ )
75
+ return False, error_message, "warning"
76
 
77
  return True, "\n".join(results)
78
  except Exception as e:
 
90
  report.append(f"Starting validation for file: {filename}")
91
 
92
  # Add validation results
93
+ for result in results:
94
+ if len(result) == 4:
95
+ test_name, passed, message, status = result
96
+ else:
97
+ test_name, passed, message = result
98
+ status = "pass" if passed else "error"
99
+
100
  report.append(f"### {test_name}")
101
+ if status == "pass":
102
+ report.append("✓")
103
+ elif status == "warning":
104
+ report.append("?") # Question mark for warning
105
+ else:
106
+ report.append("✗")
107
  report.append(message.strip()) # Remove any trailing newlines
108
 
109
  # Add JSON-LD reference
 
113
  report.append(json.dumps(json_data, indent=2))
114
  report.append("```")
115
 
116
+ return "\n".join(report)