Commit
·
6ec1943
1
Parent(s):
71ddcd2
better reporting of generation issues
Browse files- __pycache__/validation.cpython-312.pyc +0 -0
- app.py +44 -30
- apt.txt +1 -0
- report_croissant-validation_Student Performance on an Entrance Examination.md +342 -0
- report_croissant-validation_credit-g.md +982 -0
- report_croissant-validation_natural_reasoning.md +247 -0
- report_croissant-validation_shitspotter.md +410 -0
- validation.py +37 -8
__pycache__/validation.cpython-312.pyc
CHANGED
Binary files a/__pycache__/validation.cpython-312.pyc and b/__pycache__/validation.cpython-312.pyc differ
|
|
app.py
CHANGED
@@ -9,35 +9,30 @@ def process_file(file):
|
|
9 |
results = []
|
10 |
json_data = None
|
11 |
|
12 |
-
# Use just the filename instead of full path
|
13 |
filename = file.name.split("/")[-1]
|
14 |
-
|
15 |
# Check 1: JSON validation
|
16 |
json_valid, json_message, json_data = validate_json(file.name)
|
17 |
-
# Remove empty checkmarks from messages
|
18 |
json_message = json_message.replace("\n✓\n", "\n")
|
19 |
-
results.append(("JSON Format Validation", json_valid, json_message))
|
20 |
-
|
21 |
if not json_valid:
|
22 |
return results, None
|
23 |
-
|
24 |
# Check 2: Croissant validation
|
25 |
-
croissant_valid, croissant_message = validate_croissant(json_data)
|
26 |
-
# Remove empty checkmarks from messages
|
27 |
croissant_message = croissant_message.replace("\n✓\n", "\n")
|
28 |
-
results.append(("Croissant Schema Validation", croissant_valid, croissant_message))
|
29 |
-
|
30 |
if not croissant_valid:
|
31 |
return results, None
|
32 |
-
|
33 |
-
# Check 3: Records validation
|
34 |
-
records_valid, records_message = validate_records(json_data)
|
35 |
-
# Remove empty checkmarks from messages
|
36 |
records_message = records_message.replace("\n✓\n", "\n")
|
37 |
-
results.append(("Records Generation Test", records_valid, records_message))
|
38 |
-
|
39 |
|
40 |
-
# Generate
|
41 |
report = generate_validation_report(filename, json_data, results)
|
42 |
|
43 |
return results, report
|
@@ -160,6 +155,10 @@ def create_ui():
|
|
160 |
.status-error {
|
161 |
background-color: #f44336 !important;
|
162 |
}
|
|
|
|
|
|
|
|
|
163 |
|
164 |
.step-details {
|
165 |
padding: 12px 15px;
|
@@ -365,9 +364,9 @@ def create_ui():
|
|
365 |
None
|
366 |
]
|
367 |
|
368 |
-
records_valid, records_message = validate_records(json_data)
|
369 |
-
results.append(("Records Generation Test", records_valid, records_message))
|
370 |
-
|
371 |
# Generate report
|
372 |
report = generate_validation_report(url.split("/")[-1], json_data, results)
|
373 |
report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
|
@@ -411,17 +410,32 @@ def create_ui():
|
|
411 |
None,
|
412 |
None
|
413 |
]
|
414 |
-
|
415 |
def build_results_html(results):
|
416 |
-
# Build validation results HTML
|
417 |
html = '<div class="validation-results">'
|
418 |
-
|
419 |
-
for i,
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
424 |
|
|
|
|
|
425 |
html += f'''
|
426 |
<div class="validation-step" id="step-{i}">
|
427 |
<div class="step-header" onclick="
|
@@ -445,10 +459,10 @@ def create_ui():
|
|
445 |
</div>
|
446 |
</div>
|
447 |
'''
|
448 |
-
|
449 |
html += '</div>'
|
450 |
return gr.update(value=html, visible=True)
|
451 |
-
|
452 |
def on_validate(file):
|
453 |
if file is None:
|
454 |
return [
|
|
|
9 |
results = []
|
10 |
json_data = None
|
11 |
|
|
|
12 |
filename = file.name.split("/")[-1]
|
13 |
+
|
14 |
# Check 1: JSON validation
|
15 |
json_valid, json_message, json_data = validate_json(file.name)
|
|
|
16 |
json_message = json_message.replace("\n✓\n", "\n")
|
17 |
+
results.append(("JSON Format Validation", json_valid, json_message, "pass" if json_valid else "error"))
|
18 |
+
|
19 |
if not json_valid:
|
20 |
return results, None
|
21 |
+
|
22 |
# Check 2: Croissant validation
|
23 |
+
croissant_valid, croissant_message = validate_croissant(json_data)
|
|
|
24 |
croissant_message = croissant_message.replace("\n✓\n", "\n")
|
25 |
+
results.append(("Croissant Schema Validation", croissant_valid, croissant_message, "pass" if croissant_valid else "error"))
|
26 |
+
|
27 |
if not croissant_valid:
|
28 |
return results, None
|
29 |
+
|
30 |
+
# Check 3: Records validation (with timeout-safe and error-specific logic)
|
31 |
+
records_valid, records_message, records_status = validate_records(json_data)
|
|
|
32 |
records_message = records_message.replace("\n✓\n", "\n")
|
33 |
+
results.append(("Records Generation Test", records_valid, records_message, records_status))
|
|
|
34 |
|
35 |
+
# Generate final report
|
36 |
report = generate_validation_report(filename, json_data, results)
|
37 |
|
38 |
return results, report
|
|
|
155 |
.status-error {
|
156 |
background-color: #f44336 !important;
|
157 |
}
|
158 |
+
|
159 |
+
.status-warning {
|
160 |
+
background-color: #ff9800 !important; /* Amber for warnings */
|
161 |
+
}
|
162 |
|
163 |
.step-details {
|
164 |
padding: 12px 15px;
|
|
|
364 |
None
|
365 |
]
|
366 |
|
367 |
+
records_valid, records_message, records_status = validate_records(json_data)
|
368 |
+
results.append(("Records Generation Test (Optional)", records_valid, records_message, records_status))
|
369 |
+
|
370 |
# Generate report
|
371 |
report = generate_validation_report(url.split("/")[-1], json_data, results)
|
372 |
report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
|
|
|
410 |
None,
|
411 |
None
|
412 |
]
|
413 |
+
|
414 |
def build_results_html(results):
|
|
|
415 |
html = '<div class="validation-results">'
|
416 |
+
|
417 |
+
for i, result in enumerate(results):
|
418 |
+
if len(result) == 4:
|
419 |
+
test_name, passed, message, status = result
|
420 |
+
else:
|
421 |
+
test_name, passed, message = result
|
422 |
+
status = "pass" if passed else "error"
|
423 |
+
|
424 |
+
if status == "pass":
|
425 |
+
status_class = "status-success"
|
426 |
+
status_icon = "✓"
|
427 |
+
message_with_emoji = "✅ " + message
|
428 |
+
elif status == "warning":
|
429 |
+
status_class = "status-warning"
|
430 |
+
status_icon = "?"
|
431 |
+
message_with_emoji = "⚠️ Could not automatically generate records. This is oftentimes not an issue (e.g. datasets could be too large or too complex), and it's not required to pass this test to submit to NeurIPS.\n\n" + message
|
432 |
+
else: # error
|
433 |
+
status_class = "status-error"
|
434 |
+
status_icon = "✗"
|
435 |
+
message_with_emoji = "❌ " + message
|
436 |
|
437 |
+
message_with_emoji = message_with_emoji.replace("\n", "<br>")
|
438 |
+
|
439 |
html += f'''
|
440 |
<div class="validation-step" id="step-{i}">
|
441 |
<div class="step-header" onclick="
|
|
|
459 |
</div>
|
460 |
</div>
|
461 |
'''
|
462 |
+
|
463 |
html += '</div>'
|
464 |
return gr.update(value=html, visible=True)
|
465 |
+
|
466 |
def on_validate(file):
|
467 |
if file is None:
|
468 |
return [
|
apt.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
git-lfs
|
report_croissant-validation_Student Performance on an Entrance Examination.md
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CROISSANT VALIDATION REPORT
|
2 |
+
================================================================================
|
3 |
+
## VALIDATION RESULTS
|
4 |
+
--------------------------------------------------------------------------------
|
5 |
+
Starting validation for file: download
|
6 |
+
### JSON Format Validation
|
7 |
+
✓
|
8 |
+
The URL returned valid JSON.
|
9 |
+
### Croissant Schema Validation
|
10 |
+
✓
|
11 |
+
The dataset passes Croissant validation.
|
12 |
+
### Records Generation Test
|
13 |
+
✓
|
14 |
+
Record set 'Student_Performance_on_an_Entrance_Examination.csv' passed validation.
|
15 |
+
## JSON-LD REFERENCE
|
16 |
+
================================================================================
|
17 |
+
```json
|
18 |
+
{
|
19 |
+
"@context": {
|
20 |
+
"@language": "en",
|
21 |
+
"@vocab": "https://schema.org/",
|
22 |
+
"citeAs": "cr:citeAs",
|
23 |
+
"column": "cr:column",
|
24 |
+
"conformsTo": "dct:conformsTo",
|
25 |
+
"cr": "http://mlcommons.org/croissant/",
|
26 |
+
"data": {
|
27 |
+
"@id": "cr:data",
|
28 |
+
"@type": "@json"
|
29 |
+
},
|
30 |
+
"dataBiases": "cr:dataBiases",
|
31 |
+
"dataCollection": "cr:dataCollection",
|
32 |
+
"dataType": {
|
33 |
+
"@id": "cr:dataType",
|
34 |
+
"@type": "@vocab"
|
35 |
+
},
|
36 |
+
"dct": "http://purl.org/dc/terms/",
|
37 |
+
"extract": "cr:extract",
|
38 |
+
"field": "cr:field",
|
39 |
+
"fileProperty": "cr:fileProperty",
|
40 |
+
"fileObject": "cr:fileObject",
|
41 |
+
"fileSet": "cr:fileSet",
|
42 |
+
"format": "cr:format",
|
43 |
+
"includes": "cr:includes",
|
44 |
+
"isEnumeration": "cr:isEnumeration",
|
45 |
+
"isLiveDataset": "cr:isLiveDataset",
|
46 |
+
"jsonPath": "cr:jsonPath",
|
47 |
+
"key": "cr:key",
|
48 |
+
"md5": "cr:md5",
|
49 |
+
"parentField": "cr:parentField",
|
50 |
+
"path": "cr:path",
|
51 |
+
"personalSensitiveInformation": "cr:personalSensitiveInformation",
|
52 |
+
"recordSet": "cr:recordSet",
|
53 |
+
"references": "cr:references",
|
54 |
+
"regex": "cr:regex",
|
55 |
+
"repeated": "cr:repeated",
|
56 |
+
"replace": "cr:replace",
|
57 |
+
"sc": "https://schema.org/",
|
58 |
+
"separator": "cr:separator",
|
59 |
+
"source": "cr:source",
|
60 |
+
"subField": "cr:subField",
|
61 |
+
"transform": "cr:transform",
|
62 |
+
"wd": "https://www.wikidata.org/wiki/",
|
63 |
+
"@base": "cr_base_iri/"
|
64 |
+
},
|
65 |
+
"alternateName": " Examining Demographic, Academic, and Socioeconomic Factors",
|
66 |
+
"conformsTo": "http://mlcommons.org/croissant/1.0",
|
67 |
+
"license": {
|
68 |
+
"@type": "sc:CreativeWork",
|
69 |
+
"name": "Other (specified in description)"
|
70 |
+
},
|
71 |
+
"distribution": [
|
72 |
+
{
|
73 |
+
"contentUrl": "https://www.kaggle.com/api/v1/datasets/download/adilshamim8/student-performance-on-an-entrance-examination?datasetVersionNumber=1",
|
74 |
+
"contentSize": "4.299 KB",
|
75 |
+
"md5": "c8RSY3Vq8U4A+IMWxNtpMQ==",
|
76 |
+
"encodingFormat": "application/zip",
|
77 |
+
"@id": "archive.zip",
|
78 |
+
"@type": "cr:FileObject",
|
79 |
+
"name": "archive.zip",
|
80 |
+
"description": "Archive containing all the contents of the Student Performance on an Entrance Examination dataset"
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"contentUrl": "Student_Performance_on_an_Entrance_Examination.csv",
|
84 |
+
"containedIn": {
|
85 |
+
"@id": "archive.zip"
|
86 |
+
},
|
87 |
+
"encodingFormat": "text/csv",
|
88 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject",
|
89 |
+
"@type": "cr:FileObject",
|
90 |
+
"name": "Student_Performance_on_an_Entrance_Examination.csv",
|
91 |
+
"description": "- **Gender** \n *Description:* Indicates the candidate\u2019s gender (e.g., Male, Female). This field helps in analyzing performance trends and demographic differences based on gender.\n\n- **Caste** \n *Description:* Specifies the caste category of the candidate (such as General, OBC, SC, ST, etc.). This information can be used to explore socio-cultural factors and their influence on academic performance.\n\n- **coaching** \n *Description:* Denotes whether the candidate attended any coaching classes prior to the examination. It typically categorizes candidates into those who attended coaching within Assam, outside Assam, or not at all, providing insights into the role of supplementary education.\n\n- **Class_ten_education** \n *Description:* Records the board or institution where the candidate completed their Class X education. This can be useful for assessing the impact of the quality of secondary education on subsequent exam performance.\n\n- **twelve_education** \n *Description:* Indicates the board or institution where the candidate completed their Class XII education. Analyzing this field can reveal differences in educational standards and curricula that may affect entrance exam outcomes.\n\n- **medium** \n *Description:* Specifies the medium of instruction used during the candidate\u2019s Class XII education (e.g., English, Assamese, etc.). The medium of instruction might influence comprehension and performance in the exam.\n\n- **Class_X_Percentage** \n *Description:* Represents the percentage marks secured by the candidate in their Class X examinations. This score serves as a baseline indicator of academic ability and prior educational attainment.\n\n- **Class_XII_Percentage** \n *Description:* Denotes the percentage marks achieved by the candidate in their Class XII examinations, providing further insight into their academic consistency and preparation for the entrance exam.\n\n- **Father_occupation** \n *Description:* Captures the occupation of the candidate\u2019s father. This socioeconomic indicator can help in understanding how parental employment and associated factors might influence educational opportunities and performance.\n\n- **Mother_occupation** \n *Description:* Captures the occupation of the candidate\u2019s mother. Like the father's occupation, this field contributes to a broader view of the candidate's socioeconomic background and its potential impact on academic success.\n\n- **time** \n *Description:* Records the time or session related to the exam or data collection. This field can be useful for tracking trends over time or correlating performance with specific examination sessions.\n\n- **Performance** \n *Description:* Represents the candidate\u2019s performance in the Common Entrance Examination (CEE). This could be presented as a numeric score, grade, or categorical outcome (e.g., pass/fail), and serves as the primary variable for assessing academic achievement in the dataset."
|
92 |
+
}
|
93 |
+
],
|
94 |
+
"recordSet": [
|
95 |
+
{
|
96 |
+
"field": [
|
97 |
+
{
|
98 |
+
"dataType": [
|
99 |
+
"sc:Text"
|
100 |
+
],
|
101 |
+
"source": {
|
102 |
+
"fileObject": {
|
103 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
104 |
+
},
|
105 |
+
"extract": {
|
106 |
+
"column": "Gender"
|
107 |
+
}
|
108 |
+
},
|
109 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Gender",
|
110 |
+
"@type": "cr:Field",
|
111 |
+
"name": "Gender",
|
112 |
+
"description": "Indicates the candidate\u2019s gender (e.g., Male, Female). This field helps in analyzing performance trends and demographic differences based on gender."
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"dataType": [
|
116 |
+
"sc:Text"
|
117 |
+
],
|
118 |
+
"source": {
|
119 |
+
"fileObject": {
|
120 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
121 |
+
},
|
122 |
+
"extract": {
|
123 |
+
"column": "Caste"
|
124 |
+
}
|
125 |
+
},
|
126 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Caste",
|
127 |
+
"@type": "cr:Field",
|
128 |
+
"name": "Caste",
|
129 |
+
"description": "Specifies the caste category of the candidate (such as General, OBC, SC, ST, etc.). This information can be used to explore socio-cultural factors and their influence on academic performance."
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"dataType": [
|
133 |
+
"sc:Text"
|
134 |
+
],
|
135 |
+
"source": {
|
136 |
+
"fileObject": {
|
137 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
138 |
+
},
|
139 |
+
"extract": {
|
140 |
+
"column": "coaching"
|
141 |
+
}
|
142 |
+
},
|
143 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/coaching",
|
144 |
+
"@type": "cr:Field",
|
145 |
+
"name": "coaching",
|
146 |
+
"description": " Denotes whether the candidate attended any coaching classes prior to the examination. It typically categorizes candidates into those who attended coaching within Assam, outside Assam, or not at all, providing insights into the role of supplementary education."
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataType": [
|
150 |
+
"sc:Text"
|
151 |
+
],
|
152 |
+
"source": {
|
153 |
+
"fileObject": {
|
154 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
155 |
+
},
|
156 |
+
"extract": {
|
157 |
+
"column": "Class_ten_education"
|
158 |
+
}
|
159 |
+
},
|
160 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Class_ten_education",
|
161 |
+
"@type": "cr:Field",
|
162 |
+
"name": "Class_ten_education",
|
163 |
+
"description": " Records the board or institution where the candidate completed their Class X education. This can be useful for assessing the impact of the quality of secondary education on subsequent exam performance."
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"dataType": [
|
167 |
+
"sc:Text"
|
168 |
+
],
|
169 |
+
"source": {
|
170 |
+
"fileObject": {
|
171 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
172 |
+
},
|
173 |
+
"extract": {
|
174 |
+
"column": "twelve_education"
|
175 |
+
}
|
176 |
+
},
|
177 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/twelve_education",
|
178 |
+
"@type": "cr:Field",
|
179 |
+
"name": "twelve_education",
|
180 |
+
"description": " Indicates the board or institution where the candidate completed their Class XII education. Analyzing this field can reveal differences in educational standards and curricula that may affect entrance exam outcomes."
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"dataType": [
|
184 |
+
"sc:Text"
|
185 |
+
],
|
186 |
+
"source": {
|
187 |
+
"fileObject": {
|
188 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
189 |
+
},
|
190 |
+
"extract": {
|
191 |
+
"column": "medium"
|
192 |
+
}
|
193 |
+
},
|
194 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/medium",
|
195 |
+
"@type": "cr:Field",
|
196 |
+
"name": "medium",
|
197 |
+
"description": "Specifies the medium of instruction used during the candidate\u2019s Class XII education (e.g., English, Assamese, etc.). The medium of instruction might influence comprehension and performance in the exam."
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"dataType": [
|
201 |
+
"sc:Text"
|
202 |
+
],
|
203 |
+
"source": {
|
204 |
+
"fileObject": {
|
205 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
206 |
+
},
|
207 |
+
"extract": {
|
208 |
+
"column": "Class_X_Percentage"
|
209 |
+
}
|
210 |
+
},
|
211 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Class_X_Percentage",
|
212 |
+
"@type": "cr:Field",
|
213 |
+
"name": "Class_X_Percentage",
|
214 |
+
"description": " Represents the percentage marks secured by the candidate in their Class X examinations. This score serves as a baseline indicator of academic ability and prior educational attainment."
|
215 |
+
},
|
216 |
+
{
|
217 |
+
"dataType": [
|
218 |
+
"sc:Text"
|
219 |
+
],
|
220 |
+
"source": {
|
221 |
+
"fileObject": {
|
222 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
223 |
+
},
|
224 |
+
"extract": {
|
225 |
+
"column": "Class_XII_Percentage"
|
226 |
+
}
|
227 |
+
},
|
228 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Class_XII_Percentage",
|
229 |
+
"@type": "cr:Field",
|
230 |
+
"name": "Class_XII_Percentage",
|
231 |
+
"description": " Denotes the percentage marks achieved by the candidate in their Class XII examinations, providing further insight into their academic consistency and preparation for the entrance exam."
|
232 |
+
},
|
233 |
+
{
|
234 |
+
"dataType": [
|
235 |
+
"sc:Text"
|
236 |
+
],
|
237 |
+
"source": {
|
238 |
+
"fileObject": {
|
239 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
240 |
+
},
|
241 |
+
"extract": {
|
242 |
+
"column": "Father_occupation"
|
243 |
+
}
|
244 |
+
},
|
245 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Father_occupation",
|
246 |
+
"@type": "cr:Field",
|
247 |
+
"name": "Father_occupation",
|
248 |
+
"description": "Captures the occupation of the candidate\u2019s father. This socioeconomic indicator can help in understanding how parental employment and associated factors might influence educational opportunities and performance."
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"dataType": [
|
252 |
+
"sc:Text"
|
253 |
+
],
|
254 |
+
"source": {
|
255 |
+
"fileObject": {
|
256 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
257 |
+
},
|
258 |
+
"extract": {
|
259 |
+
"column": "Mother_occupation"
|
260 |
+
}
|
261 |
+
},
|
262 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Mother_occupation",
|
263 |
+
"@type": "cr:Field",
|
264 |
+
"name": "Mother_occupation",
|
265 |
+
"description": " Captures the occupation of the candidate\u2019s mother. Like the father's occupation, this field contributes to a broader view of the candidate's socioeconomic background and its potential impact on academic success."
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"dataType": [
|
269 |
+
"sc:Text"
|
270 |
+
],
|
271 |
+
"source": {
|
272 |
+
"fileObject": {
|
273 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
274 |
+
},
|
275 |
+
"extract": {
|
276 |
+
"column": "time"
|
277 |
+
}
|
278 |
+
},
|
279 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/time",
|
280 |
+
"@type": "cr:Field",
|
281 |
+
"name": "time",
|
282 |
+
"description": " Records the time or session related to the exam or data collection. This field can be useful for tracking trends over time or correlating performance with specific examination sessions."
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"dataType": [
|
286 |
+
"sc:Text"
|
287 |
+
],
|
288 |
+
"source": {
|
289 |
+
"fileObject": {
|
290 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv_fileobject"
|
291 |
+
},
|
292 |
+
"extract": {
|
293 |
+
"column": "Performance"
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv/Performance",
|
297 |
+
"@type": "cr:Field",
|
298 |
+
"name": "Performance",
|
299 |
+
"description": " Represents the candidate\u2019s performance in the Common Entrance Examination (CEE). This could be presented as a numeric score, grade, or categorical outcome (e.g., pass/fail), and serves as the primary variable for assessing academic achievement in the dataset."
|
300 |
+
}
|
301 |
+
],
|
302 |
+
"@id": "Student_Performance_on_an_Entrance_Examination.csv",
|
303 |
+
"@type": "cr:RecordSet",
|
304 |
+
"name": "Student_Performance_on_an_Entrance_Examination.csv",
|
305 |
+
"description": "- **Gender** \n *Description:* Indicates the candidate\u2019s gender (e.g., Male, Female). This field helps in analyzing performance trends and demographic differences based on gender.\n\n- **Caste** \n *Description:* Specifies the caste category of the candidate (such as General, OBC, SC, ST, etc.). This information can be used to explore socio-cultural factors and their influence on academic performance.\n\n- **coaching** \n *Description:* Denotes whether the candidate attended any coaching classes prior to the examination. It typically categorizes candidates into those who attended coaching within Assam, outside Assam, or not at all, providing insights into the role of supplementary education.\n\n- **Class_ten_education** \n *Description:* Records the board or institution where the candidate completed their Class X education. This can be useful for assessing the impact of the quality of secondary education on subsequent exam performance.\n\n- **twelve_education** \n *Description:* Indicates the board or institution where the candidate completed their Class XII education. Analyzing this field can reveal differences in educational standards and curricula that may affect entrance exam outcomes.\n\n- **medium** \n *Description:* Specifies the medium of instruction used during the candidate\u2019s Class XII education (e.g., English, Assamese, etc.). The medium of instruction might influence comprehension and performance in the exam.\n\n- **Class_X_Percentage** \n *Description:* Represents the percentage marks secured by the candidate in their Class X examinations. This score serves as a baseline indicator of academic ability and prior educational attainment.\n\n- **Class_XII_Percentage** \n *Description:* Denotes the percentage marks achieved by the candidate in their Class XII examinations, providing further insight into their academic consistency and preparation for the entrance exam.\n\n- **Father_occupation** \n *Description:* Captures the occupation of the candidate\u2019s father. This socioeconomic indicator can help in understanding how parental employment and associated factors might influence educational opportunities and performance.\n\n- **Mother_occupation** \n *Description:* Captures the occupation of the candidate\u2019s mother. Like the father's occupation, this field contributes to a broader view of the candidate's socioeconomic background and its potential impact on academic success.\n\n- **time** \n *Description:* Records the time or session related to the exam or data collection. This field can be useful for tracking trends over time or correlating performance with specific examination sessions.\n\n- **Performance** \n *Description:* Represents the candidate\u2019s performance in the Common Entrance Examination (CEE). This could be presented as a numeric score, grade, or categorical outcome (e.g., pass/fail), and serves as the primary variable for assessing academic achievement in the dataset."
|
306 |
+
}
|
307 |
+
],
|
308 |
+
"version": 1,
|
309 |
+
"keywords": [
|
310 |
+
"subject > people and society > education",
|
311 |
+
"technique > data visualization",
|
312 |
+
"technique > exploratory data analysis",
|
313 |
+
"subject > people and society > education > universities and colleges",
|
314 |
+
"subject > people and society > social science",
|
315 |
+
"subject > people and society > education > standardized testing"
|
316 |
+
],
|
317 |
+
"isAccessibleForFree": true,
|
318 |
+
"includedInDataCatalog": {
|
319 |
+
"@type": "sc:DataCatalog",
|
320 |
+
"name": "Kaggle",
|
321 |
+
"url": "https://www.kaggle.com"
|
322 |
+
},
|
323 |
+
"creator": {
|
324 |
+
"@type": "sc:Person",
|
325 |
+
"name": "Adil Shamim",
|
326 |
+
"url": "/adilshamim8",
|
327 |
+
"image": "https://storage.googleapis.com/kaggle-avatars/thumbnails/22146488-kg.jpg?t=2025-02-08-13-40-43"
|
328 |
+
},
|
329 |
+
"publisher": {
|
330 |
+
"@type": "sc:Organization",
|
331 |
+
"name": "Kaggle",
|
332 |
+
"url": "https://www.kaggle.com/organizations/kaggle",
|
333 |
+
"image": "https://storage.googleapis.com/kaggle-organizations/4/thumbnail.png"
|
334 |
+
},
|
335 |
+
"thumbnailUrl": "https://storage.googleapis.com/kaggle-datasets-images/6783385/10912302/0f54936fde1351d0247218871f9c6336/dataset-card.jpg?t=2025-03-04-00-24-52",
|
336 |
+
"dateModified": "2025-03-04T00:09:21.697",
|
337 |
+
"@type": "sc:Dataset",
|
338 |
+
"name": "Student Performance on an Entrance Examination",
|
339 |
+
"url": "https://www.kaggle.com/datasets/adilshamim8/student-performance-on-an-entrance-examination/versions/1",
|
340 |
+
"description": "\n\nThis dataset contains comprehensive information regarding candidates' performance in a common entrance examination, alongside various demographic and academic indicators. It is designed to support analysis into the factors influencing success in competitive exams and can serve as a valuable resource for educational researchers and data scientists.\n\n#### Key Features:\n- **Examination Performance:** Data reflecting the candidate\u2019s results in the entrance examination.\n- **Candidate Demographics:** \n - **Sex:** Gender of the candidate.\n - **Caste:** Caste classification of the candidate.\n- **Coaching Details:**\n - Information on whether the candidate attended coaching classes within Assam, outside Assam, or did not attend any coaching.\n- **Educational Background:**\n - **Board Details:** Names of the boards where the candidate studied during Class X and Class XII.\n - **Medium of Instruction:** The medium used for teaching during Class XII.\n- **Academic Performance:**\n - **Class X Percentage:** Marks secured at the Class X level.\n - **Class XII Percentage:** Marks secured at the Class XII level.\n- **Parental Occupation:**\n - Occupation details for both the candidate's father and mother, which can help analyze socioeconomic influences on performance.\n\n#### Use Cases:\n- **Performance Analysis:** Examine correlations between educational background, coaching, and exam performance.\n- **Predictive Modeling:** Develop models to predict exam outcomes based on prior academic results and demographic factors.\n- **Educational Research:** Explore the impact of socio-economic and educational variables on academic success.\n\nThis dataset is ideal for conducting in-depth studies into the determinants of academic achievement and for designing interventions to improve student performance in competitive exams."
|
341 |
+
}
|
342 |
+
```
|
report_croissant-validation_credit-g.md
ADDED
@@ -0,0 +1,982 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CROISSANT VALIDATION REPORT
|
2 |
+
================================================================================
|
3 |
+
## VALIDATION RESULTS
|
4 |
+
--------------------------------------------------------------------------------
|
5 |
+
Starting validation for file: dataset_31_croissant 14.json
|
6 |
+
### JSON Format Validation
|
7 |
+
✓
|
8 |
+
The file is valid JSON.
|
9 |
+
### Croissant Schema Validation
|
10 |
+
✓
|
11 |
+
The dataset passes Croissant validation.
|
12 |
+
### Records Generation Test
|
13 |
+
✗
|
14 |
+
Record set '_:Ne3c47f5599c9458993fb484e2e59014e' failed: An error occured during the sequential generation of the dataset, more specifically during the operation Join(_:Ne3c47f5599c9458993fb484e2e59014e)
|
15 |
+
|
16 |
+
Traceback (most recent call last):
|
17 |
+
File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in validate_records
|
18 |
+
_ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
|
19 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
20 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/dafunc.py", line 108, in func_timeout
|
21 |
+
raise_exception(exception)
|
22 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/py3_raise.py", line 7, in raise_exception
|
23 |
+
raise exception[0] from None
|
24 |
+
File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in <lambda>
|
25 |
+
_ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
|
26 |
+
^^^^^^^^^^^^^^^^^^^
|
27 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__
|
28 |
+
yield from execute_operations_sequentially(
|
29 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially
|
30 |
+
raise GenerationError(
|
31 |
+
mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Join(_:Ne3c47f5599c9458993fb484e2e59014e)
|
32 |
+
## JSON-LD REFERENCE
|
33 |
+
================================================================================
|
34 |
+
```json
|
35 |
+
{
|
36 |
+
"@context": {
|
37 |
+
"@language": "en",
|
38 |
+
"@vocab": "https://schema.org/",
|
39 |
+
"citeAs": "cr:citeAs",
|
40 |
+
"column": "cr:column",
|
41 |
+
"conformsTo": "dct:conformsTo",
|
42 |
+
"cr": "http://mlcommons.org/croissant/",
|
43 |
+
"rai": "http://mlcommons.org/croissant/RAI/",
|
44 |
+
"data": {
|
45 |
+
"@id": "cr:data",
|
46 |
+
"@type": "@json"
|
47 |
+
},
|
48 |
+
"dataType": {
|
49 |
+
"@id": "cr:dataType",
|
50 |
+
"@type": "@vocab"
|
51 |
+
},
|
52 |
+
"dct": "http://purl.org/dc/terms/",
|
53 |
+
"examples": {
|
54 |
+
"@id": "cr:examples",
|
55 |
+
"@type": "@json"
|
56 |
+
},
|
57 |
+
"extract": "cr:extract",
|
58 |
+
"field": "cr:field",
|
59 |
+
"fileProperty": "cr:fileProperty",
|
60 |
+
"fileObject": "cr:fileObject",
|
61 |
+
"fileSet": "cr:fileSet",
|
62 |
+
"format": "cr:format",
|
63 |
+
"includes": "cr:includes",
|
64 |
+
"isLiveDataset": "cr:isLiveDataset",
|
65 |
+
"jsonPath": "cr:jsonPath",
|
66 |
+
"key": "cr:key",
|
67 |
+
"md5": "cr:md5",
|
68 |
+
"parentField": "cr:parentField",
|
69 |
+
"path": "cr:path",
|
70 |
+
"recordSet": "cr:recordSet",
|
71 |
+
"references": "cr:references",
|
72 |
+
"regex": "cr:regex",
|
73 |
+
"repeated": "cr:repeated",
|
74 |
+
"replace": "cr:replace",
|
75 |
+
"sc": "https://schema.org/",
|
76 |
+
"separator": "cr:separator",
|
77 |
+
"source": "cr:source",
|
78 |
+
"subField": "cr:subField",
|
79 |
+
"transform": "cr:transform",
|
80 |
+
"@base": "cr_base_iri/"
|
81 |
+
},
|
82 |
+
"@type": "sc:Dataset",
|
83 |
+
"citeAs": "https://dl.acm.org/doi/abs/10.1145/967900.968104",
|
84 |
+
"conformsTo": "http://mlcommons.org/croissant/1.0",
|
85 |
+
"creator": [
|
86 |
+
{
|
87 |
+
"@type": "sc:Person",
|
88 |
+
"name": "Dr. Hans Hofmann"
|
89 |
+
}
|
90 |
+
],
|
91 |
+
"dateCreated": "2014-04-06T23:21:47",
|
92 |
+
"datePublished": "1994-11-17T00:00:00",
|
93 |
+
"description": "**Author**: Dr. Hans Hofmann \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994 \n**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n**German Credit dataset** \nThis dataset classifies people described by a set of attributes as good or bad credit risks.\n\nThis dataset comes with a cost matrix: \n``` \nGood Bad (predicted) \nGood 0 1 (actual) \nBad 5 0 \n```\n\nIt is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1). \n\n### Attribute description \n\n1. Status of existing checking account, in Deutsche Mark. \n2. Duration in months \n3. Credit history (credits taken, paid back duly, delays, critical accounts) \n4. Purpose of the credit (car, television,...) \n5. Credit amount \n6. Status of savings account/bonds, in Deutsche Mark. \n7. Present employment, in number of years. \n8. Installment rate in percentage of disposable income \n9. Personal status (married, single,...) and sex \n10. Other debtors / guarantors \n11. Present residence since X years \n12. Property (e.g. real estate) \n13. Age in years \n14. Other installment plans (banks, stores) \n15. Housing (rent, own,...) \n16. Number of existing credits at this bank \n17. Job \n18. Number of people being liable to provide maintenance for \n19. Telephone (yes,no) \n20. Foreign worker (yes,no)",
|
94 |
+
"inLanguage": "en",
|
95 |
+
"isAccessibleForFree": true,
|
96 |
+
"keywords": [
|
97 |
+
"credit_scoring",
|
98 |
+
"Data Science",
|
99 |
+
"Economics",
|
100 |
+
"finance_problem",
|
101 |
+
"mythbusting_1",
|
102 |
+
"OpenML-CC18",
|
103 |
+
"OpenML100",
|
104 |
+
"Statistics",
|
105 |
+
"study_1",
|
106 |
+
"study_123",
|
107 |
+
"study_14",
|
108 |
+
"study_144",
|
109 |
+
"study_15",
|
110 |
+
"study_20",
|
111 |
+
"study_218",
|
112 |
+
"study_241",
|
113 |
+
"study_34",
|
114 |
+
"study_37",
|
115 |
+
"study_41",
|
116 |
+
"study_50",
|
117 |
+
"study_52",
|
118 |
+
"study_7",
|
119 |
+
"study_70",
|
120 |
+
"study_98",
|
121 |
+
"study_99",
|
122 |
+
"uci"
|
123 |
+
],
|
124 |
+
"license": "Public",
|
125 |
+
"name": "credit-g",
|
126 |
+
"sameAs": "https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)",
|
127 |
+
"url": "https://www.openml.org/search?type=data&id=31",
|
128 |
+
"version": 1,
|
129 |
+
"distribution": [
|
130 |
+
{
|
131 |
+
"@type": "cr:FileObject",
|
132 |
+
"@id": "data-file",
|
133 |
+
"name": "data-file",
|
134 |
+
"description": "Data file belonging to the dataset.",
|
135 |
+
"contentUrl": "https://api.openml.org/data/v1/download/31/credit-g.arff",
|
136 |
+
"encodingFormat": "text/plain",
|
137 |
+
"md5": "9a475053fed0c26ee95cd4525e50074c"
|
138 |
+
}
|
139 |
+
],
|
140 |
+
"recordSet": [
|
141 |
+
{
|
142 |
+
"@type": "cr:RecordSet",
|
143 |
+
"@id": "enumerations/checking_status",
|
144 |
+
"name": "checking_status",
|
145 |
+
"description": "Possible values for checking_status",
|
146 |
+
"dataType": "sc:Enumeration",
|
147 |
+
"field": [
|
148 |
+
{
|
149 |
+
"@type": "cr:Field",
|
150 |
+
"@id": "enumerations/checking_status/value",
|
151 |
+
"name": "value",
|
152 |
+
"description": "The value of checking_status.",
|
153 |
+
"dataType": "sc:Text"
|
154 |
+
}
|
155 |
+
],
|
156 |
+
"data": [
|
157 |
+
{
|
158 |
+
"enumerations/checking_status/value": "0<=X<200"
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"enumerations/checking_status/value": "<0"
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"enumerations/checking_status/value": ">=200"
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"enumerations/checking_status/value": "no checking"
|
168 |
+
}
|
169 |
+
]
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"@type": "cr:RecordSet",
|
173 |
+
"@id": "enumerations/credit_history",
|
174 |
+
"name": "credit_history",
|
175 |
+
"description": "Possible values for credit_history",
|
176 |
+
"dataType": "sc:Enumeration",
|
177 |
+
"field": [
|
178 |
+
{
|
179 |
+
"@type": "cr:Field",
|
180 |
+
"@id": "enumerations/credit_history/value",
|
181 |
+
"name": "value",
|
182 |
+
"description": "The value of credit_history.",
|
183 |
+
"dataType": "sc:Text"
|
184 |
+
}
|
185 |
+
],
|
186 |
+
"data": [
|
187 |
+
{
|
188 |
+
"enumerations/credit_history/value": "all paid"
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"enumerations/credit_history/value": "critical/other existing credit"
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"enumerations/credit_history/value": "delayed previously"
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"enumerations/credit_history/value": "existing paid"
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"enumerations/credit_history/value": "no credits/all paid"
|
201 |
+
}
|
202 |
+
]
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"@type": "cr:RecordSet",
|
206 |
+
"@id": "enumerations/purpose",
|
207 |
+
"name": "purpose",
|
208 |
+
"description": "Possible values for purpose",
|
209 |
+
"dataType": "sc:Enumeration",
|
210 |
+
"field": [
|
211 |
+
{
|
212 |
+
"@type": "cr:Field",
|
213 |
+
"@id": "enumerations/purpose/value",
|
214 |
+
"name": "value",
|
215 |
+
"description": "The value of purpose.",
|
216 |
+
"dataType": "sc:Text"
|
217 |
+
}
|
218 |
+
],
|
219 |
+
"data": [
|
220 |
+
{
|
221 |
+
"enumerations/purpose/value": "business"
|
222 |
+
},
|
223 |
+
{
|
224 |
+
"enumerations/purpose/value": "domestic appliance"
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"enumerations/purpose/value": "education"
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"enumerations/purpose/value": "furniture/equipment"
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"enumerations/purpose/value": "new car"
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"enumerations/purpose/value": "other"
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"enumerations/purpose/value": "radio/tv"
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"enumerations/purpose/value": "repairs"
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"enumerations/purpose/value": "retraining"
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"enumerations/purpose/value": "used car"
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"enumerations/purpose/value": "vacation"
|
252 |
+
}
|
253 |
+
]
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"@type": "cr:RecordSet",
|
257 |
+
"@id": "enumerations/savings_status",
|
258 |
+
"name": "savings_status",
|
259 |
+
"description": "Possible values for savings_status",
|
260 |
+
"dataType": "sc:Enumeration",
|
261 |
+
"field": [
|
262 |
+
{
|
263 |
+
"@type": "cr:Field",
|
264 |
+
"@id": "enumerations/savings_status/value",
|
265 |
+
"name": "value",
|
266 |
+
"description": "The value of savings_status.",
|
267 |
+
"dataType": "sc:Text"
|
268 |
+
}
|
269 |
+
],
|
270 |
+
"data": [
|
271 |
+
{
|
272 |
+
"enumerations/savings_status/value": "100<=X<500"
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"enumerations/savings_status/value": "500<=X<1000"
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"enumerations/savings_status/value": "<100"
|
279 |
+
},
|
280 |
+
{
|
281 |
+
"enumerations/savings_status/value": ">=1000"
|
282 |
+
},
|
283 |
+
{
|
284 |
+
"enumerations/savings_status/value": "no known savings"
|
285 |
+
}
|
286 |
+
]
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"@type": "cr:RecordSet",
|
290 |
+
"@id": "enumerations/employment",
|
291 |
+
"name": "employment",
|
292 |
+
"description": "Possible values for employment",
|
293 |
+
"dataType": "sc:Enumeration",
|
294 |
+
"field": [
|
295 |
+
{
|
296 |
+
"@type": "cr:Field",
|
297 |
+
"@id": "enumerations/employment/value",
|
298 |
+
"name": "value",
|
299 |
+
"description": "The value of employment.",
|
300 |
+
"dataType": "sc:Text"
|
301 |
+
}
|
302 |
+
],
|
303 |
+
"data": [
|
304 |
+
{
|
305 |
+
"enumerations/employment/value": "1<=X<4"
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"enumerations/employment/value": "4<=X<7"
|
309 |
+
},
|
310 |
+
{
|
311 |
+
"enumerations/employment/value": "<1"
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"enumerations/employment/value": ">=7"
|
315 |
+
},
|
316 |
+
{
|
317 |
+
"enumerations/employment/value": "unemployed"
|
318 |
+
}
|
319 |
+
]
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"@type": "cr:RecordSet",
|
323 |
+
"@id": "enumerations/personal_status",
|
324 |
+
"name": "personal_status",
|
325 |
+
"description": "Possible values for personal_status",
|
326 |
+
"dataType": "sc:Enumeration",
|
327 |
+
"field": [
|
328 |
+
{
|
329 |
+
"@type": "cr:Field",
|
330 |
+
"@id": "enumerations/personal_status/value",
|
331 |
+
"name": "value",
|
332 |
+
"description": "The value of personal_status.",
|
333 |
+
"dataType": "sc:Text"
|
334 |
+
}
|
335 |
+
],
|
336 |
+
"data": [
|
337 |
+
{
|
338 |
+
"enumerations/personal_status/value": "female div/dep/mar"
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"enumerations/personal_status/value": "female single"
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"enumerations/personal_status/value": "male div/sep"
|
345 |
+
},
|
346 |
+
{
|
347 |
+
"enumerations/personal_status/value": "male mar/wid"
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"enumerations/personal_status/value": "male single"
|
351 |
+
}
|
352 |
+
]
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"@type": "cr:RecordSet",
|
356 |
+
"@id": "enumerations/other_parties",
|
357 |
+
"name": "other_parties",
|
358 |
+
"description": "Possible values for other_parties",
|
359 |
+
"dataType": "sc:Enumeration",
|
360 |
+
"field": [
|
361 |
+
{
|
362 |
+
"@type": "cr:Field",
|
363 |
+
"@id": "enumerations/other_parties/value",
|
364 |
+
"name": "value",
|
365 |
+
"description": "The value of other_parties.",
|
366 |
+
"dataType": "sc:Text"
|
367 |
+
}
|
368 |
+
],
|
369 |
+
"data": [
|
370 |
+
{
|
371 |
+
"enumerations/other_parties/value": "co applicant"
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"enumerations/other_parties/value": "guarantor"
|
375 |
+
},
|
376 |
+
{
|
377 |
+
"enumerations/other_parties/value": "none"
|
378 |
+
}
|
379 |
+
]
|
380 |
+
},
|
381 |
+
{
|
382 |
+
"@type": "cr:RecordSet",
|
383 |
+
"@id": "enumerations/property_magnitude",
|
384 |
+
"name": "property_magnitude",
|
385 |
+
"description": "Possible values for property_magnitude",
|
386 |
+
"dataType": "sc:Enumeration",
|
387 |
+
"field": [
|
388 |
+
{
|
389 |
+
"@type": "cr:Field",
|
390 |
+
"@id": "enumerations/property_magnitude/value",
|
391 |
+
"name": "value",
|
392 |
+
"description": "The value of property_magnitude.",
|
393 |
+
"dataType": "sc:Text"
|
394 |
+
}
|
395 |
+
],
|
396 |
+
"data": [
|
397 |
+
{
|
398 |
+
"enumerations/property_magnitude/value": "car"
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"enumerations/property_magnitude/value": "life insurance"
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"enumerations/property_magnitude/value": "no known property"
|
405 |
+
},
|
406 |
+
{
|
407 |
+
"enumerations/property_magnitude/value": "real estate"
|
408 |
+
}
|
409 |
+
]
|
410 |
+
},
|
411 |
+
{
|
412 |
+
"@type": "cr:RecordSet",
|
413 |
+
"@id": "enumerations/other_payment_plans",
|
414 |
+
"name": "other_payment_plans",
|
415 |
+
"description": "Possible values for other_payment_plans",
|
416 |
+
"dataType": "sc:Enumeration",
|
417 |
+
"field": [
|
418 |
+
{
|
419 |
+
"@type": "cr:Field",
|
420 |
+
"@id": "enumerations/other_payment_plans/value",
|
421 |
+
"name": "value",
|
422 |
+
"description": "The value of other_payment_plans.",
|
423 |
+
"dataType": "sc:Text"
|
424 |
+
}
|
425 |
+
],
|
426 |
+
"data": [
|
427 |
+
{
|
428 |
+
"enumerations/other_payment_plans/value": "bank"
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"enumerations/other_payment_plans/value": "none"
|
432 |
+
},
|
433 |
+
{
|
434 |
+
"enumerations/other_payment_plans/value": "stores"
|
435 |
+
}
|
436 |
+
]
|
437 |
+
},
|
438 |
+
{
|
439 |
+
"@type": "cr:RecordSet",
|
440 |
+
"@id": "enumerations/housing",
|
441 |
+
"name": "housing",
|
442 |
+
"description": "Possible values for housing",
|
443 |
+
"dataType": "sc:Enumeration",
|
444 |
+
"field": [
|
445 |
+
{
|
446 |
+
"@type": "cr:Field",
|
447 |
+
"@id": "enumerations/housing/value",
|
448 |
+
"name": "value",
|
449 |
+
"description": "The value of housing.",
|
450 |
+
"dataType": "sc:Text"
|
451 |
+
}
|
452 |
+
],
|
453 |
+
"data": [
|
454 |
+
{
|
455 |
+
"enumerations/housing/value": "for free"
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"enumerations/housing/value": "own"
|
459 |
+
},
|
460 |
+
{
|
461 |
+
"enumerations/housing/value": "rent"
|
462 |
+
}
|
463 |
+
]
|
464 |
+
},
|
465 |
+
{
|
466 |
+
"@type": "cr:RecordSet",
|
467 |
+
"@id": "enumerations/job",
|
468 |
+
"name": "job",
|
469 |
+
"description": "Possible values for job",
|
470 |
+
"dataType": "sc:Enumeration",
|
471 |
+
"field": [
|
472 |
+
{
|
473 |
+
"@type": "cr:Field",
|
474 |
+
"@id": "enumerations/job/value",
|
475 |
+
"name": "value",
|
476 |
+
"description": "The value of job.",
|
477 |
+
"dataType": "sc:Text"
|
478 |
+
}
|
479 |
+
],
|
480 |
+
"data": [
|
481 |
+
{
|
482 |
+
"enumerations/job/value": "high qualif/self emp/mgmt"
|
483 |
+
},
|
484 |
+
{
|
485 |
+
"enumerations/job/value": "skilled"
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"enumerations/job/value": "unemp/unskilled non res"
|
489 |
+
},
|
490 |
+
{
|
491 |
+
"enumerations/job/value": "unskilled resident"
|
492 |
+
}
|
493 |
+
]
|
494 |
+
},
|
495 |
+
{
|
496 |
+
"@type": "cr:RecordSet",
|
497 |
+
"@id": "enumerations/own_telephone",
|
498 |
+
"name": "own_telephone",
|
499 |
+
"description": "Possible values for own_telephone",
|
500 |
+
"dataType": "sc:Enumeration",
|
501 |
+
"field": [
|
502 |
+
{
|
503 |
+
"@type": "cr:Field",
|
504 |
+
"@id": "enumerations/own_telephone/value",
|
505 |
+
"name": "value",
|
506 |
+
"description": "The value of own_telephone.",
|
507 |
+
"dataType": "sc:Text"
|
508 |
+
}
|
509 |
+
],
|
510 |
+
"data": [
|
511 |
+
{
|
512 |
+
"enumerations/own_telephone/value": "none"
|
513 |
+
},
|
514 |
+
{
|
515 |
+
"enumerations/own_telephone/value": "yes"
|
516 |
+
}
|
517 |
+
]
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"@type": "cr:RecordSet",
|
521 |
+
"@id": "enumerations/foreign_worker",
|
522 |
+
"name": "foreign_worker",
|
523 |
+
"description": "Possible values for foreign_worker",
|
524 |
+
"dataType": "sc:Enumeration",
|
525 |
+
"field": [
|
526 |
+
{
|
527 |
+
"@type": "cr:Field",
|
528 |
+
"@id": "enumerations/foreign_worker/value",
|
529 |
+
"name": "value",
|
530 |
+
"description": "The value of foreign_worker.",
|
531 |
+
"dataType": "sc:Text"
|
532 |
+
}
|
533 |
+
],
|
534 |
+
"data": [
|
535 |
+
{
|
536 |
+
"enumerations/foreign_worker/value": "no"
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"enumerations/foreign_worker/value": "yes"
|
540 |
+
}
|
541 |
+
]
|
542 |
+
},
|
543 |
+
{
|
544 |
+
"@type": "cr:RecordSet",
|
545 |
+
"@id": "enumerations/class",
|
546 |
+
"name": "class",
|
547 |
+
"description": "Possible values for class",
|
548 |
+
"dataType": "sc:Enumeration",
|
549 |
+
"field": [
|
550 |
+
{
|
551 |
+
"@type": "cr:Field",
|
552 |
+
"@id": "enumerations/class/value",
|
553 |
+
"name": "value",
|
554 |
+
"description": "The value of class.",
|
555 |
+
"dataType": "sc:Text"
|
556 |
+
}
|
557 |
+
],
|
558 |
+
"data": [
|
559 |
+
{
|
560 |
+
"enumerations/class/value": "bad"
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"enumerations/class/value": "good"
|
564 |
+
}
|
565 |
+
]
|
566 |
+
},
|
567 |
+
{
|
568 |
+
"@type": "cr:RecordSet",
|
569 |
+
"name": "data-file-description",
|
570 |
+
"description": "Listing the fields of the data.",
|
571 |
+
"field": [
|
572 |
+
{
|
573 |
+
"@type": "cr:Field",
|
574 |
+
"@id": "features/0-checking_status",
|
575 |
+
"name": "checking_status",
|
576 |
+
"description": "checking_status - a field.",
|
577 |
+
"dataType": "sc:Text",
|
578 |
+
"references": {
|
579 |
+
"field": {
|
580 |
+
"@id": "enumerations/checking_status/value"
|
581 |
+
}
|
582 |
+
},
|
583 |
+
"source": {
|
584 |
+
"fileObject": {
|
585 |
+
"@id": "data-file"
|
586 |
+
},
|
587 |
+
"extract": {
|
588 |
+
"column": "checking_status"
|
589 |
+
}
|
590 |
+
}
|
591 |
+
},
|
592 |
+
{
|
593 |
+
"@type": "cr:Field",
|
594 |
+
"@id": "features/1-duration",
|
595 |
+
"name": "duration",
|
596 |
+
"description": "duration - a field.",
|
597 |
+
"dataType": [
|
598 |
+
"sc:Float",
|
599 |
+
"sc:Integer"
|
600 |
+
],
|
601 |
+
"source": {
|
602 |
+
"fileObject": {
|
603 |
+
"@id": "data-file"
|
604 |
+
},
|
605 |
+
"extract": {
|
606 |
+
"column": "duration"
|
607 |
+
}
|
608 |
+
}
|
609 |
+
},
|
610 |
+
{
|
611 |
+
"@type": "cr:Field",
|
612 |
+
"@id": "features/2-credit_history",
|
613 |
+
"name": "credit_history",
|
614 |
+
"description": "credit_history - a field.",
|
615 |
+
"dataType": "sc:Text",
|
616 |
+
"references": {
|
617 |
+
"field": {
|
618 |
+
"@id": "enumerations/credit_history/value"
|
619 |
+
}
|
620 |
+
},
|
621 |
+
"source": {
|
622 |
+
"fileObject": {
|
623 |
+
"@id": "data-file"
|
624 |
+
},
|
625 |
+
"extract": {
|
626 |
+
"column": "credit_history"
|
627 |
+
}
|
628 |
+
}
|
629 |
+
},
|
630 |
+
{
|
631 |
+
"@type": "cr:Field",
|
632 |
+
"@id": "features/3-purpose",
|
633 |
+
"name": "purpose",
|
634 |
+
"description": "purpose - a field.",
|
635 |
+
"dataType": "sc:Text",
|
636 |
+
"references": {
|
637 |
+
"field": {
|
638 |
+
"@id": "enumerations/purpose/value"
|
639 |
+
}
|
640 |
+
},
|
641 |
+
"source": {
|
642 |
+
"fileObject": {
|
643 |
+
"@id": "data-file"
|
644 |
+
},
|
645 |
+
"extract": {
|
646 |
+
"column": "purpose"
|
647 |
+
}
|
648 |
+
}
|
649 |
+
},
|
650 |
+
{
|
651 |
+
"@type": "cr:Field",
|
652 |
+
"@id": "features/4-credit_amount",
|
653 |
+
"name": "credit_amount",
|
654 |
+
"description": "credit_amount - a field.",
|
655 |
+
"dataType": [
|
656 |
+
"sc:Float",
|
657 |
+
"sc:Integer"
|
658 |
+
],
|
659 |
+
"source": {
|
660 |
+
"fileObject": {
|
661 |
+
"@id": "data-file"
|
662 |
+
},
|
663 |
+
"extract": {
|
664 |
+
"column": "credit_amount"
|
665 |
+
}
|
666 |
+
}
|
667 |
+
},
|
668 |
+
{
|
669 |
+
"@type": "cr:Field",
|
670 |
+
"@id": "features/5-savings_status",
|
671 |
+
"name": "savings_status",
|
672 |
+
"description": "savings_status - a field.",
|
673 |
+
"dataType": "sc:Text",
|
674 |
+
"references": {
|
675 |
+
"field": {
|
676 |
+
"@id": "enumerations/savings_status/value"
|
677 |
+
}
|
678 |
+
},
|
679 |
+
"source": {
|
680 |
+
"fileObject": {
|
681 |
+
"@id": "data-file"
|
682 |
+
},
|
683 |
+
"extract": {
|
684 |
+
"column": "savings_status"
|
685 |
+
}
|
686 |
+
}
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"@type": "cr:Field",
|
690 |
+
"@id": "features/6-employment",
|
691 |
+
"name": "employment",
|
692 |
+
"description": "employment - a field.",
|
693 |
+
"dataType": "sc:Text",
|
694 |
+
"references": {
|
695 |
+
"field": {
|
696 |
+
"@id": "enumerations/employment/value"
|
697 |
+
}
|
698 |
+
},
|
699 |
+
"source": {
|
700 |
+
"fileObject": {
|
701 |
+
"@id": "data-file"
|
702 |
+
},
|
703 |
+
"extract": {
|
704 |
+
"column": "employment"
|
705 |
+
}
|
706 |
+
}
|
707 |
+
},
|
708 |
+
{
|
709 |
+
"@type": "cr:Field",
|
710 |
+
"@id": "features/7-installment_commitment",
|
711 |
+
"name": "installment_commitment",
|
712 |
+
"description": "installment_commitment - a field.",
|
713 |
+
"dataType": [
|
714 |
+
"sc:Float",
|
715 |
+
"sc:Integer"
|
716 |
+
],
|
717 |
+
"source": {
|
718 |
+
"fileObject": {
|
719 |
+
"@id": "data-file"
|
720 |
+
},
|
721 |
+
"extract": {
|
722 |
+
"column": "installment_commitment"
|
723 |
+
}
|
724 |
+
}
|
725 |
+
},
|
726 |
+
{
|
727 |
+
"@type": "cr:Field",
|
728 |
+
"@id": "features/8-personal_status",
|
729 |
+
"name": "personal_status",
|
730 |
+
"description": "personal_status - a field.",
|
731 |
+
"dataType": "sc:Text",
|
732 |
+
"references": {
|
733 |
+
"field": {
|
734 |
+
"@id": "enumerations/personal_status/value"
|
735 |
+
}
|
736 |
+
},
|
737 |
+
"source": {
|
738 |
+
"fileObject": {
|
739 |
+
"@id": "data-file"
|
740 |
+
},
|
741 |
+
"extract": {
|
742 |
+
"column": "personal_status"
|
743 |
+
}
|
744 |
+
}
|
745 |
+
},
|
746 |
+
{
|
747 |
+
"@type": "cr:Field",
|
748 |
+
"@id": "features/9-other_parties",
|
749 |
+
"name": "other_parties",
|
750 |
+
"description": "other_parties - a field.",
|
751 |
+
"dataType": "sc:Text",
|
752 |
+
"references": {
|
753 |
+
"field": {
|
754 |
+
"@id": "enumerations/other_parties/value"
|
755 |
+
}
|
756 |
+
},
|
757 |
+
"source": {
|
758 |
+
"fileObject": {
|
759 |
+
"@id": "data-file"
|
760 |
+
},
|
761 |
+
"extract": {
|
762 |
+
"column": "other_parties"
|
763 |
+
}
|
764 |
+
}
|
765 |
+
},
|
766 |
+
{
|
767 |
+
"@type": "cr:Field",
|
768 |
+
"@id": "features/10-residence_since",
|
769 |
+
"name": "residence_since",
|
770 |
+
"description": "residence_since - a field.",
|
771 |
+
"dataType": [
|
772 |
+
"sc:Float",
|
773 |
+
"sc:Integer"
|
774 |
+
],
|
775 |
+
"source": {
|
776 |
+
"fileObject": {
|
777 |
+
"@id": "data-file"
|
778 |
+
},
|
779 |
+
"extract": {
|
780 |
+
"column": "residence_since"
|
781 |
+
}
|
782 |
+
}
|
783 |
+
},
|
784 |
+
{
|
785 |
+
"@type": "cr:Field",
|
786 |
+
"@id": "features/11-property_magnitude",
|
787 |
+
"name": "property_magnitude",
|
788 |
+
"description": "property_magnitude - a field.",
|
789 |
+
"dataType": "sc:Text",
|
790 |
+
"references": {
|
791 |
+
"field": {
|
792 |
+
"@id": "enumerations/property_magnitude/value"
|
793 |
+
}
|
794 |
+
},
|
795 |
+
"source": {
|
796 |
+
"fileObject": {
|
797 |
+
"@id": "data-file"
|
798 |
+
},
|
799 |
+
"extract": {
|
800 |
+
"column": "property_magnitude"
|
801 |
+
}
|
802 |
+
}
|
803 |
+
},
|
804 |
+
{
|
805 |
+
"@type": "cr:Field",
|
806 |
+
"@id": "features/12-age",
|
807 |
+
"name": "age",
|
808 |
+
"description": "age - a field.",
|
809 |
+
"dataType": [
|
810 |
+
"sc:Float",
|
811 |
+
"sc:Integer"
|
812 |
+
],
|
813 |
+
"source": {
|
814 |
+
"fileObject": {
|
815 |
+
"@id": "data-file"
|
816 |
+
},
|
817 |
+
"extract": {
|
818 |
+
"column": "age"
|
819 |
+
}
|
820 |
+
}
|
821 |
+
},
|
822 |
+
{
|
823 |
+
"@type": "cr:Field",
|
824 |
+
"@id": "features/13-other_payment_plans",
|
825 |
+
"name": "other_payment_plans",
|
826 |
+
"description": "other_payment_plans - a field.",
|
827 |
+
"dataType": "sc:Text",
|
828 |
+
"references": {
|
829 |
+
"field": {
|
830 |
+
"@id": "enumerations/other_payment_plans/value"
|
831 |
+
}
|
832 |
+
},
|
833 |
+
"source": {
|
834 |
+
"fileObject": {
|
835 |
+
"@id": "data-file"
|
836 |
+
},
|
837 |
+
"extract": {
|
838 |
+
"column": "other_payment_plans"
|
839 |
+
}
|
840 |
+
}
|
841 |
+
},
|
842 |
+
{
|
843 |
+
"@type": "cr:Field",
|
844 |
+
"@id": "features/14-housing",
|
845 |
+
"name": "housing",
|
846 |
+
"description": "housing - a field.",
|
847 |
+
"dataType": "sc:Text",
|
848 |
+
"references": {
|
849 |
+
"field": {
|
850 |
+
"@id": "enumerations/housing/value"
|
851 |
+
}
|
852 |
+
},
|
853 |
+
"source": {
|
854 |
+
"fileObject": {
|
855 |
+
"@id": "data-file"
|
856 |
+
},
|
857 |
+
"extract": {
|
858 |
+
"column": "housing"
|
859 |
+
}
|
860 |
+
}
|
861 |
+
},
|
862 |
+
{
|
863 |
+
"@type": "cr:Field",
|
864 |
+
"@id": "features/15-existing_credits",
|
865 |
+
"name": "existing_credits",
|
866 |
+
"description": "existing_credits - a field.",
|
867 |
+
"dataType": [
|
868 |
+
"sc:Float",
|
869 |
+
"sc:Integer"
|
870 |
+
],
|
871 |
+
"source": {
|
872 |
+
"fileObject": {
|
873 |
+
"@id": "data-file"
|
874 |
+
},
|
875 |
+
"extract": {
|
876 |
+
"column": "existing_credits"
|
877 |
+
}
|
878 |
+
}
|
879 |
+
},
|
880 |
+
{
|
881 |
+
"@type": "cr:Field",
|
882 |
+
"@id": "features/16-job",
|
883 |
+
"name": "job",
|
884 |
+
"description": "job - a field.",
|
885 |
+
"dataType": "sc:Text",
|
886 |
+
"references": {
|
887 |
+
"field": {
|
888 |
+
"@id": "enumerations/job/value"
|
889 |
+
}
|
890 |
+
},
|
891 |
+
"source": {
|
892 |
+
"fileObject": {
|
893 |
+
"@id": "data-file"
|
894 |
+
},
|
895 |
+
"extract": {
|
896 |
+
"column": "job"
|
897 |
+
}
|
898 |
+
}
|
899 |
+
},
|
900 |
+
{
|
901 |
+
"@type": "cr:Field",
|
902 |
+
"@id": "features/17-num_dependents",
|
903 |
+
"name": "num_dependents",
|
904 |
+
"description": "num_dependents - a field.",
|
905 |
+
"dataType": [
|
906 |
+
"sc:Float",
|
907 |
+
"sc:Integer"
|
908 |
+
],
|
909 |
+
"source": {
|
910 |
+
"fileObject": {
|
911 |
+
"@id": "data-file"
|
912 |
+
},
|
913 |
+
"extract": {
|
914 |
+
"column": "num_dependents"
|
915 |
+
}
|
916 |
+
}
|
917 |
+
},
|
918 |
+
{
|
919 |
+
"@type": "cr:Field",
|
920 |
+
"@id": "features/18-own_telephone",
|
921 |
+
"name": "own_telephone",
|
922 |
+
"description": "own_telephone - a field.",
|
923 |
+
"dataType": "sc:Text",
|
924 |
+
"references": {
|
925 |
+
"field": {
|
926 |
+
"@id": "enumerations/own_telephone/value"
|
927 |
+
}
|
928 |
+
},
|
929 |
+
"source": {
|
930 |
+
"fileObject": {
|
931 |
+
"@id": "data-file"
|
932 |
+
},
|
933 |
+
"extract": {
|
934 |
+
"column": "own_telephone"
|
935 |
+
}
|
936 |
+
}
|
937 |
+
},
|
938 |
+
{
|
939 |
+
"@type": "cr:Field",
|
940 |
+
"@id": "features/19-foreign_worker",
|
941 |
+
"name": "foreign_worker",
|
942 |
+
"description": "foreign_worker - a field.",
|
943 |
+
"dataType": "sc:Text",
|
944 |
+
"references": {
|
945 |
+
"field": {
|
946 |
+
"@id": "enumerations/foreign_worker/value"
|
947 |
+
}
|
948 |
+
},
|
949 |
+
"source": {
|
950 |
+
"fileObject": {
|
951 |
+
"@id": "data-file"
|
952 |
+
},
|
953 |
+
"extract": {
|
954 |
+
"column": "foreign_worker"
|
955 |
+
}
|
956 |
+
}
|
957 |
+
},
|
958 |
+
{
|
959 |
+
"@type": "cr:Field",
|
960 |
+
"@id": "features/20-class",
|
961 |
+
"name": "class",
|
962 |
+
"description": "class - the default target field.",
|
963 |
+
"dataType": "sc:Text",
|
964 |
+
"references": {
|
965 |
+
"field": {
|
966 |
+
"@id": "enumerations/class/value"
|
967 |
+
}
|
968 |
+
},
|
969 |
+
"source": {
|
970 |
+
"fileObject": {
|
971 |
+
"@id": "data-file"
|
972 |
+
},
|
973 |
+
"extract": {
|
974 |
+
"column": "class"
|
975 |
+
}
|
976 |
+
}
|
977 |
+
}
|
978 |
+
]
|
979 |
+
}
|
980 |
+
]
|
981 |
+
}
|
982 |
+
```
|
report_croissant-validation_natural_reasoning.md
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CROISSANT VALIDATION REPORT
|
2 |
+
================================================================================
|
3 |
+
## VALIDATION RESULTS
|
4 |
+
--------------------------------------------------------------------------------
|
5 |
+
Starting validation for file: croissant
|
6 |
+
### JSON Format Validation
|
7 |
+
✓
|
8 |
+
The URL returned valid JSON.
|
9 |
+
### Croissant Schema Validation
|
10 |
+
✓
|
11 |
+
The dataset passes Croissant validation.
|
12 |
+
### Records Generation Test
|
13 |
+
✗
|
14 |
+
Record set 'default' failed: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
|
15 |
+
|
16 |
+
Traceback (most recent call last):
|
17 |
+
File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in validate_records
|
18 |
+
_ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
|
19 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
20 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/dafunc.py", line 108, in func_timeout
|
21 |
+
raise_exception(exception)
|
22 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/py3_raise.py", line 7, in raise_exception
|
23 |
+
raise exception[0] from None
|
24 |
+
File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in <lambda>
|
25 |
+
_ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
|
26 |
+
^^^^^^^^^^^^^^^^^^^
|
27 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__
|
28 |
+
yield from execute_operations_sequentially(
|
29 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially
|
30 |
+
raise GenerationError(
|
31 |
+
mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
|
32 |
+
## JSON-LD REFERENCE
|
33 |
+
================================================================================
|
34 |
+
```json
|
35 |
+
{
|
36 |
+
"@context": {
|
37 |
+
"@language": "en",
|
38 |
+
"@vocab": "https://schema.org/",
|
39 |
+
"citeAs": "cr:citeAs",
|
40 |
+
"column": "cr:column",
|
41 |
+
"conformsTo": "dct:conformsTo",
|
42 |
+
"cr": "http://mlcommons.org/croissant/",
|
43 |
+
"data": {
|
44 |
+
"@id": "cr:data",
|
45 |
+
"@type": "@json"
|
46 |
+
},
|
47 |
+
"dataBiases": "cr:dataBiases",
|
48 |
+
"dataCollection": "cr:dataCollection",
|
49 |
+
"dataType": {
|
50 |
+
"@id": "cr:dataType",
|
51 |
+
"@type": "@vocab"
|
52 |
+
},
|
53 |
+
"dct": "http://purl.org/dc/terms/",
|
54 |
+
"extract": "cr:extract",
|
55 |
+
"field": "cr:field",
|
56 |
+
"fileProperty": "cr:fileProperty",
|
57 |
+
"fileObject": "cr:fileObject",
|
58 |
+
"fileSet": "cr:fileSet",
|
59 |
+
"format": "cr:format",
|
60 |
+
"includes": "cr:includes",
|
61 |
+
"isLiveDataset": "cr:isLiveDataset",
|
62 |
+
"jsonPath": "cr:jsonPath",
|
63 |
+
"key": "cr:key",
|
64 |
+
"md5": "cr:md5",
|
65 |
+
"parentField": "cr:parentField",
|
66 |
+
"path": "cr:path",
|
67 |
+
"personalSensitiveInformation": "cr:personalSensitiveInformation",
|
68 |
+
"recordSet": "cr:recordSet",
|
69 |
+
"references": "cr:references",
|
70 |
+
"regex": "cr:regex",
|
71 |
+
"repeated": "cr:repeated",
|
72 |
+
"replace": "cr:replace",
|
73 |
+
"sc": "https://schema.org/",
|
74 |
+
"separator": "cr:separator",
|
75 |
+
"source": "cr:source",
|
76 |
+
"subField": "cr:subField",
|
77 |
+
"transform": "cr:transform",
|
78 |
+
"@base": "cr_base_iri/"
|
79 |
+
},
|
80 |
+
"@type": "sc:Dataset",
|
81 |
+
"distribution": [
|
82 |
+
{
|
83 |
+
"@type": "cr:FileObject",
|
84 |
+
"@id": "repo",
|
85 |
+
"name": "repo",
|
86 |
+
"description": "The Hugging Face git repository.",
|
87 |
+
"contentUrl": "https://huggingface.co/datasets/facebook/natural_reasoning/tree/refs%2Fconvert%2Fparquet",
|
88 |
+
"encodingFormat": "git+https",
|
89 |
+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"@type": "cr:FileSet",
|
93 |
+
"@id": "parquet-files-for-config-default",
|
94 |
+
"containedIn": {
|
95 |
+
"@id": "repo"
|
96 |
+
},
|
97 |
+
"encodingFormat": "application/x-parquet",
|
98 |
+
"includes": "default/*/*.parquet"
|
99 |
+
}
|
100 |
+
],
|
101 |
+
"recordSet": [
|
102 |
+
{
|
103 |
+
"@type": "cr:RecordSet",
|
104 |
+
"dataType": "cr:Split",
|
105 |
+
"key": {
|
106 |
+
"@id": "default_splits/split_name"
|
107 |
+
},
|
108 |
+
"@id": "default_splits",
|
109 |
+
"name": "default_splits",
|
110 |
+
"description": "Splits for the default config.",
|
111 |
+
"field": [
|
112 |
+
{
|
113 |
+
"@type": "cr:Field",
|
114 |
+
"@id": "default_splits/split_name",
|
115 |
+
"dataType": "sc:Text"
|
116 |
+
}
|
117 |
+
],
|
118 |
+
"data": [
|
119 |
+
{
|
120 |
+
"default_splits/split_name": "train"
|
121 |
+
}
|
122 |
+
]
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"@type": "cr:RecordSet",
|
126 |
+
"@id": "default",
|
127 |
+
"description": "facebook/natural_reasoning - 'default' subset",
|
128 |
+
"field": [
|
129 |
+
{
|
130 |
+
"@type": "cr:Field",
|
131 |
+
"@id": "default/split",
|
132 |
+
"dataType": "sc:Text",
|
133 |
+
"source": {
|
134 |
+
"fileSet": {
|
135 |
+
"@id": "parquet-files-for-config-default"
|
136 |
+
},
|
137 |
+
"extract": {
|
138 |
+
"fileProperty": "fullpath"
|
139 |
+
},
|
140 |
+
"transform": {
|
141 |
+
"regex": "default/(?:partial-)?(train)/.+parquet$"
|
142 |
+
}
|
143 |
+
},
|
144 |
+
"references": {
|
145 |
+
"field": {
|
146 |
+
"@id": "default_splits/split_name"
|
147 |
+
}
|
148 |
+
}
|
149 |
+
},
|
150 |
+
{
|
151 |
+
"@type": "cr:Field",
|
152 |
+
"@id": "default/question",
|
153 |
+
"dataType": "sc:Text",
|
154 |
+
"source": {
|
155 |
+
"fileSet": {
|
156 |
+
"@id": "parquet-files-for-config-default"
|
157 |
+
},
|
158 |
+
"extract": {
|
159 |
+
"column": "question"
|
160 |
+
}
|
161 |
+
}
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"@type": "cr:Field",
|
165 |
+
"@id": "default/reference_answer",
|
166 |
+
"dataType": "sc:Text",
|
167 |
+
"source": {
|
168 |
+
"fileSet": {
|
169 |
+
"@id": "parquet-files-for-config-default"
|
170 |
+
},
|
171 |
+
"extract": {
|
172 |
+
"column": "reference_answer"
|
173 |
+
}
|
174 |
+
}
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"@type": "cr:Field",
|
178 |
+
"@id": "default/responses",
|
179 |
+
"subField": [
|
180 |
+
{
|
181 |
+
"@type": "cr:Field",
|
182 |
+
"@id": "default/responses/response_model",
|
183 |
+
"dataType": "sc:Text",
|
184 |
+
"source": {
|
185 |
+
"fileSet": {
|
186 |
+
"@id": "parquet-files-for-config-default"
|
187 |
+
},
|
188 |
+
"extract": {
|
189 |
+
"column": "responses"
|
190 |
+
},
|
191 |
+
"transform": {
|
192 |
+
"jsonPath": "response_model"
|
193 |
+
}
|
194 |
+
}
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"@type": "cr:Field",
|
198 |
+
"@id": "default/responses/response",
|
199 |
+
"dataType": "sc:Text",
|
200 |
+
"source": {
|
201 |
+
"fileSet": {
|
202 |
+
"@id": "parquet-files-for-config-default"
|
203 |
+
},
|
204 |
+
"extract": {
|
205 |
+
"column": "responses"
|
206 |
+
},
|
207 |
+
"transform": {
|
208 |
+
"jsonPath": "response"
|
209 |
+
}
|
210 |
+
}
|
211 |
+
}
|
212 |
+
],
|
213 |
+
"repeated": true
|
214 |
+
}
|
215 |
+
]
|
216 |
+
}
|
217 |
+
],
|
218 |
+
"conformsTo": "http://mlcommons.org/croissant/1.0",
|
219 |
+
"name": "natural_reasoning",
|
220 |
+
"description": "NaturalReasoning is a large-scale dataset for general reasoning tasks. It consists of high-quality challenging reasoning questions backtranslated from pretraining corpora DCLM and FineMath. The questions have been deduplicated and decontaminated from popular reasoning benchmarks including MATH, GPQA, MMLU-Pro, MMLU-STEM. For each question, we extract the reference final answer from the original document from the pretraining corpora if possible. We also provide a model-generated response from\u2026 See the full description on the dataset page: https://huggingface.co/datasets/facebook/natural_reasoning.",
|
221 |
+
"alternateName": [
|
222 |
+
"facebook/natural_reasoning",
|
223 |
+
"Natural Reasoning"
|
224 |
+
],
|
225 |
+
"creator": {
|
226 |
+
"@type": "Organization",
|
227 |
+
"name": "AI at Meta",
|
228 |
+
"url": "https://huggingface.co/facebook"
|
229 |
+
},
|
230 |
+
"keywords": [
|
231 |
+
"text-generation",
|
232 |
+
"English",
|
233 |
+
"cc-by-nc-4.0",
|
234 |
+
"1M - 10M",
|
235 |
+
"json",
|
236 |
+
"Text",
|
237 |
+
"Datasets",
|
238 |
+
"pandas",
|
239 |
+
"Croissant",
|
240 |
+
"Polars",
|
241 |
+
"arxiv:2502.13124",
|
242 |
+
"\ud83c\uddfa\ud83c\uddf8 Region: US"
|
243 |
+
],
|
244 |
+
"license": "https://choosealicense.com/licenses/cc-by-nc-4.0/",
|
245 |
+
"url": "https://huggingface.co/datasets/facebook/natural_reasoning"
|
246 |
+
}
|
247 |
+
```
|
report_croissant-validation_shitspotter.md
ADDED
@@ -0,0 +1,410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CROISSANT VALIDATION REPORT
|
2 |
+
================================================================================
|
3 |
+
## VALIDATION RESULTS
|
4 |
+
--------------------------------------------------------------------------------
|
5 |
+
Starting validation for file: croissant
|
6 |
+
### JSON Format Validation
|
7 |
+
✓
|
8 |
+
The URL returned valid JSON.
|
9 |
+
### Croissant Schema Validation
|
10 |
+
✓
|
11 |
+
The dataset passes Croissant validation.
|
12 |
+
### Records Generation Test (Optional)
|
13 |
+
?
|
14 |
+
Record set 'default' failed due to generation error:
|
15 |
+
|
16 |
+
```text
|
17 |
+
An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
|
18 |
+
|
19 |
+
Traceback (most recent call last):
|
20 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 70, in execute_operations_sequentially
|
21 |
+
operation(set_output_in_memory=True)
|
22 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/base_operation.py", line 121, in __call__
|
23 |
+
output = self.call() if inputs is None else self.call(*inputs)
|
24 |
+
^^^^^^^^^^^^^^^^^^
|
25 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/operations/read.py", line 196, in call
|
26 |
+
file_content = self._read_file_content(self.node.encoding_formats, file)
|
27 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
28 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/operations/read.py", line 135, in _read_file_content
|
29 |
+
df = pd.read_parquet(file)
|
30 |
+
^^^^^^^^^^^^^^^^^^^^^
|
31 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pandas/io/parquet.py", line 670, in read_parquet
|
32 |
+
return impl.read(
|
33 |
+
^^^^^^^^^^
|
34 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pandas/io/parquet.py", line 272, in read
|
35 |
+
pa_table = self.api.parquet.read_table(
|
36 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
37 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pyarrow/parquet/core.py", line 1793, in read_table
|
38 |
+
dataset = ParquetDataset(
|
39 |
+
^^^^^^^^^^^^^^^
|
40 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/pyarrow/parquet/core.py", line 1360, in __init__
|
41 |
+
[fragment], schema=schema or fragment.physical_schema,
|
42 |
+
^^^^^^^^^^^^^^^^^^^^^^^^
|
43 |
+
File "pyarrow/_dataset.pyx", line 1431, in pyarrow._dataset.Fragment.physical_schema.__get__
|
44 |
+
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
|
45 |
+
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
|
46 |
+
pyarrow.lib.ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.
|
47 |
+
|
48 |
+
The above exception was the direct cause of the following exception:
|
49 |
+
|
50 |
+
Traceback (most recent call last):
|
51 |
+
File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 61, in validate_records
|
52 |
+
raise result # re-raise actual error outside timeout
|
53 |
+
^^^^^^^^^^^^
|
54 |
+
File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 37, in try_generate_record
|
55 |
+
next(record_iterator)
|
56 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__
|
57 |
+
yield from execute_operations_sequentially(
|
58 |
+
File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially
|
59 |
+
raise GenerationError(
|
60 |
+
mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default)
|
61 |
+
```
|
62 |
+
## JSON-LD REFERENCE
|
63 |
+
================================================================================
|
64 |
+
```json
|
65 |
+
{
|
66 |
+
"@context": {
|
67 |
+
"@language": "en",
|
68 |
+
"@vocab": "https://schema.org/",
|
69 |
+
"arrayShape": "cr:arrayShape",
|
70 |
+
"citeAs": "cr:citeAs",
|
71 |
+
"column": "cr:column",
|
72 |
+
"conformsTo": "dct:conformsTo",
|
73 |
+
"cr": "http://mlcommons.org/croissant/",
|
74 |
+
"data": {
|
75 |
+
"@id": "cr:data",
|
76 |
+
"@type": "@json"
|
77 |
+
},
|
78 |
+
"dataBiases": "cr:dataBiases",
|
79 |
+
"dataCollection": "cr:dataCollection",
|
80 |
+
"dataType": {
|
81 |
+
"@id": "cr:dataType",
|
82 |
+
"@type": "@vocab"
|
83 |
+
},
|
84 |
+
"dct": "http://purl.org/dc/terms/",
|
85 |
+
"extract": "cr:extract",
|
86 |
+
"field": "cr:field",
|
87 |
+
"fileProperty": "cr:fileProperty",
|
88 |
+
"fileObject": "cr:fileObject",
|
89 |
+
"fileSet": "cr:fileSet",
|
90 |
+
"format": "cr:format",
|
91 |
+
"includes": "cr:includes",
|
92 |
+
"isArray": "cr:isArray",
|
93 |
+
"isLiveDataset": "cr:isLiveDataset",
|
94 |
+
"jsonPath": "cr:jsonPath",
|
95 |
+
"key": "cr:key",
|
96 |
+
"md5": "cr:md5",
|
97 |
+
"parentField": "cr:parentField",
|
98 |
+
"path": "cr:path",
|
99 |
+
"personalSensitiveInformation": "cr:personalSensitiveInformation",
|
100 |
+
"recordSet": "cr:recordSet",
|
101 |
+
"references": "cr:references",
|
102 |
+
"regex": "cr:regex",
|
103 |
+
"repeated": "cr:repeated",
|
104 |
+
"replace": "cr:replace",
|
105 |
+
"sc": "https://schema.org/",
|
106 |
+
"separator": "cr:separator",
|
107 |
+
"source": "cr:source",
|
108 |
+
"subField": "cr:subField",
|
109 |
+
"transform": "cr:transform",
|
110 |
+
"@base": "cr_base_iri/"
|
111 |
+
},
|
112 |
+
"@type": "sc:Dataset",
|
113 |
+
"distribution": [
|
114 |
+
{
|
115 |
+
"@type": "cr:FileObject",
|
116 |
+
"@id": "repo",
|
117 |
+
"name": "repo",
|
118 |
+
"description": "The Hugging Face git repository.",
|
119 |
+
"contentUrl": "https://huggingface.co/datasets/erotemic/shitspotter/tree/refs%2Fconvert%2Fparquet",
|
120 |
+
"encodingFormat": "git+https",
|
121 |
+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"@type": "cr:FileSet",
|
125 |
+
"@id": "parquet-files-for-config-default",
|
126 |
+
"containedIn": {
|
127 |
+
"@id": "repo"
|
128 |
+
},
|
129 |
+
"encodingFormat": "application/x-parquet",
|
130 |
+
"includes": "default/*/*.parquet"
|
131 |
+
}
|
132 |
+
],
|
133 |
+
"recordSet": [
|
134 |
+
{
|
135 |
+
"@type": "cr:RecordSet",
|
136 |
+
"dataType": "cr:Split",
|
137 |
+
"key": {
|
138 |
+
"@id": "default_splits/split_name"
|
139 |
+
},
|
140 |
+
"@id": "default_splits",
|
141 |
+
"name": "default_splits",
|
142 |
+
"description": "Splits for the default config.",
|
143 |
+
"field": [
|
144 |
+
{
|
145 |
+
"@type": "cr:Field",
|
146 |
+
"@id": "default_splits/split_name",
|
147 |
+
"dataType": "sc:Text"
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"data": [
|
151 |
+
{
|
152 |
+
"default_splits/split_name": "train"
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"default_splits/split_name": "validation"
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"default_splits/split_name": "test"
|
159 |
+
}
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"@type": "cr:RecordSet",
|
164 |
+
"@id": "default",
|
165 |
+
"description": "erotemic/shitspotter - 'default' subset (first 5GB)\n\nAdditional information:\n- 3 splits: train, validation, test",
|
166 |
+
"field": [
|
167 |
+
{
|
168 |
+
"@type": "cr:Field",
|
169 |
+
"@id": "default/split",
|
170 |
+
"dataType": "sc:Text",
|
171 |
+
"source": {
|
172 |
+
"fileSet": {
|
173 |
+
"@id": "parquet-files-for-config-default"
|
174 |
+
},
|
175 |
+
"extract": {
|
176 |
+
"fileProperty": "fullpath"
|
177 |
+
},
|
178 |
+
"transform": {
|
179 |
+
"regex": "default/(?:partial-)?(train|validation|test)/.+parquet$"
|
180 |
+
}
|
181 |
+
},
|
182 |
+
"references": {
|
183 |
+
"field": {
|
184 |
+
"@id": "default_splits/split_name"
|
185 |
+
}
|
186 |
+
}
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"@type": "cr:Field",
|
190 |
+
"@id": "default/jpg",
|
191 |
+
"dataType": "sc:ImageObject",
|
192 |
+
"source": {
|
193 |
+
"fileSet": {
|
194 |
+
"@id": "parquet-files-for-config-default"
|
195 |
+
},
|
196 |
+
"extract": {
|
197 |
+
"column": "jpg"
|
198 |
+
},
|
199 |
+
"transform": {
|
200 |
+
"jsonPath": "bytes"
|
201 |
+
}
|
202 |
+
}
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"@type": "cr:Field",
|
206 |
+
"@id": "default/json",
|
207 |
+
"subField": [
|
208 |
+
{
|
209 |
+
"@type": "cr:Field",
|
210 |
+
"@id": "default/json/annotations",
|
211 |
+
"subField": [
|
212 |
+
{
|
213 |
+
"@type": "cr:Field",
|
214 |
+
"@id": "default/json/annotations/bbox",
|
215 |
+
"dataType": "cr:Int64",
|
216 |
+
"source": {
|
217 |
+
"fileSet": {
|
218 |
+
"@id": "parquet-files-for-config-default"
|
219 |
+
},
|
220 |
+
"extract": {
|
221 |
+
"column": "json"
|
222 |
+
}
|
223 |
+
},
|
224 |
+
"isArray": true,
|
225 |
+
"arrayShape": "-1"
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"@type": "cr:Field",
|
229 |
+
"@id": "default/json/annotations/category_id",
|
230 |
+
"dataType": "cr:Int64",
|
231 |
+
"source": {
|
232 |
+
"fileSet": {
|
233 |
+
"@id": "parquet-files-for-config-default"
|
234 |
+
},
|
235 |
+
"extract": {
|
236 |
+
"column": "json"
|
237 |
+
},
|
238 |
+
"transform": {
|
239 |
+
"jsonPath": "category_id"
|
240 |
+
}
|
241 |
+
}
|
242 |
+
},
|
243 |
+
{
|
244 |
+
"@type": "cr:Field",
|
245 |
+
"@id": "default/json/annotations/iscrowd",
|
246 |
+
"dataType": "cr:Int64",
|
247 |
+
"source": {
|
248 |
+
"fileSet": {
|
249 |
+
"@id": "parquet-files-for-config-default"
|
250 |
+
},
|
251 |
+
"extract": {
|
252 |
+
"column": "json"
|
253 |
+
},
|
254 |
+
"transform": {
|
255 |
+
"jsonPath": "iscrowd"
|
256 |
+
}
|
257 |
+
}
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"@type": "cr:Field",
|
261 |
+
"@id": "default/json/annotations/segmentation",
|
262 |
+
"subField": [
|
263 |
+
{
|
264 |
+
"@type": "cr:Field",
|
265 |
+
"@id": "default/json/annotations/segmentation/exterior",
|
266 |
+
"dataType": "cr:Float64",
|
267 |
+
"source": {
|
268 |
+
"fileSet": {
|
269 |
+
"@id": "parquet-files-for-config-default"
|
270 |
+
},
|
271 |
+
"extract": {
|
272 |
+
"column": "json"
|
273 |
+
}
|
274 |
+
},
|
275 |
+
"isArray": true,
|
276 |
+
"arrayShape": "-1,-1"
|
277 |
+
},
|
278 |
+
null
|
279 |
+
]
|
280 |
+
}
|
281 |
+
],
|
282 |
+
"isArray": true,
|
283 |
+
"arrayShape": "-1"
|
284 |
+
},
|
285 |
+
{
|
286 |
+
"@type": "cr:Field",
|
287 |
+
"@id": "default/json/file_name",
|
288 |
+
"dataType": "sc:Text",
|
289 |
+
"source": {
|
290 |
+
"fileSet": {
|
291 |
+
"@id": "parquet-files-for-config-default"
|
292 |
+
},
|
293 |
+
"extract": {
|
294 |
+
"column": "json"
|
295 |
+
},
|
296 |
+
"transform": {
|
297 |
+
"jsonPath": "file_name"
|
298 |
+
}
|
299 |
+
}
|
300 |
+
},
|
301 |
+
{
|
302 |
+
"@type": "cr:Field",
|
303 |
+
"@id": "default/json/height",
|
304 |
+
"dataType": "cr:Int64",
|
305 |
+
"source": {
|
306 |
+
"fileSet": {
|
307 |
+
"@id": "parquet-files-for-config-default"
|
308 |
+
},
|
309 |
+
"extract": {
|
310 |
+
"column": "json"
|
311 |
+
},
|
312 |
+
"transform": {
|
313 |
+
"jsonPath": "height"
|
314 |
+
}
|
315 |
+
}
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"@type": "cr:Field",
|
319 |
+
"@id": "default/json/id",
|
320 |
+
"dataType": "cr:Int64",
|
321 |
+
"source": {
|
322 |
+
"fileSet": {
|
323 |
+
"@id": "parquet-files-for-config-default"
|
324 |
+
},
|
325 |
+
"extract": {
|
326 |
+
"column": "json"
|
327 |
+
},
|
328 |
+
"transform": {
|
329 |
+
"jsonPath": "id"
|
330 |
+
}
|
331 |
+
}
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"@type": "cr:Field",
|
335 |
+
"@id": "default/json/width",
|
336 |
+
"dataType": "cr:Int64",
|
337 |
+
"source": {
|
338 |
+
"fileSet": {
|
339 |
+
"@id": "parquet-files-for-config-default"
|
340 |
+
},
|
341 |
+
"extract": {
|
342 |
+
"column": "json"
|
343 |
+
},
|
344 |
+
"transform": {
|
345 |
+
"jsonPath": "width"
|
346 |
+
}
|
347 |
+
}
|
348 |
+
}
|
349 |
+
]
|
350 |
+
},
|
351 |
+
{
|
352 |
+
"@type": "cr:Field",
|
353 |
+
"@id": "default/__key__",
|
354 |
+
"dataType": "sc:Text",
|
355 |
+
"source": {
|
356 |
+
"fileSet": {
|
357 |
+
"@id": "parquet-files-for-config-default"
|
358 |
+
},
|
359 |
+
"extract": {
|
360 |
+
"column": "__key__"
|
361 |
+
}
|
362 |
+
}
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"@type": "cr:Field",
|
366 |
+
"@id": "default/__url__",
|
367 |
+
"dataType": "sc:Text",
|
368 |
+
"source": {
|
369 |
+
"fileSet": {
|
370 |
+
"@id": "parquet-files-for-config-default"
|
371 |
+
},
|
372 |
+
"extract": {
|
373 |
+
"column": "__url__"
|
374 |
+
}
|
375 |
+
}
|
376 |
+
}
|
377 |
+
]
|
378 |
+
}
|
379 |
+
],
|
380 |
+
"conformsTo": "http://mlcommons.org/croissant/1.1",
|
381 |
+
"name": "shitspotter",
|
382 |
+
"description": "\n\t\n\t\t\n\t\tDataset Card for ShitSpotter (\"ScatSpotter\")\n\t\n\nShitSpotter (or \"ScatSpotter\" in formal settings) is an open dataset of images containing dog feces. \nThis dataset contains full-resolution smartphone images of dog feces (\"poop\") collected in urban outdoor \nenvironments taken using a \"before/after/negative\" protocol. \nIt includes thousands of polygon annotations of feces in varied lighting, seasonal, and terrain conditions. \nThe dataset is designed for training and evaluating object\u2026 See the full description on the dataset page: https://huggingface.co/datasets/erotemic/shitspotter.",
|
383 |
+
"alternateName": [
|
384 |
+
"erotemic/shitspotter",
|
385 |
+
"ShitSpotter"
|
386 |
+
],
|
387 |
+
"creator": {
|
388 |
+
"@type": "Person",
|
389 |
+
"name": "Jonathan P Crall",
|
390 |
+
"url": "https://huggingface.co/erotemic"
|
391 |
+
},
|
392 |
+
"keywords": [
|
393 |
+
"object-detection",
|
394 |
+
"image-segmentation",
|
395 |
+
"English",
|
396 |
+
"cc-by-4.0",
|
397 |
+
"1K - 10K",
|
398 |
+
"webdataset",
|
399 |
+
"Image",
|
400 |
+
"Text",
|
401 |
+
"Datasets",
|
402 |
+
"WebDataset",
|
403 |
+
"Croissant",
|
404 |
+
"arxiv:2412.16473",
|
405 |
+
"\ud83c\uddfa\ud83c\uddf8 Region: US"
|
406 |
+
],
|
407 |
+
"license": "https://choosealicense.com/licenses/cc-by-4.0/",
|
408 |
+
"url": "https://huggingface.co/datasets/erotemic/shitspotter"
|
409 |
+
}
|
410 |
+
```
|
validation.py
CHANGED
@@ -31,6 +31,14 @@ def validate_croissant(json_data):
|
|
31 |
error_details = traceback.format_exc()
|
32 |
error_message = f"Unexpected error during validation: {str(e)}\n\n{error_details}"
|
33 |
return False, error_message
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def validate_records(json_data):
|
36 |
"""Validate that records can be generated within the time limit."""
|
@@ -46,15 +54,25 @@ def validate_records(json_data):
|
|
46 |
for record_set in record_sets:
|
47 |
try:
|
48 |
records = dataset.records(record_set=record_set.uuid)
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
50 |
results.append(f"Record set '{record_set.uuid}' passed validation.")
|
|
|
51 |
except func_timeout.exceptions.FunctionTimedOut:
|
52 |
-
error_message = f"Record set '{record_set.uuid}' generation took too long (>
|
53 |
-
return False, error_message
|
|
|
54 |
except Exception as e:
|
55 |
error_details = traceback.format_exc()
|
56 |
-
error_message =
|
57 |
-
|
|
|
|
|
|
|
58 |
|
59 |
return True, "\n".join(results)
|
60 |
except Exception as e:
|
@@ -72,9 +90,20 @@ def generate_validation_report(filename, json_data, results):
|
|
72 |
report.append(f"Starting validation for file: {filename}")
|
73 |
|
74 |
# Add validation results
|
75 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
report.append(f"### {test_name}")
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
78 |
report.append(message.strip()) # Remove any trailing newlines
|
79 |
|
80 |
# Add JSON-LD reference
|
@@ -84,4 +113,4 @@ def generate_validation_report(filename, json_data, results):
|
|
84 |
report.append(json.dumps(json_data, indent=2))
|
85 |
report.append("```")
|
86 |
|
87 |
-
return "\n".join(report)
|
|
|
31 |
error_details = traceback.format_exc()
|
32 |
error_message = f"Unexpected error during validation: {str(e)}\n\n{error_details}"
|
33 |
return False, error_message
|
34 |
+
|
35 |
+
def try_generate_record(record_iterator):
|
36 |
+
try:
|
37 |
+
next(record_iterator)
|
38 |
+
return "success"
|
39 |
+
except Exception as e:
|
40 |
+
# Return the exception object to the outer function
|
41 |
+
return e
|
42 |
|
43 |
def validate_records(json_data):
|
44 |
"""Validate that records can be generated within the time limit."""
|
|
|
54 |
for record_set in record_sets:
|
55 |
try:
|
56 |
records = dataset.records(record_set=record_set.uuid)
|
57 |
+
|
58 |
+
result = func_timeout.func_timeout(WAIT_TIME, try_generate_record, args=(iter(records),))
|
59 |
+
|
60 |
+
if isinstance(result, Exception):
|
61 |
+
raise result # re-raise actual error outside timeout
|
62 |
+
|
63 |
results.append(f"Record set '{record_set.uuid}' passed validation.")
|
64 |
+
|
65 |
except func_timeout.exceptions.FunctionTimedOut:
|
66 |
+
error_message = f"Record set '{record_set.uuid}' generation took too long (>10 minutes)."
|
67 |
+
return False, error_message, "warning"
|
68 |
+
|
69 |
except Exception as e:
|
70 |
error_details = traceback.format_exc()
|
71 |
+
error_message = (
|
72 |
+
f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
|
73 |
+
f"```text\n{str(e)}\n\n{error_details}```"
|
74 |
+
)
|
75 |
+
return False, error_message, "warning"
|
76 |
|
77 |
return True, "\n".join(results)
|
78 |
except Exception as e:
|
|
|
90 |
report.append(f"Starting validation for file: {filename}")
|
91 |
|
92 |
# Add validation results
|
93 |
+
for result in results:
|
94 |
+
if len(result) == 4:
|
95 |
+
test_name, passed, message, status = result
|
96 |
+
else:
|
97 |
+
test_name, passed, message = result
|
98 |
+
status = "pass" if passed else "error"
|
99 |
+
|
100 |
report.append(f"### {test_name}")
|
101 |
+
if status == "pass":
|
102 |
+
report.append("✓")
|
103 |
+
elif status == "warning":
|
104 |
+
report.append("?") # Question mark for warning
|
105 |
+
else:
|
106 |
+
report.append("✗")
|
107 |
report.append(message.strip()) # Remove any trailing newlines
|
108 |
|
109 |
# Add JSON-LD reference
|
|
|
113 |
report.append(json.dumps(json_data, indent=2))
|
114 |
report.append("```")
|
115 |
|
116 |
+
return "\n".join(report)
|