diff --git a/.gitattributes b/.gitattributes
index c7d9f3332a950355d5a77d85000f05e6f45435ea..9465f3dee05702760cf48275b10a710eb23dfee3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..faafcee93060917fc6c1428a7e986b02e7496c66
--- /dev/null
+++ b/config.json
@@ -0,0 +1,31 @@
+{
+ "apply_residual_connection_post_layernorm": false,
+ "architectures": [
+ "BloomModel"
+ ],
+ "attention_dropout": 0.0,
+ "attention_softmax_in_fp32": true,
+ "bias_dropout_fusion": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_dropout": 0.0,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "masked_softmax_fusion": true,
+ "model_type": "bloom",
+ "n_embed": 4096,
+ "n_inner": null,
+ "n_layer": 30,
+ "num_attention_heads": 32,
+ "offset_alibi": 100,
+ "pad_token_id": 3,
+ "pretraining_tp": 4,
+ "seq_length": 2048,
+ "skip_bias_add": true,
+ "skip_bias_add_qkv": false,
+ "slow_but_exact": false,
+ "transformers_version": "4.21.0.dev0",
+ "unk_token_id": 0,
+ "use_cache": true,
+ "vocab_size": 250880
+}
diff --git a/evaluation/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a619beb35ee1243d0b9479748f97b082592c55d
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.5360688285903376
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..65b62734c4236bd1dc41270da21bdf4711a5e0a4
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5188616810059563
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b769b8c3b74f739f6933a6d0b149d2c8a39a5781
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.5916611515552614
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..267588bbcfd0899cab0ab26c2b84b5a4ca89b780
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.528788881535407
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json b/evaluation/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..946f8afd1b3976b40f7f989e228d0ffb5343a807
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.5109199205823958
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..864c36dd8ceae83917c081ce089a0724fbb1fb77
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.5023163467902052
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..91e1339e328514ad5a554deccfc24b880c45efd0
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5274652547981469
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/es/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d208d80d840ec1aa13dde035deb402cd866e615
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.6644606221045665
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e462f3aafab69dd84fd83985e5ec5950c407873
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.5095962938451357
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json b/evaluation/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f3ad507bfce2f26d75be779db6f79dec83f89f9
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.5115817339510258
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec4e7f27ad92b40d52924fd07a74c70799daba3d
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.49172733289212445
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1db681209fb1a5b40a3ff0c10ca3df16568fffd7
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.4672402382528127
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..22fc89d050cf07c5ed41922b6c07e15a8e646f54
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.5737921906022502
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fee9c7223fed7ddcf28d674659e236f98a2ec079
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.4784910655195235
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json b/evaluation/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ced3abb63eb534eb7ef068755b201dcc017421c
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.4685638649900728
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..edc499179154cbb77fca348dadc0310afc542b79
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.4923891462607545
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..72a9909b8697c9ef32f14999ec2faa1bb7699ce5
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5181998676373263
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d84809dd3d80cd3b87e9427a36f49e2bbff09fc
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.5936465916611515
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json b/evaluation/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf2222531ac983a48156e1970f8d65c674537dbd
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.5102581072137657
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..379a87308013018cfca827f95c2d6de007e33f92
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.5082726671078756
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/id/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a744a48ddac73d57497c4b08b87f079aad2e00e
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.6267372600926538
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f11bbefc218222908ec2a42115852ed6ed681b9
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.5043017868960953
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/sw/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/sw/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0fe10a78fe6aac57742d14d969b952ac8018df50
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/sw/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "sw",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.5062872270019855
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/sw/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/sw/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..35b1566f3b35800727cf13a3fbeaff78e1630933
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/sw/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "sw",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5069490403706155
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/sw/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/sw/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb2bc16890cf5c81bbf82b34ee11ad935d655008
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/sw/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "sw",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.5446724023825281
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/sw/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/sw/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..768c36e3d52cd6e2674d9bf07bfca70aa03e8749
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/sw/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "sw",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.5029781601588352
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/sw/Story_Continuation_and_Options/results.json b/evaluation/Muennighoff_xstory_cloze/sw/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c8b564ff98d5bb0f2605c76a6cb7d629a48815d
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/sw/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "sw",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.5102581072137657
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/te/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/te/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6e74cbefd9e38f3fccc1e509c9515f9756f38ae
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/te/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "te",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.48643282594308407
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='te', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/te/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/te/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..88425be47be1c1ba91130dc08ed8c2e93a106141
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/te/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "te",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5274652547981469
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='te', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/te/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/te/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..706ba4291dc1bfe1384d3c5209e70ffe671ecb24
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/te/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "te",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.585704831237591
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='te', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/te/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/te/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..59d79028f6e4af619a5cf93169622f1e824ce2a9
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/te/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "te",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.49172733289212445
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='te', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json b/evaluation/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9ace86f692a06b401b85b49dfc551bb338392ba
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.5122435473196558
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e0f52a5a819e44ea7b5464b55b57f98e4b4daf79
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5095962938451357
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ecfaf8dc1a13f4853a555b8317ca59696f99240
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.6254136333553938
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json b/evaluation/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..27981e7c0a8bf4da988b58572d41cbd73752ea16
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.5215089344804765
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json b/evaluation/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a023f9f9ec8f311f9804c6960c0f984b8e7524c
--- /dev/null
+++ b/evaluation/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.5162144275314361
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/en/Replace/results.json b/evaluation/Muennighoff_xwinograd/en/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf56dcd1ffc643d895a189aa32fa4f89d24c3a3f
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/en/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.5075268817204301
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/en/True_or_False/results.json b/evaluation/Muennighoff_xwinograd/en/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..dee32d446832a5eb48712cdde605ccb4d8e614ea
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/en/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5156989247311828
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6b7653a345fb63c59466faf15dc76eb3143096f
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5010752688172043
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/en/stand_for/results.json b/evaluation/Muennighoff_xwinograd/en/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..48a77254ed12dc282065a3af8909a4a44c5285fd
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/en/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5023655913978494
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/en/underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/en/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5efcc4788aaf85d9beb87749887af4d42576f351
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/en/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5083870967741936
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/fr/Replace/results.json b/evaluation/Muennighoff_xwinograd/fr/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..cb712e56a5f8d1e12b5edc32dee4f8e7d8dd7828
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/fr/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.46987951807228917
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/fr/True_or_False/results.json b/evaluation/Muennighoff_xwinograd/fr/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6400baea08e45ec27d9ff34b6808c46785810647
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/fr/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.46987951807228917
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..49091286c0080f4ff011007804f45b79c5f7a251
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5542168674698795
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/fr/stand_for/results.json b/evaluation/Muennighoff_xwinograd/fr/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b31602bf157316874b4e12c9010cc8b525c284c
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/fr/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.4457831325301205
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/fr/underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..97af9956aa6c7efb6c5c0cbdd5e747c84a620a0f
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5421686746987951
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/pt/Replace/results.json b/evaluation/Muennighoff_xwinograd/pt/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f95f67d9eb59b8a69bd3946bf50dca8ddd81c07a
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/pt/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.5057034220532319
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/pt/True_or_False/results.json b/evaluation/Muennighoff_xwinograd/pt/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8abc8dab9668fb5dd9eb4d2007292dc0a558c20
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/pt/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5475285171102662
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..153df2e986758f2d27a8d2831d87e41ad939ecab
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.49809885931558934
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/pt/stand_for/results.json b/evaluation/Muennighoff_xwinograd/pt/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2cc72cd82b131b05bc66b13582ccbfaaad4e8ab8
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/pt/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5209125475285171
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/pt/underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd701cde00947c2a094c0baa046e1b238390c2a6
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5095057034220533
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/zh/Replace/results.json b/evaluation/Muennighoff_xwinograd/zh/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8ce09e330f06a98a880833dce3aa6f61235d2f5
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/zh/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.5178571428571429
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/zh/True_or_False/results.json b/evaluation/Muennighoff_xwinograd/zh/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..996c738ce8b55330d8936773084b80b1c5f690d0
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/zh/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..16964ba0f6f10cb502adcc2f528e93e40acef294
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.48412698412698413
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/zh/stand_for/results.json b/evaluation/Muennighoff_xwinograd/zh/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea49905746e3d69050c90f32e863086a46075db7
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/zh/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5158730158730159
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/Muennighoff_xwinograd/zh/underscore_refer_to/results.json b/evaluation/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..44f142815f9e4906f3626f729399f3bf61c9b1dc
--- /dev/null
+++ b/evaluation/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.501984126984127
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r1/GPT-3_style/results.json b/evaluation/anli/dev_r1/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..de9e322a7ff9b20b32a201ae4c310f3c9a4cf518
--- /dev/null
+++ b/evaluation/anli/dev_r1/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.331
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r1/MNLI_crowdsource/results.json b/evaluation/anli/dev_r1/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8850e9cdb9589f49589a7ac3dabb5b507c975e38
--- /dev/null
+++ b/evaluation/anli/dev_r1/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.334
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r1/can_we_infer/results.json b/evaluation/anli/dev_r1/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fee456c05b6ffc6c517d280cc48b009234831a8
--- /dev/null
+++ b/evaluation/anli/dev_r1/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.342
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r1/guaranteed_possible_impossible/results.json b/evaluation/anli/dev_r1/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..47c3215e4205e1e31c70838534b8ced7c11a1549
--- /dev/null
+++ b/evaluation/anli/dev_r1/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.336
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r1/justified_in_saying/results.json b/evaluation/anli/dev_r1/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2df3490193c7cb90478496fc7cf011d1dabd1ffa
--- /dev/null
+++ b/evaluation/anli/dev_r1/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.332
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r2/GPT-3_style/results.json b/evaluation/anli/dev_r2/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f64c5c129290f38956089358a50a7184d6790258
--- /dev/null
+++ b/evaluation/anli/dev_r2/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.338
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r2/MNLI_crowdsource/results.json b/evaluation/anli/dev_r2/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1fe8f15360179c6424087a13fede3795820e8c0
--- /dev/null
+++ b/evaluation/anli/dev_r2/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.334
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r2/can_we_infer/results.json b/evaluation/anli/dev_r2/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..70ccfac7b567247e4718327e93d5526a90ea6654
--- /dev/null
+++ b/evaluation/anli/dev_r2/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.324
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r2/guaranteed_possible_impossible/results.json b/evaluation/anli/dev_r2/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a179e4a9cbb19b21e64a4a5a91667def2c0b46e4
--- /dev/null
+++ b/evaluation/anli/dev_r2/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.321
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r2/justified_in_saying/results.json b/evaluation/anli/dev_r2/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5c0275d3fad1af2a1b627051d667eb28b282d6b
--- /dev/null
+++ b/evaluation/anli/dev_r2/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.341
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r3/GPT-3_style/results.json b/evaluation/anli/dev_r3/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9179d7985cfad8f73836bdcac9e6db55678cbf5
--- /dev/null
+++ b/evaluation/anli/dev_r3/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3375
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r3/MNLI_crowdsource/results.json b/evaluation/anli/dev_r3/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a165b056afe551fdc3e5f25f3155392b2dc07890
--- /dev/null
+++ b/evaluation/anli/dev_r3/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.335
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r3/can_we_infer/results.json b/evaluation/anli/dev_r3/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e3ce8a489ce30eb95c55b452cd2cc596a453d60
--- /dev/null
+++ b/evaluation/anli/dev_r3/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.33416666666666667
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r3/guaranteed_possible_impossible/results.json b/evaluation/anli/dev_r3/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..45eefe0ece81b7f3aea560e7df9af246b268641a
--- /dev/null
+++ b/evaluation/anli/dev_r3/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.33166666666666667
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/anli/dev_r3/justified_in_saying/results.json b/evaluation/anli/dev_r3/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c00aed61a27297f87c5eac6ab92daca1651b9bd
--- /dev/null
+++ b/evaluation/anli/dev_r3/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3566666666666667
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/story_cloze/2016/Answer_Given_options/results.json b/evaluation/story_cloze/2016/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..edc01b21ef22e78ddca180e402008cc2e05a723c
--- /dev/null
+++ b/evaluation/story_cloze/2016/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.5195082843399251
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/story_cloze/2016/Choose_Story_Ending/results.json b/evaluation/story_cloze/2016/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b39c3aae7798f1698fcf54998a389ac9021ad627
--- /dev/null
+++ b/evaluation/story_cloze/2016/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.5456974879743453
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/story_cloze/2016/Generate_Ending/results.json b/evaluation/story_cloze/2016/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd9a38392e5ac57a564d2708c10340c8a0f0a0ec
--- /dev/null
+++ b/evaluation/story_cloze/2016/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.7087119187600214
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/story_cloze/2016/Novel_Correct_Ending/results.json b/evaluation/story_cloze/2016/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c0efa6f35f7d66a198701714605ac8bce0daf12
--- /dev/null
+++ b/evaluation/story_cloze/2016/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.5104222340994121
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/cb/GPT-3_style/results.json b/evaluation/super_glue/cb/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..795ce37f0bff6ec220331330559dbb4ee84048db
--- /dev/null
+++ b/evaluation/super_glue/cb/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.48214285714285715
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/cb/MNLI_crowdsource/results.json b/evaluation/super_glue/cb/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..eae4111a69ad840de0deb691a0ddf345eb4c02ac
--- /dev/null
+++ b/evaluation/super_glue/cb/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.4107142857142857
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/cb/can_we_infer/results.json b/evaluation/super_glue/cb/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a1029984ea23a3ad4844f612c38bc0c0026c81c
--- /dev/null
+++ b/evaluation/super_glue/cb/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3392857142857143
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/cb/guaranteed_possible_impossible/results.json b/evaluation/super_glue/cb/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..81a13815b6eb94163a20743cf9f595bf145606a1
--- /dev/null
+++ b/evaluation/super_glue/cb/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.42857142857142855
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/cb/justified_in_saying/results.json b/evaluation/super_glue/cb/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8aec71293c07812907980ba6de82128e6741219
--- /dev/null
+++ b/evaluation/super_glue/cb/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.25
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/copa/C1_or_C2?_premise/results.json b/evaluation/super_glue/copa/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..700ba7cc6d2ae1d0126f6c8729eab785ca5e51fa
--- /dev/null
+++ b/evaluation/super_glue/copa/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.71
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/copa/best_option/results.json b/evaluation/super_glue/copa/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..41094907ea54e800b7a33e47570b1e39c5901565
--- /dev/null
+++ b/evaluation/super_glue/copa/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.56
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/copa/cause_effect/results.json b/evaluation/super_glue/copa/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..df1f83571d5a4d6ef1889460125e07954b2322c3
--- /dev/null
+++ b/evaluation/super_glue/copa/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.62
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/copa/i_am_hesitating/results.json b/evaluation/super_glue/copa/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bdb0b8ca6c2874e0a9d249aa614e4f3bf94f40a5
--- /dev/null
+++ b/evaluation/super_glue/copa/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.53
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/copa/plausible_alternatives/results.json b/evaluation/super_glue/copa/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..db99882f481b05973e31c7b252899d30b4bad4f4
--- /dev/null
+++ b/evaluation/super_glue/copa/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.6
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/rte/GPT-3_style/results.json b/evaluation/super_glue/rte/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9c453b330e1ca95a8bd3b87489cfa9c7ce0d32e
--- /dev/null
+++ b/evaluation/super_glue/rte/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5054151624548736
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/rte/MNLI_crowdsource/results.json b/evaluation/super_glue/rte/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f13c0ee6dcd39f930732d7143b937041e2647af
--- /dev/null
+++ b/evaluation/super_glue/rte/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.49458483754512633
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/rte/does_it_follow_that/results.json b/evaluation/super_glue/rte/does_it_follow_that/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b59090909e7f5a16fb74a2903a323f32dcc8e52d
--- /dev/null
+++ b/evaluation/super_glue/rte/does_it_follow_that/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "does it follow that",
+ "evaluation": {
+ "accuracy": 0.4729241877256318
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/rte/guaranteed_true/results.json b/evaluation/super_glue/rte/guaranteed_true/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b5d230eb3e4dc3884e8c16f7c594edb3be76b38
--- /dev/null
+++ b/evaluation/super_glue/rte/guaranteed_true/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "guaranteed true",
+ "evaluation": {
+ "accuracy": 0.5018050541516246
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/super_glue/rte/should_assume/results.json b/evaluation/super_glue/rte/should_assume/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a4c0f23a4f635252986a1c6cd3d4c4b49d0aaad
--- /dev/null
+++ b/evaluation/super_glue/rte/should_assume/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "should assume",
+ "evaluation": {
+ "accuracy": 0.5306859205776173
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/winogrande/winogrande_xl/Replace/results.json b/evaluation/winogrande/winogrande_xl/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..badf7b273c2e723fbeea7990196712df34ae28cd
--- /dev/null
+++ b/evaluation/winogrande/winogrande_xl/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.5114443567482242
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/winogrande/winogrande_xl/True_or_False/results.json b/evaluation/winogrande/winogrande_xl/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a2c40de97cd5f11c4c56fdbfe4d1873e924be65
--- /dev/null
+++ b/evaluation/winogrande/winogrande_xl/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.4956590370955012
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/winogrande/winogrande_xl/does_underscore_refer_to/results.json b/evaluation/winogrande/winogrande_xl/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b2523b85b39888c41b3b497c4e14f28af57d24d
--- /dev/null
+++ b/evaluation/winogrande/winogrande_xl/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.4988161010260458
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/winogrande/winogrande_xl/stand_for/results.json b/evaluation/winogrande/winogrande_xl/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3706200dcf0e752b4db344c01bbb6dee6cfb998b
--- /dev/null
+++ b/evaluation/winogrande/winogrande_xl/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5082872928176796
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/winogrande/winogrande_xl/underscore_refer_to/results.json b/evaluation/winogrande/winogrande_xl/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1e7f27c34a7c153c96517265ca701827d1f09bac
--- /dev/null
+++ b/evaluation/winogrande/winogrande_xl/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5177584846093133
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/ar/MNLI_crowdsource/results.json b/evaluation/xnli/ar/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..45bc46267e922a5d5085010aa9774fa82db3d49a
--- /dev/null
+++ b/evaluation/xnli/ar/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/en/GPT-3_style/results.json b/evaluation/xnli/en/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f274d67efcf60860c92a771acdedbd2862076b4
--- /dev/null
+++ b/evaluation/xnli/en/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3349397590361446
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/en/MNLI_crowdsource/results.json b/evaluation/xnli/en/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d62811a064b16168f4e5d9827f1e501b9b953928
--- /dev/null
+++ b/evaluation/xnli/en/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3325301204819277
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/en/can_we_infer/results.json b/evaluation/xnli/en/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6013d6484d4a6f692a47cc0272652c8bfe61124
--- /dev/null
+++ b/evaluation/xnli/en/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3538152610441767
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/en/guaranteed_possible_impossible/results.json b/evaluation/xnli/en/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c3a64cf90a63de4a3d61fc8618c4b1f7da1b938f
--- /dev/null
+++ b/evaluation/xnli/en/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.35943775100401604
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/en/justified_in_saying/results.json b/evaluation/xnli/en/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f009a1582716c333f3a36dc9eedeef84b3199a58
--- /dev/null
+++ b/evaluation/xnli/en/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3502008032128514
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/es/GPT-3_style/results.json b/evaluation/xnli/es/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..89be2a4e7f50a9e49ef686ad52e58233c44a78e4
--- /dev/null
+++ b/evaluation/xnli/es/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3353413654618474
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/es/can_we_infer/results.json b/evaluation/xnli/es/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a25ebb29f5bb3dc7cfcf4de84525745c699ac53
--- /dev/null
+++ b/evaluation/xnli/es/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3449799196787149
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/es/justified_in_saying/results.json b/evaluation/xnli/es/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..efd13dc76db8505aaed31e41fd04aec2c42d343a
--- /dev/null
+++ b/evaluation/xnli/es/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3497991967871486
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/fr/GPT-3_style/results.json b/evaluation/xnli/fr/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..33f32c8969ab9f7b606ea809e396c8561d4ebff5
--- /dev/null
+++ b/evaluation/xnli/fr/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/fr/MNLI_crowdsource/results.json b/evaluation/xnli/fr/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9609370ef404346965aa9a2b8de6e7d4c6bc210a
--- /dev/null
+++ b/evaluation/xnli/fr/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/fr/can_we_infer/results.json b/evaluation/xnli/fr/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..de48b86cb8b02cff6aac4f84269731c4ca37518e
--- /dev/null
+++ b/evaluation/xnli/fr/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3530120481927711
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/fr/guaranteed_possible_impossible/results.json b/evaluation/xnli/fr/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e842b9aabbdaaf675819e6e70e41d943113813ff
--- /dev/null
+++ b/evaluation/xnli/fr/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.3449799196787149
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/fr/justified_in_saying/results.json b/evaluation/xnli/fr/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..20392501423ba2a10e13f53f6e25dc85a292b1ae
--- /dev/null
+++ b/evaluation/xnli/fr/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3449799196787149
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/hi/GPT-3_style/results.json b/evaluation/xnli/hi/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5bd36d9fae87bf8c5141b18698c116a5229deac
--- /dev/null
+++ b/evaluation/xnli/hi/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3329317269076305
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/hi/MNLI_crowdsource/results.json b/evaluation/xnli/hi/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..96abf643d336f8f596945c569e143b886ac54a32
--- /dev/null
+++ b/evaluation/xnli/hi/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.334136546184739
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/hi/can_we_infer/results.json b/evaluation/xnli/hi/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e2595953edfcc7a8881d8809c9e417c292daa25
--- /dev/null
+++ b/evaluation/xnli/hi/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3534136546184739
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/hi/guaranteed_possible_impossible/results.json b/evaluation/xnli/hi/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c25bdb160f82ba6f5b432210b1a753ebb4e1a3e0
--- /dev/null
+++ b/evaluation/xnli/hi/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.3317269076305221
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/hi/justified_in_saying/results.json b/evaluation/xnli/hi/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b857fc0c0e00d8cddbceb8b596f672a59cb5ed7c
--- /dev/null
+++ b/evaluation/xnli/hi/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.344578313253012
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/sw/GPT-3_style/results.json b/evaluation/xnli/sw/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2698e1d44185ee818e1c9cfb60df1f65191ee8b2
--- /dev/null
+++ b/evaluation/xnli/sw/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3337349397590361
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/sw/MNLI_crowdsource/results.json b/evaluation/xnli/sw/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..32c6c3f4f554ccb39f5cca91bacaa7798daad61d
--- /dev/null
+++ b/evaluation/xnli/sw/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3401606425702811
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/ur/GPT-3_style/results.json b/evaluation/xnli/ur/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..66a7fe493bfea5d2c353598e1eaae3e41333f114
--- /dev/null
+++ b/evaluation/xnli/ur/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/ur/MNLI_crowdsource/results.json b/evaluation/xnli/ur/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..245d51e4683c69c511ab8f4bb19b8f911374071a
--- /dev/null
+++ b/evaluation/xnli/ur/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3353413654618474
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/ur/can_we_infer/results.json b/evaluation/xnli/ur/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..34db4b841e4fb3a2eafb5294cbf7baeb8a9efba2
--- /dev/null
+++ b/evaluation/xnli/ur/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3449799196787149
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/ur/guaranteed_possible_impossible/results.json b/evaluation/xnli/ur/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1eafda43d0608d32720daaac59a72483017b451a
--- /dev/null
+++ b/evaluation/xnli/ur/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.3345381526104418
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/ur/justified_in_saying/results.json b/evaluation/xnli/ur/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..20c6e79c12c5370dcf330d9225e17925d6726119
--- /dev/null
+++ b/evaluation/xnli/ur/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3365461847389558
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/vi/GPT-3_style/results.json b/evaluation/xnli/vi/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e678bfa6132f07f7ec0118c535a3b1039c71e07
--- /dev/null
+++ b/evaluation/xnli/vi/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.3337349397590361
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/vi/can_we_infer/results.json b/evaluation/xnli/vi/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..959fdc005e0acc06690418f83a416049cf4aec02
--- /dev/null
+++ b/evaluation/xnli/vi/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3429718875502008
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/vi/justified_in_saying/results.json b/evaluation/xnli/vi/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d91d5796c87d2d4060a2e738e79ef6a1f23b9e6
--- /dev/null
+++ b/evaluation/xnli/vi/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3409638554216867
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/zh/can_we_infer/results.json b/evaluation/xnli/zh/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6cc0e5fd3cb1a865aa6c4f910c2b7ae12da2d14d
--- /dev/null
+++ b/evaluation/xnli/zh/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.3477911646586345
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/zh/guaranteed_possible_impossible/results.json b/evaluation/xnli/zh/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a776babe08366608051e59ff915611890ada37a
--- /dev/null
+++ b/evaluation/xnli/zh/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.3325301204819277
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation/xnli/zh/justified_in_saying/results.json b/evaluation/xnli/zh/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a27fc50b7d5aba0e363e5ca1d66a6f8ac5b9ef6
--- /dev/null
+++ b/evaluation/xnli/zh/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.3582329317269076
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000', nospace=False, output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/tasky_global_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/pytorch_model.bin b/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9bf8129c9e2831be6b254c06be785bf02483c1d4
--- /dev/null
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:842ed99ea2c0a5389fad4876c4519253ae0f02ca9dfd4eeebc62e860fa65014e
+size 28276195547
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..25bc39604f72700b3b8e10bd69bb2f227157edd1
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1 @@
+{"bos_token": "", "eos_token": "", "unk_token": "", "pad_token": ""}
\ No newline at end of file
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..370bd68e20b4b6574ee05b213a74b244e3f492f3
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fa39cd4b1500feb205bcce3b9703a4373414cafe4970e0657b413f7ddd2a9d3
+size 14500438
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c682566982d68ea5e729ba312051cb8f33947ba
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1 @@
+{"unk_token": "", "eos_token": "", "bos_token": "", "pad_token": "", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class": "BloomTokenizerFast", "padding_side": "left"}
\ No newline at end of file