thibaud frere committed
Commit 85e5ca8 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,21 @@
+ # Python
+ __pycache__
+ *.py[cod]
+ *.so
+ .Python
+ env/
+ venv/
+ *.egg-info/
+ dist/
+ build/
+ *.egg
+ .idea/
+ .vscode/
+ *.swp
+ .DS_Store
+ # Node
+ node_modules/
+ *.log
+ *.env
+ *.cache
+
Dockerfile ADDED
@@ -0,0 +1,40 @@
+ # Use an official Node runtime as the base image for building the application
+ FROM node:20 AS build
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy package.json and package-lock.json
+ COPY app/package*.json ./
+
+ # Install dependencies
+ RUN npm install
+
+ # Copy the rest of the application code
+ COPY app/ .
+ COPY analysis/data ../analysis/data
+
+ # Build the application
+ RUN npm run build
+
+ # Use an official Nginx runtime as the base image for serving the application
+ FROM nginx:alpine
+
+ # Copy the built application from the build stage
+ COPY --from=build /app/dist /usr/share/nginx/html
+
+ # Copy a custom Nginx configuration file
+ COPY nginx.conf /etc/nginx/nginx.conf
+
+ # Create necessary directories and set permissions
+ RUN mkdir -p /var/cache/nginx /var/run /var/log/nginx && \
+     chmod -R 777 /var/cache/nginx /var/run /var/log/nginx /etc/nginx/nginx.conf
+
+ # Switch to non-root user
+ USER nginx
+
+ # Expose port 8080
+ EXPOSE 8080
+
+ # Command to run the application
+ CMD ["nginx", "-g", "daemon off;"]
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: 'Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks'
+ emoji: 📝
+ colorFrom: blue
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ header: mini
+ app_port: 8080
+ thumbnail: https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks/resolve/main/app/assets/images/banner.png
+ ---
app/.gitignore ADDED
@@ -0,0 +1,4 @@
+ node_modules/
+ *.log
+ *.env
+ *.cache
app/README.md ADDED
@@ -0,0 +1,3 @@
+ ## How to run
+ 1. `npm install`
+ 2. `npm run dev`
app/package.json ADDED
Binary file (1.02 kB)
app/src/bibliography.bib ADDED
@@ -0,0 +1,398 @@
+ @misc{penedo2024finewebdatasetsdecantingweb,
+   title={The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale},
+   author={Guilherme Penedo and Hynek Kydlíček and Loubna Ben Allal and Anton Lozhkov and Margaret Mitchell and Colin Raffel and Leandro Von Werra and Thomas Wolf},
+   year={2024},
+   eprint={2406.17557},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL},
+   url={https://arxiv.org/abs/2406.17557},
+ }
+ @article{hendryckstest2021,
+   title={Measuring Massive Multitask Language Understanding},
+   author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+   journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+   year={2021}
+ }
+ @inproceedings{zellers2019hellaswag,
+   title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+   author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
+   booktitle={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
+   year={2019}
+ }
+ @misc{madaan2024quantifyingvarianceevaluationbenchmarks,
+   title={Quantifying Variance in Evaluation Benchmarks},
+   author={Lovish Madaan and Aaditya K. Singh and Rylan Schaeffer and Andrew Poulton and Sanmi Koyejo and Pontus Stenetorp and Sharan Narang and Dieuwke Hupkes},
+   year={2024},
+   eprint={2406.10229},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG},
+   url={https://arxiv.org/abs/2406.10229},
+ }
+ @misc{open-llm-leaderboard-v2,
+   author = {Clémentine Fourrier and Nathan Habib and Alina Lozovskaya and Konrad Szafer and Thomas Wolf},
+   title = {Open LLM Leaderboard v2},
+   year = {2024},
+   publisher = {Hugging Face},
+   howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}",
+ }
+ @misc{biderman2024lessonstrenchesreproducibleevaluation,
+   title={Lessons from the Trenches on Reproducible Evaluation of Language Models},
+   author={Stella Biderman and Hailey Schoelkopf and Lintang Sutawika and Leo Gao and Jonathan Tow and Baber Abbasi and Alham Fikri Aji and Pawan Sasanka Ammanamanchi and Sidney Black and Jordan Clive and Anthony DiPofi and Julen Etxaniz and Benjamin Fattori and Jessica Zosa Forde and Charles Foster and Jeffrey Hsu and Mimansa Jaiswal and Wilson Y. Lee and Haonan Li and Charles Lovering and Niklas Muennighoff and Ellie Pavlick and Jason Phang and Aviya Skowron and Samson Tan and Xiangru Tang and Kevin A. Wang and Genta Indra Winata and François Yvon and Andy Zou},
+   year={2024},
+   eprint={2405.14782},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL},
+   url={https://arxiv.org/abs/2405.14782},
+ }
+ @misc{li2024datacomplmsearchgenerationtraining,
+   title={DataComp-LM: In search of the next generation of training sets for language models},
+   author={Jeffrey Li and Alex Fang and Georgios Smyrnis and Maor Ivgi and Matt Jordan and Samir Gadre and Hritik Bansal and Etash Guha and Sedrick Keh and Kushal Arora and Saurabh Garg and Rui Xin and Niklas Muennighoff and Reinhard Heckel and Jean Mercat and Mayee Chen and Suchin Gururangan and Mitchell Wortsman and Alon Albalak and Yonatan Bitton and Marianna Nezhurina and Amro Abbas and Cheng-Yu Hsieh and Dhruba Ghosh and Josh Gardner and Maciej Kilian and Hanlin Zhang and Rulin Shao and Sarah Pratt and Sunny Sanyal and Gabriel Ilharco and Giannis Daras and Kalyani Marathe and Aaron Gokaslan and Jieyu Zhang and Khyathi Chandu and Thao Nguyen and Igor Vasiljevic and Sham Kakade and Shuran Song and Sujay Sanghavi and Fartash Faghri and Sewoong Oh and Luke Zettlemoyer and Kyle Lo and Alaaeldin El-Nouby and Hadi Pouransari and Alexander Toshev and Stephanie Wang and Dirk Groeneveld and Luca Soldaini and Pang Wei Koh and Jenia Jitsev and Thomas Kollar and Alexandros G. Dimakis and Yair Carmon and Achal Dave and Ludwig Schmidt and Vaishaal Shankar},
+   year={2024},
+   eprint={2406.11794},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG},
+   url={https://arxiv.org/abs/2406.11794},
+ }
+ @misc{gu2024olmesstandardlanguagemodel,
+   title={OLMES: A Standard for Language Model Evaluations},
+   author={Yuling Gu and Oyvind Tafjord and Bailey Kuehl and Dany Haddad and Jesse Dodge and Hannaneh Hajishirzi},
+   year={2024},
+   eprint={2406.08446},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL},
+   url={https://arxiv.org/abs/2406.08446},
+ }
+ @article{radford2019language,
+   title={Language Models are Unsupervised Multitask Learners},
+   author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
+   year={2019}
+ }
+ @inproceedings{barbaresi-2021-trafilatura,
+   title = {Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction},
+   author = "Barbaresi, Adrien",
+   booktitle = "Proceedings of the Joint Conference of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations",
+   pages = "122--131",
+   publisher = "Association for Computational Linguistics",
+   url = "https://aclanthology.org/2021.acl-demo.15",
+   year = 2021,
+ }
+ @misc{penedo2023refinedweb,
+   title={The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data, and Web Data Only},
+   author={Guilherme Penedo and Quentin Malartic and Daniel Hesslow and Ruxandra Cojocaru and Alessandro Cappelli and Hamza Alobeidli and Baptiste Pannier and Ebtesam Almazrouei and Julien Launay},
+   year={2023},
+   eprint={2306.01116},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @article{joulin2016fasttext,
+   title={FastText.zip: Compressing text classification models},
+   author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
+   journal={arXiv preprint arXiv:1612.03651},
+   year={2016}
+ }
+ @article{joulin2016bag,
+   title={Bag of Tricks for Efficient Text Classification},
+   author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
+   journal={arXiv preprint arXiv:1607.01759},
+   year={2016}
+ }
+ @misc{penedo2024datatrove,
+   author = {Penedo, Guilherme and Kydlíček, Hynek and Cappelli, Alessandro and Sasko, Mario and Wolf, Thomas},
+   title = {DataTrove: large scale data processing},
+   year = {2024},
+   publisher = {GitHub},
+   journal = {GitHub repository},
+   url = {https://github.com/huggingface/datatrove}
+ }
+ @misc{chiang2024chatbot,
+   title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
+   author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
+   year={2024},
+   eprint={2403.04132},
+   archivePrefix={arXiv},
+   primaryClass={cs.AI}
+ }
+ @misc{rae2022scaling,
+   title={Scaling Language Models: Methods, Analysis & Insights from Training Gopher},
+   author={Jack W. Rae and Sebastian Borgeaud and Trevor Cai and Katie Millican and Jordan Hoffmann and Francis Song and John Aslanides and Sarah Henderson and Roman Ring and Susannah Young and Eliza Rutherford and Tom Hennigan and Jacob Menick and Albin Cassirer and Richard Powell and George van den Driessche and Lisa Anne Hendricks and Maribeth Rauh and Po-Sen Huang and Amelia Glaese and Johannes Welbl and Sumanth Dathathri and Saffron Huang and Jonathan Uesato and John Mellor and Irina Higgins and Antonia Creswell and Nat McAleese and Amy Wu and Erich Elsen and Siddhant Jayakumar and Elena Buchatskaya and David Budden and Esme Sutherland and Karen Simonyan and Michela Paganini and Laurent Sifre and Lena Martens and Xiang Lorraine Li and Adhiguna Kuncoro and Aida Nematzadeh and Elena Gribovskaya and Domenic Donato and Angeliki Lazaridou and Arthur Mensch and Jean-Baptiste Lespiau and Maria Tsimpoukelli and Nikolai Grigorev and Doug Fritz and Thibault Sottiaux and Mantas Pajarskas and Toby Pohlen and Zhitao Gong and Daniel Toyama and Cyprien de Masson d'Autume and Yujia Li and Tayfun Terzi and Vladimir Mikulik and Igor Babuschkin and Aidan Clark and Diego de Las Casas and Aurelia Guy and Chris Jones and James Bradbury and Matthew Johnson and Blake Hechtman and Laura Weidinger and Iason Gabriel and William Isaac and Ed Lockhart and Simon Osindero and Laura Rimell and Chris Dyer and Oriol Vinyals and Kareem Ayoub and Jeff Stanway and Lorrayne Bennett and Demis Hassabis and Koray Kavukcuoglu and Geoffrey Irving},
+   year={2022},
+   eprint={2112.11446},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{lee2022deduplicating,
+   title={Deduplicating Training Data Makes Language Models Better},
+   author={Katherine Lee and Daphne Ippolito and Andrew Nystrom and Chiyuan Zhang and Douglas Eck and Chris Callison-Burch and Nicholas Carlini},
+   year={2022},
+   eprint={2107.06499},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{carlini2023quantifying,
+   title={Quantifying Memorization Across Neural Language Models},
+   author={Nicholas Carlini and Daphne Ippolito and Matthew Jagielski and Katherine Lee and Florian Tramer and Chiyuan Zhang},
+   year={2023},
+   eprint={2202.07646},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG}
+ }
+ @misc{raffel2023exploring,
+   title={Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+   author={Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+   year={2023},
+   eprint={1910.10683},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG}
+ }
+ @misc{touvron2023llama,
+   title={LLaMA: Open and Efficient Foundation Language Models},
+   author={Hugo Touvron and Thibaut Lavril and Gautier Izacard and Xavier Martinet and Marie-Anne Lachaux and Timothée Lacroix and Baptiste Rozière and Naman Goyal and Eric Hambro and Faisal Azhar and Aurelien Rodriguez and Armand Joulin and Edouard Grave and Guillaume Lample},
+   year={2023},
+   eprint={2302.13971},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @article{dolma,
+   title = {Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research},
+   author={
+     Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and
+     Russell Authur and Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and
+     Valentin Hofmann and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu and
+     Nathan Lambert and Ian Magnusson and Jacob Morrison and Niklas Muennighoff and Aakanksha Naik and
+     Crystal Nam and Matthew E. Peters and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and
+     Emma Strubell and Nishant Subramani and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and
+     Noah A. Smith and Hannaneh Hajishirzi and Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo
+   },
+   year = {2024},
+   journal={arXiv preprint},
+ }
+ @article{gao2020pile,
+   title={The {P}ile: An 800{GB} dataset of diverse text for language modeling},
+   author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and others},
+   journal={arXiv preprint arXiv:2101.00027},
+   year={2020}
+ }
+ @misc{cerebras2023slimpajama,
+   author = {Soboleva, Daria and Al-Khateeb, Faisal and Myers, Robert and Steeves, Jacob R and Hestness, Joel and Dey, Nolan},
+   title = {SlimPajama: A 627B token cleaned and deduplicated version of RedPajama},
+   month = {June},
+   year = 2023,
+   url = {https://huggingface.co/datasets/cerebras/SlimPajama-627B},
+ }
+ @software{together2023redpajama,
+   author = {Together Computer},
+   title = {RedPajama: an Open Dataset for Training Large Language Models},
+   month = {October},
+   year = 2023,
+   url = {https://github.com/togethercomputer/RedPajama-Data}
+ }
+ @article{jaccard1912distribution,
+   title={The distribution of the flora in the alpine zone. 1},
+   author={Jaccard, Paul},
+   journal={New Phytologist},
+   volume={11},
+   number={2},
+   pages={37--50},
+   year={1912},
+   publisher={Wiley Online Library}
+ }
+ @misc{albalak2024survey,
+   title={A Survey on Data Selection for Language Models},
+   author={Alon Albalak and Yanai Elazar and Sang Michael Xie and Shayne Longpre and Nathan Lambert and Xinyi Wang and Niklas Muennighoff and Bairu Hou and Liangming Pan and Haewon Jeong and Colin Raffel and Shiyu Chang and Tatsunori Hashimoto and William Yang Wang},
+   year={2024},
+   eprint={2402.16827},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{longpre2023pretrainers,
+   title={A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity},
+   author={Shayne Longpre and Gregory Yauney and Emily Reif and Katherine Lee and Adam Roberts and Barret Zoph and Denny Zhou and Jason Wei and Kevin Robinson and David Mimno and Daphne Ippolito},
+   year={2023},
+   eprint={2305.13169},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{wenzek2019ccnet,
+   title={CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data},
+   author={Guillaume Wenzek and Marie-Anne Lachaux and Alexis Conneau and Vishrav Chaudhary and Francisco Guzmán and Armand Joulin and Edouard Grave},
+   year={2019},
+   eprint={1911.00359},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{soldaini2024dolma,
+   title={Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research},
+   author={Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and Russell Authur and Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and Valentin Hofmann and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu and Nathan Lambert and Ian Magnusson and Jacob Morrison and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and Noah A. Smith and Hannaneh Hajishirzi and Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo},
+   year={2024},
+   eprint={2402.00159},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{ouyang2022training,
+   title={Training language models to follow instructions with human feedback},
+   author={Long Ouyang and Jeff Wu and Xu Jiang and Diogo Almeida and Carroll L. Wainwright and Pamela Mishkin and Chong Zhang and Sandhini Agarwal and Katarina Slama and Alex Ray and John Schulman and Jacob Hilton and Fraser Kelton and Luke Miller and Maddie Simens and Amanda Askell and Peter Welinder and Paul Christiano and Jan Leike and Ryan Lowe},
+   year={2022},
+   eprint={2203.02155},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{hoffmann2022training,
+   title={Training Compute-Optimal Large Language Models},
+   author={Jordan Hoffmann and Sebastian Borgeaud and Arthur Mensch and Elena Buchatskaya and Trevor Cai and Eliza Rutherford and Diego de Las Casas and Lisa Anne Hendricks and Johannes Welbl and Aidan Clark and Tom Hennigan and Eric Noland and Katie Millican and George van den Driessche and Bogdan Damoc and Aurelia Guy and Simon Osindero and Karen Simonyan and Erich Elsen and Jack W. Rae and Oriol Vinyals and Laurent Sifre},
+   year={2022},
+   eprint={2203.15556},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{muennighoff2023scaling,
+   title={Scaling Data-Constrained Language Models},
+   author={Niklas Muennighoff and Alexander M. Rush and Boaz Barak and Teven Le Scao and Aleksandra Piktus and Nouamane Tazi and Sampo Pyysalo and Thomas Wolf and Colin Raffel},
+   year={2023},
+   eprint={2305.16264},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{hernandez2022scaling,
+   title={Scaling Laws and Interpretability of Learning from Repeated Data},
+   author={Danny Hernandez and Tom Brown and Tom Conerly and Nova DasSarma and Dawn Drain and Sheer El-Showk and Nelson Elhage and Zac Hatfield-Dodds and Tom Henighan and Tristan Hume and Scott Johnston and Ben Mann and Chris Olah and Catherine Olsson and Dario Amodei and Nicholas Joseph and Jared Kaplan and Sam McCandlish},
+   year={2022},
+   eprint={2205.10487},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG}
+ }
+ @article{llama3modelcard,
+   title={Llama 3 Model Card},
+   author={AI@Meta},
+   year={2024},
+   url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
+ }
+ @misc{jiang2024mixtral,
+   title={Mixtral of Experts},
+   author={Albert Q. Jiang and Alexandre Sablayrolles and Antoine Roux and Arthur Mensch and Blanche Savary and Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and Emma Bou Hanna and Florian Bressand and Gianna Lengyel and Guillaume Bour and Guillaume Lample and Lélio Renard Lavaud and Lucile Saulnier and Marie-Anne Lachaux and Pierre Stock and Sandeep Subramanian and Sophia Yang and Szymon Antoniak and Teven Le Scao and Théophile Gervet and Thibaut Lavril and Thomas Wang and Timothée Lacroix and William El Sayed},
+   year={2024},
+   eprint={2401.04088},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG}
+ }
+ @article{yuan2024self,
+   title={Self-rewarding language models},
+   author={Yuan, Weizhe and Pang, Richard Yuanzhe and Cho, Kyunghyun and Sukhbaatar, Sainbayar and Xu, Jing and Weston, Jason},
+   journal={arXiv preprint arXiv:2401.10020},
+   year={2024}
+ }
+ @article{verga2024replacing,
+   title={Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models},
+   author={Verga, Pat and Hofstatter, Sebastian and Althammer, Sophia and Su, Yixuan and Piktus, Aleksandra and Arkhangorodsky, Arkady and Xu, Minjie and White, Naomi and Lewis, Patrick},
+   journal={arXiv preprint arXiv:2404.18796},
+   year={2024}
+ }
+ @article{abdin2024phi,
+   title={Phi-3 technical report: A highly capable language model locally on your phone},
+   author={Abdin, Marah and Jacobs, Sam Ade and Awan, Ammar Ahmad and Aneja, Jyoti and Awadallah, Ahmed and Awadalla, Hany and Bach, Nguyen and Bahree, Amit and Bakhtiari, Arash and Behl, Harkirat and others},
+   journal={arXiv preprint arXiv:2404.14219},
+   year={2024}
+ }
+ @misc{meta2024responsible,
+   title = {Our responsible approach to Meta AI and Meta Llama 3},
+   author = {Meta},
+   year = {2024},
+   url = {https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/},
+   note = {Accessed: 2024-05-31}
+ }
+ @inproceedings{talmor-etal-2019-commonsenseqa,
+   title = "CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge",
+   author = "Talmor, Alon and
+     Herzig, Jonathan and
+     Lourie, Nicholas and
+     Berant, Jonathan",
+   booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
+   month = jun,
+   year = "2019",
+   address = "Minneapolis, Minnesota",
+   publisher = "Association for Computational Linguistics",
+   url = "https://aclanthology.org/N19-1421",
+   doi = "10.18653/v1/N19-1421",
+   pages = "4149--4158",
+   archivePrefix = "arXiv",
+   eprint = "1811.00937",
+   primaryClass = "cs",
+ }
+ @inproceedings{zellers-etal-2019-hellaswag,
+   title = "HellaSwag: Can a Machine Really Finish Your Sentence?",
+   author = "Zellers, Rowan and
+     Holtzman, Ari and
+     Bisk, Yonatan and
+     Farhadi, Ali and
+     Choi, Yejin",
+   editor = "Korhonen, Anna and
+     Traum, David and
+     M{\`a}rquez, Llu{\'\i}s",
+   booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
+   month = jul,
+   year = "2019",
+   address = "Florence, Italy",
+   publisher = "Association for Computational Linguistics",
+   url = "https://aclanthology.org/P19-1472",
+   doi = "10.18653/v1/P19-1472",
+   pages = "4791--4800",
+   abstract = "Recent work by Zellers et al. (2018) introduced a new task of commonsense natural language inference: given an event description such as {``}A woman sits at a piano,{''} a machine must select the most likely followup: {``}She sets her fingers on the keys.{''} With the introduction of BERT, near human-level performance was reached. Does this mean that machines can perform human level commonsense inference? In this paper, we show that commonsense inference still proves difficult for even state-of-the-art models, by presenting HellaSwag, a new challenge dataset. Though its questions are trivial for humans ({\textgreater}95{\%} accuracy), state-of-the-art models struggle ({\textless}48{\%}). We achieve this via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. AF proves to be surprisingly robust. The key insight is to scale up the length and complexity of the dataset examples towards a critical {`}Goldilocks{'} zone wherein generated text is ridiculous to humans, yet often misclassified by state-of-the-art models. Our construction of HellaSwag, and its resulting difficulty, sheds light on the inner workings of deep pretrained models. More broadly, it suggests a new path forward for NLP research, in which benchmarks co-evolve with the evolving state-of-the-art in an adversarial way, so as to present ever-harder challenges.",
+ }
+ @inproceedings{OpenBookQA2018,
+   title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
+   author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal},
+   booktitle={EMNLP},
+   year={2018}
+ }
+ @misc{bisk2019piqa,
+   title={PIQA: Reasoning about Physical Commonsense in Natural Language},
+   author={Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
+   year={2019},
+   eprint={1911.11641},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{sap2019socialiqa,
+   title={SocialIQA: Commonsense Reasoning about Social Interactions},
+   author={Maarten Sap and Hannah Rashkin and Derek Chen and Ronan LeBras and Yejin Choi},
+   year={2019},
+   eprint={1904.09728},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{sakaguchi2019winogrande,
+   title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
+   author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+   year={2019},
+   eprint={1907.10641},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ @misc{clark2018think,
+   title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+   author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+   year={2018},
+   eprint={1803.05457},
+   archivePrefix={arXiv},
+   primaryClass={cs.AI}
+ }
+ @misc{hendrycks2021measuring,
+   title={Measuring Massive Multitask Language Understanding},
+   author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+   year={2021},
+   eprint={2009.03300},
+   archivePrefix={arXiv},
+   primaryClass={cs.CY}
+ }
+ @misc{mitchell2023measuring,
+   title={Measuring Data},
+   author={Margaret Mitchell and Alexandra Sasha Luccioni and Nathan Lambert and Marissa Gerchick and Angelina McMillan-Major and Ezinwanne Ozoani and Nazneen Rajani and Tristan Thrush and Yacine Jernite and Douwe Kiela},
+   year={2023},
+   eprint={2212.05129},
+   archivePrefix={arXiv},
+   primaryClass={cs.AI}
+ }
app/src/colors.mjs ADDED
@@ -0,0 +1,8 @@
+ export const COLORS = [
+   "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
+   "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
+ ];
+
+ export function getColor(index) {
+   return COLORS[index % COLORS.length];
+ }
app/src/distill.js ADDED
The diff for this file is too large to render.
app/src/fine_tasks.js ADDED
@@ -0,0 +1,245 @@
+ import Papa from 'papaparse';
+ import { DataTable } from 'simple-datatables';
+
+ const languageMap = {
+   'Arabic': 'ar',
+   'Turkish': 'tr',
+   'Swahili': 'sw',
+   'Russian': 'ru',
+   'Telugu': 'te',
+   'Thai': 'th',
+   'Chinese': 'zh',
+   'French': 'fr',
+   'Hindi': 'hi',
+ };
+
+ const metricTypes = [
+   { value: 'max_score', label: 'Max Score' },
+   { value: 'avg_snr', label: 'Low Noise' },
+   { value: 'avg_spearman', label: 'Monotonicity' },
+   { value: 'max_n_std', label: 'Non-Randomness' },
+   { value: 'avg_kendall_tau_a', label: 'Ordering Consistency' }
+ ];
+
+ const tableTypes = [
+   { value: 'gen', label: 'Generative' },
+   { value: 'mc', label: 'Multichoice' }
+ ];
+
+ const taskFolders = [
+   { value: 'selected', label: 'FineTasks' },
+   { value: 'non_selected', label: 'Non-Selected' }
+ ];
+
+ function createDropdown(options, onChange) {
+   const select = document.createElement('select');
+   options.forEach(option => {
+     const optionElement = document.createElement('option');
+     if (typeof option === 'object' && option.value && option.label) {
+       optionElement.value = option.value;
+       optionElement.textContent = option.label;
+     } else {
+       optionElement.value = option;
+       optionElement.textContent = option;
+     }
+     select.appendChild(optionElement);
+   });
+   select.addEventListener('change', onChange);
+   return select;
+ }
+
+ function createPerTaskResultsTable(data, tableType, metric) {
+   const tableWrapper = document.createElement('div');
+   tableWrapper.className = 'table-wrapper fine-tasks-table-wrapper';
+
+   const table = document.createElement('table');
+   table.className = 'results-table fine-tasks-results-table';
+
+   const columns = ['Task', 'Type', ...(tableType === 'gen' ? ['f1', 'prefix_match'] : ['acc', 'acc_norm', 'acc_norm_token', 'acc_norm_pmi'])];
+
+   // Display names for metric columns (columns not listed keep their raw name).
+   const columnNameMap = {
+     'acc_norm': 'acc_char',
+     'acc_norm_token': 'acc_token',
+     'acc_norm_pmi': 'acc_pmi',
+     'prefix_match': 'prefix'
+   };
+
+   // Short labels appended to metric column headers for the selected criteria.
+   const taskMetricMap = {
+     'max_score': 'score',
+     'avg_snr': 'snr',
+     'avg_spearman': 'monotonicity',
+     'max_n_std': 'non-randomness',
+     'avg_kendall_tau_a': 'ordering'
+   };
+
+   const header = table.createTHead();
+   const headerRow = header.insertRow();
+   columns.forEach(column => {
+     const th = document.createElement('th');
+     th.textContent = columnNameMap[column] || column;
+
+     if (th.textContent !== "Task" && th.textContent !== "Type") {
+       th.textContent += " " + (taskMetricMap[metric] || metric);
+     }
+     th.title = th.textContent;
+     if (column === 'Type')
+       th.style.width = '40px';
+     headerRow.appendChild(th);
+   });
+
+   const body = table.createTBody();
+   data.forEach(row => {
+     // Skip fully empty rows (papaparse can emit one for a trailing newline).
+     if (Object.values(row).every(value => value === '' || value === undefined || value === null)) {
+       return;
+     }
+
+     const tr = body.insertRow();
+     columns.forEach(column => {
+       const td = tr.insertCell();
+       let value = row[column];
+       if (column === 'Task') {
+         const fullTaskName = value; // Store the full task name
+         const parts = value.split('|');
+         value = parts.length > 1 ? parts[1] : value;
+         value = value.split('_mcf')[0].split('_cf')[0];
+         td.title = fullTaskName; // Set the title attribute to show the full name on hover
+       } else if (column === 'Type') {
+         // Keep the task type as is
+       } else if (typeof value === 'number') {
+         value = value.toFixed(2);
+       } else if (value && !isNaN(parseFloat(value))) {
+         value = parseFloat(value).toFixed(2);
+       } else {
+         value = '';
+       }
+       td.textContent = value;
+     });
+   });
+
+   tableWrapper.appendChild(table);
+   return tableWrapper;
+ }
+
+ export function initFineTasks(containerId) {
+   const container = document.getElementById(containerId);
+   if (!container) return;
+
+   const perTaskTitleElement = document.createElement('h3');
+   perTaskTitleElement.textContent = 'Task Results';
+   perTaskTitleElement.className = 'fine-tasks-title';
+
+   const perTaskTableContainer = document.createElement('div');
+   perTaskTableContainer.className = 'table-container';
+
+   let perTaskDataTable;
+
+   function updatePerTaskResults() {
+     const language = languageDropdownPerTask.value;
+     const metric = metricDropdownPerTask.value;
+     const tableType = tableTypeDropdownPerTask.value;
+     const taskFolder = taskFolderDropdownPerTask.value;
+
+     const languageCode = languageMap[language];
+
+     if (!languageCode) {
+       console.error(`Language code not found for ${language}`);
+       perTaskTableContainer.innerHTML = `<p>Error: Language code not found for ${language}</p>`;
+       return;
+     }
+
+     let url = `data/tasks/${taskFolder}/${languageCode}/${metric}/${tableType}_stats.csv`;
+
+     fetch(url)
+       .then(response => {
+         if (!response.ok) {
+           throw new Error(`HTTP error! status: ${response.status}`);
+         }
+         return response.text();
+       })
+       .then(csvText => {
+         const results = Papa.parse(csvText, { header: true }).data;
+         perTaskTableContainer.innerHTML = '';
+         const tableWrapper = createPerTaskResultsTable(results, tableType, metric);
+         perTaskTableContainer.appendChild(tableWrapper);
+
+         if (perTaskDataTable) {
+           perTaskDataTable.destroy();
+         }
+
+         perTaskDataTable = new DataTable('.fine-tasks-results-table', {
+           perPage: 10,
+           perPageSelect: false,
+           searchable: false,
+           sortable: true,
+           fixedHeight: true,
+           labels: {
+             info: '' // This removes the "Showing 1 to X of Y entries" text
+           }
+         });
+
+       })
+       .catch(error => {
+         console.error('Error fetching CSV:', error);
+         perTaskTableContainer.innerHTML = `<p>Error loading data: ${error.message}</p>`;
+       });
+   }
+
+   const perTaskControls = document.createElement('div');
+   perTaskControls.className = 'controls fine-tasks-controls';
+
+   // Task folder control group
+   const taskFolderControlGroup = document.createElement('div');
+   taskFolderControlGroup.className = 'control-group';
+   const taskFolderLabelPerTask = document.createElement('label');
+   taskFolderLabelPerTask.textContent = 'Task Set: ';
+   const taskFolderDropdownPerTask = createDropdown(taskFolders, updatePerTaskResults);
+   taskFolderDropdownPerTask.value = 'selected'; // Set default to FineTasks
+   taskFolderControlGroup.appendChild(taskFolderLabelPerTask);
+   taskFolderControlGroup.appendChild(taskFolderDropdownPerTask);
+
+   // Language control group
+   const languageControlGroup = document.createElement('div');
+   languageControlGroup.className = 'control-group';
+   const languageLabelPerTask = document.createElement('label');
+   languageLabelPerTask.textContent = 'Language: ';
+   const languageDropdownPerTask = createDropdown(Object.keys(languageMap), updatePerTaskResults);
+   languageControlGroup.appendChild(languageLabelPerTask);
+   languageControlGroup.appendChild(languageDropdownPerTask);
+
+   // Table type control group
+   const tableTypeControlGroup = document.createElement('div');
+   tableTypeControlGroup.className = 'control-group';
+   const tableTypeLabelPerTask = document.createElement('label');
+   tableTypeLabelPerTask.textContent = 'Type: ';
+   const tableTypeDropdownPerTask = createDropdown(tableTypes, updatePerTaskResults);
+   tableTypeControlGroup.appendChild(tableTypeLabelPerTask);
+   tableTypeControlGroup.appendChild(tableTypeDropdownPerTask);
+
+   // Metric control group
+   const metricControlGroup = document.createElement('div');
+   metricControlGroup.className = 'control-group';
+   const metricLabelPerTask = document.createElement('label');
+   metricLabelPerTask.textContent = 'Criteria: ';
+   const metricDropdownPerTask = createDropdown(metricTypes, updatePerTaskResults);
+   metricDropdownPerTask.value = 'max_score'; // Set default to Max Score
+   metricControlGroup.appendChild(metricLabelPerTask);
+   metricControlGroup.appendChild(metricDropdownPerTask);
+
+   perTaskControls.appendChild(taskFolderControlGroup);
+   perTaskControls.appendChild(languageControlGroup);
+   perTaskControls.appendChild(tableTypeControlGroup);
+   perTaskControls.appendChild(metricControlGroup);
+
+   container.appendChild(perTaskControls);
+   // container.appendChild(perTaskTitleElement);
+   container.appendChild(perTaskTableContainer);
+
+   // Initialize with default values
+   updatePerTaskResults();
+ }
app/src/index.html ADDED
@@ -0,0 +1,388 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
+ <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+ <meta charset="utf-8">
+ <base target="_blank">
+ <title>Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks</title>
+ <link rel="stylesheet" href="style.css">
+ </head>
+
+ <body>
+ <d-front-matter>
+ <script id='distill-front-matter' type="text/json">{
+ "title": "📝 Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks",
+ "description": "This blog post covers a discussion of multilingual evaluation and task signal, the process of selecting existing evaluation tasks based on signal (resulting in FineTasks), and a comparison of open and closed-source models on FineTasks.",
+ "published": "Oct 23, 2024",
+ "affiliation": {"name": "HuggingFace"},
+ "authors": [
+ {
+ "author":"Hynek Kydlíček",
+ "authorURL":"https://huggingface.co/hynky"
+ },
+ {
+ "author":"Guilherme Penedo",
+ "authorURL":"https://huggingface.co/guipenedo"
+ },
+ {
+ "author":"Clémentine Fourrier",
+ "authorURL":"https://huggingface.co/clefourrier"
+ },
+ {
+ "author":"Nathan Habib",
+ "authorURL":"https://huggingface.co/SaylorTwift"
+ },
+ {
+ "author":"Thomas Wolf",
+ "authorURL":"https://huggingface.co/thomwolf"
+ }
+ ]
+ }</script>
+ </d-front-matter>
+
+ <d-title>
+ <h1 class="l-page" style="text-align: center; display: none;">📝 Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks</h1>
+ <div id="title-plot" class="main-plot-container l-page">
+ <figure>
+ <img src="assets/images/banner.png" alt="FineTasks">
+ </figure>
+ </div>
+ </d-title>
+ <d-byline></d-byline>
+ <d-article>
+ <d-contents>
+ </d-contents>
+
+ <p>Following the strong community reception of our FineWeb English dataset<d-cite key="penedo2024finewebdatasetsdecantingweb"></d-cite>, we have been hard at work on a <b>multilingual version</b>, which will cover 1000+ languages (that we hope to release <em>soon</em>!).</p>
+
+ <p>However, we quickly encountered a significant challenge: how can one effectively evaluate models across different languages during training?</p>
+
+ <p>For English, it's straightforward: we can use well-established benchmarks like <b>MMLU</b><d-cite key="hendryckstest2021"></d-cite> or <b>HellaSwag</b><d-cite key="zellers2019hellaswag"></d-cite>, widely used by most labs and implemented in all the major evaluation frameworks. Unfortunately, non-English tasks are often scarce, lack broader community validation, and, when available, are frequently of questionable quality: many are machine-translated and may even include English words in their formulations. Additionally, they are often unsuitable for early pre-training evaluation due to suboptimal task formulations and/or excessive difficulty, resulting in random-level scores.</p>
+
+ <p>To address these challenges, we developed a <b>scalable and data-driven framework</b> for evaluation task selection, which allows anyone to choose strong model evaluations for their language from existing tasks! We then applied this framework to a set of <b>9 diverse languages</b>, resulting in the creation of <b>FineTasks</b>, a comprehensive and diverse multilingual evaluation suite.</p>
+
+ <p>In this blog post, we discuss:</p>
+ <ol>
+ <li>Our <b>data-driven process</b> to create a multilingual evaluation suite: <b>FineTasks</b></li>
+ <li>Results of evaluating <b>35 major open and closed-source models</b> on FineTasks</li>
+ <li>A guide for extending FineTasks to your <b>target language</b></li>
+ </ol>
+
+
+ <h2>What Makes a Task "Fine"?</h2>
+ <p>Covering all 7000+ languages spoken around the world would be a monumental endeavor, so we settled on using <b>9 languages</b> that offered diversity in script, language family, and resource availability: <b>Chinese, French, Arabic, Russian, Thai, Hindi, Turkish, Swahili, and Telugu</b>.</p>
+ <p>For these languages, we collected all available tasks that we could find, implementing a total of <b>185 tasks across languages</b> in <a href="https://github.com/huggingface/lighteval">LightEval</a>, HuggingFace's model evaluation library.</p>
+
+ <p>Then, we began task selection with two primary goals: ensuring <b>evaluation diversity</b>, and making sure each task provided a <b>reliable signal</b> during pre-training.</p>
+
+ <p>For evaluation diversity, we aimed to assess a broad range of model capabilities, including:</p>
+ <ul>
+ <li><b>Reading comprehension (RC)</b>: Understanding provided context and answering questions based on it.</li>
+ <li><b>General knowledge (GK)</b>: Answering questions about facts from various fields without added context.</li>
+ <li><b>Natural Language Understanding (NLU)</b>: Comprehending the semantics of provided input.</li>
+ <li><b>Common-sense reasoning (RES)</b>: Demonstrating the ability to perform simple reasoning requiring embodied knowledge.</li>
+ <li><b>Generative tasks</b>: Ability to generate text in the target language without the "help" of multiple choice options.</li>
+ </ul>
+
+ <p>We consider that a task provides a reliable signal if it yields a dependable score. This means the score should be above the random baseline, increase as training progresses, show low variability across different seeds, and provide a consistent model ranking at each training step<d-footnote>For similarly sized models trained with the same hyperparameters on the same amount of data.</d-footnote>.</p>
+
+ <h3>Finding how much signal our tasks give during pre-training</h3>
+ <p>To thoroughly examine the signal our tasks provide, we trained many 1.5B parameter models for each language, using 30B tokens from subsets of the supported languages of the five largest openly available multilingual web datasets. These models were trained with the same hyperparameters and tokenizer. We then evaluated them at regular checkpoint intervals on the collected tasks (with no instruction and no system prompt, in a 0-shot setting).</p>
+
+ <p>This process required multiple evaluation runs for each task due to iterations on its implementation, resulting in a total of <b>73,000 GPU hours consumed</b> 🔥!</p>
+
+ <p>With <b>49 models trained</b>, we could finally define what a <b>reliable signal</b> means to us!</p>
+
+ <h4>Monotonicity</h4>
+ <p>One of our core requirements for a task is that it can be learned from training data and that this <b>learning can be gradually observed as training progresses</b>. Without this improvement over time, it's uncertain whether there will ever be an improvement in the future.</p>
+
+ <p>To measure this, we used the <b>Spearman rank correlation</b> to quantify the correlation between training steps and score. Spearman rank correlation can capture monotonicity even when scores don't evolve linearly with the number of steps. We required each task to have an average correlation of at least 0.5 over all model training runs.</p>
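As a rough illustration of this criterion, here is a minimal Python sketch with made-up checkpoint scores (the analysis code itself is not part of this commit):

```python
# Monotonicity check: correlate checkpoint index with task score for each run,
# then average; the task is kept if the average is at least 0.5.
from scipy.stats import spearmanr

steps = [1, 2, 3, 4, 5]  # checkpoints, roughly every 1B tokens
runs = [                 # hypothetical scores for three training runs
    [0.31, 0.33, 0.36, 0.38, 0.41],
    [0.30, 0.34, 0.33, 0.37, 0.40],
    [0.32, 0.32, 0.35, 0.39, 0.42],
]

correlations = [spearmanr(steps, scores).correlation for scores in runs]
avg_spearman = sum(correlations) / len(correlations)
print(f"avg Spearman: {avg_spearman:.2f}, keep task: {avg_spearman >= 0.5}")
```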
+
+
+ <div style="display: flex; grid-column: middle">
+ <div class="task-signal-plot" data-language="French" data-task="mlmm_hellaswag_fra_cf" data-show-controls="false" data-task-metrics="monotonicity" data-metric="acc_norm_token" data-group-seeds="true" data-title="✅ Good monotonicity: mlmm_hellaswag_fra_cf [fr]"></div>
+ <div class="task-signal-plot" data-language="Arabic" data-task="mlmm_truthfulqa_ara_cf:mc1" data-show-controls="false" data-task-metrics="monotonicity" data-metric="acc_norm_token" data-group-seeds="true" data-title="❌ Bad monotonicity: mlmm_truthfulqa_ara_cf:mc1 [ar]"></div>
+ </div>
+
+ <h4>Low noise</h4>
+ <p>When comparing model performance on tasks, we need to consider whether differences are due to <b>evaluation noise or genuine performance variations</b>.</p>
+
+ <p>Noise can arise from the stochastic processes involved in model training, such as random token sampling, data shuffling, or model initialization.<d-cite key="madaan2024quantifyingvarianceevaluationbenchmarks"></d-cite> To measure how sensitive each task is to this noise, we trained four additional models on our own monolingual corpora (unfiltered CommonCrawl data in each language) using different seeds.</p>
+
+ <p>For each task, we computed:</p>
+ <ol>
+ <li>First, the <b>standard deviation</b> of model scores at every step (approximately every 1B tokens), which we call the <b>per-step-std</b>.</li>
+ <li>Then, to obtain a global variability measurement, we averaged all the per-step-std values to get the <b>avg-std</b> over the full training. We assume this value is an upper bound across model architectures and training datasets (as it was approximated by models trained on a "dirtier" dataset, therefore with higher variability).</li>
+ <li>Finally, we computed the <b>signal-to-noise ratio</b> (SNR) as the main metric for task variability, calculated as the mean score at 30B tokens across all runs divided by the avg-std; see the sketch below. This metric measures how significant the overall score is relative to the score variations (noise).</li>
+ </ol>
+
+ <p>We aimed for each task to have an SNR > 20. The only exception to this rule is generative tasks, which typically have a relatively low SNR but are still worth including, as they provide insights into how the model behaves when prompted to generate unconstrained (without answer options). In a multilingual setting this is particularly relevant, as some models trained on multiple languages can exhibit high task scores but then suddenly reply in the wrong language on generative tasks!</p>
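A minimal sketch of this computation, reconstructed from the description above with hypothetical numbers:

```python
# SNR check: per-step std across seed runs, averaged over training, then the
# mean final score divided by that average; tasks should reach SNR > 20.
import numpy as np

# rows = 4 seed runs, columns = checkpoints (~1B tokens apart), hypothetical
scores = np.array([
    [0.30, 0.33, 0.35, 0.38, 0.40],
    [0.31, 0.32, 0.36, 0.37, 0.41],
    [0.29, 0.34, 0.34, 0.39, 0.40],
    [0.30, 0.33, 0.36, 0.38, 0.42],
])

per_step_std = scores.std(axis=0)  # std across seeds at each checkpoint
avg_std = per_step_std.mean()      # global variability over the full training
final_mean = scores[:, -1].mean()  # mean score at 30B tokens across runs
snr = final_mean / avg_std
print(f"SNR: {snr:.1f}, keep task: {snr > 20}")
```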
+
+ <div style="display: flex; grid-column: middle">
+ <div class="task-signal-plot" data-language="Telugu" data-task="xstory_cloze_tel_cf" data-show-controls="false" data-task-metrics="snr" data-metric="acc_norm_token" data-group-seeds="false" data-title="✅ Good SNR: xstory_cloze_tel_cf [te]"></div>
+ <div class="task-signal-plot" data-language="Telugu" data-task="tydiqa_tel" data-show-controls="false" data-task-metrics="snr" data-metric="acc_norm_token" data-group-seeds="false" data-title="❌ Bad SNR: tydiqa_tel [te]"></div>
+ </div>
+
+ <aside>Assuming model performance is normally distributed across different seeds, we want the benchmark-run performance to be at least 3 final-stds above the benchmark's random baseline. This would mean that 99.85% of seed scores are above the random baseline (formally, benchmark-run performance - benchmark random baseline > 3 * final-std).</aside>
+ <h4>Non-Random Performance</h4>
+ <p>Many model capabilities are acquired later in training, thus <b>many tasks</b> (especially harder ones, such as math-related ones) <b>show baseline-level performance for an extended period</b>. While these tasks are useful, they're not ideal for early pre-training evaluation, and <b>we did not want to keep them</b> for this setting.</p>
+
+ <p>We first computed the baseline random performance of each task (as the sum of 1/n_choices over all samples for multiple-choice questions, and as zero for generative evaluations). Then we calculated the task's distance from the baseline as the maximum score across all models minus the baseline.</p>
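A small sketch tying this together with the 3-standard-deviation rule from the aside (numbers are hypothetical, and we read the per-sample 1/n_choices baseline as an average so it lives on the same [0, 1] scale as accuracy):

```python
# Non-randomness check: distance of the best model score from the random
# baseline, expressed in final-score standard deviations (cf. max_n_std).
n_choices = [4, 4, 2, 4]  # options per multiple-choice sample (hypothetical)
baseline = sum(1 / n for n in n_choices) / len(n_choices)  # random-guess accuracy

max_score = 0.41   # best score reached across all trained models
final_std = 0.01   # std of final scores across seed runs

n_std = (max_score - baseline) / final_std
print(f"{n_std:.1f} stds above baseline, non-random: {n_std > 3}")
```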
+
+
+ <div style="display: flex; grid-column: middle">
+ <div class="task-signal-plot" data-language="Chinese" data-task="agieval_zho_cf:_average" data-show-controls="false" data-task-metrics="randomness" data-metric="acc_norm_pmi" data-group-seeds="true" data-title="✅ Non-random: agieval_zho_cf/acc_pmi [zh]"></div>
+ <div class="task-signal-plot" data-language="Chinese" data-task="agieval_zho_cf:_average" data-show-controls="false" data-task-metrics="randomness" data-metric="acc" data-group-seeds="true" data-title="❌ Random perf: agieval_zho_cf/acc [zh]"></div>
+ </div>
+
+ <h4>Model Ordering Consistency</h4>
+ <p>Let's not forget that the main goal of these evaluations is to compare models and datasets!</p>
+
+ <p>In the future, we want to use these evaluations to select the best datasets for full model pretraining. This means <b>our tasks should rank datasets trained using very few tokens (we typically run data ablations on 30B tokens) in the same order as they would when trained for longer, after significantly more steps.</b></p>
+
+ <p>In other words, we would like tasks to have <b>predictive capability regarding future performance during pre-training</b>: if pre-training dataset A outperforms pre-training dataset B at 30 billion tokens, we would like this trend to continue at 300 billion tokens.</p>
+
+ <p>Proving this is inherently impossible, but there is a necessary preliminary condition that we can test for: for the results to be consistent at large scales, they must first show consistency at smaller scales!</p>
+
+ <p>To measure this consistency in task ordering, we computed the average <b>Kendall's Tau</b> of model rankings between every two consecutive steps. We only considered steps after the first 15B tokens of pre-training, as we found orderings before that point incredibly noisy. A high value of this metric indicates that the ordering remains consistent as training progresses.</p>
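A sketch of this measurement, using the tau-a variant to match the avg_kendall_tau_a column in the result tables (scores are made up):

```python
# Ordering consistency: average Kendall's tau-a between model scores at
# consecutive checkpoints (only checkpoints after 15B tokens are used).
from itertools import combinations

def kendall_tau_a(x, y):
    """(concordant - discordant) pairs / total pairs, no tie correction."""
    pairs = list(combinations(range(len(x)), 2))
    concordant = sum((x[i] - x[j]) * (y[i] - y[j]) > 0 for i, j in pairs)
    discordant = sum((x[i] - x[j]) * (y[i] - y[j]) < 0 for i, j in pairs)
    return (concordant - discordant) / len(pairs)

# scores_by_step[t][m]: score of model m at checkpoint t (hypothetical)
scores_by_step = [
    [0.35, 0.32, 0.40, 0.28, 0.37],
    [0.36, 0.33, 0.41, 0.30, 0.38],
    [0.38, 0.33, 0.42, 0.31, 0.40],
]

taus = [kendall_tau_a(a, b) for a, b in zip(scores_by_step, scores_by_step[1:])]
print(f"avg Kendall tau-a: {sum(taus) / len(taus):.2f}")
```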
+
+ <aside>We had no strict minimum value requirement for this property, instead using it to establish comparisons between tasks.</aside>
+
+ <div style="display: flex; grid-column: middle">
+ <div class="task-signal-plot" data-language="Arabic" data-task="xcsqa_ara_cf" data-show-controls="false" data-task-metrics="ordering" data-metric="acc_norm_token" data-group-seeds="true" data-title="✅ Good ordering: xcsqa_ara_cf [ar]"></div>
+ <div class="task-signal-plot" data-language="Thai" data-task="thai_exams_tha_cf:_average" data-show-controls="false" data-task-metrics="ordering" data-metric="acc_norm_token" data-group-seeds="true" data-title="❌ Bad ordering: thai_exams_tha_cf [th]"></div>
+ </div>
+
+
+ <h2>Important properties of evaluation impacting stability</h2>
+ <p>Now that we have covered what we were looking for in our tasks, let's examine two important aspects that can affect the above properties: task formulations and metric choice.</p>
+
+ <aside>Both of these aspects are thoroughly described and studied in the brilliant OLMES paper<d-cite key="gu2024olmesstandardlanguagemodel"></d-cite>, which greatly inspired our work.</aside>
+
+ <h3>Task Formulations</h3>
+ <p>The way tasks are presented to the model is crucial, particularly for multiple-choice (MC) tasks. In these scenarios, we must carefully determine how the choices are displayed and what the model is expected to predict.</p>
+
+ <p>There are two common approaches: <b>Cloze Formulation</b> (CF) and <b>Multi-Choice Formulation</b> (MCF). In CF, choices are not provided in context, allowing the model to predict each option directly. In contrast, MCF presents the choices in the prompt using A/B/C/D prefixes, with the targets being those letter prefixes.</p>
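Schematically, on a toy sample (the prompt templates below are illustrative; the actual templates live in the LightEval task implementations):

```python
# CF vs. MCF prompt construction for one hypothetical multiple-choice sample.
question = "What is the capital of France?"
choices = ["Berlin", "Paris", "Madrid", "Rome"]

# Cloze Formulation (CF): choices are NOT shown in the prompt;
# the model scores each answer string as a continuation.
cf_prompt = f"Question: {question}\nAnswer:"
cf_targets = [f" {c}" for c in choices]  # compare (normalized) log-probs

# Multi-Choice Formulation (MCF): choices ARE shown with letter prefixes;
# the targets are just the letters.
letters = ["A", "B", "C", "D"]
options = "\n".join(f"{l}. {c}" for l, c in zip(letters, choices))
mcf_prompt = f"Question: {question}\n{options}\nAnswer:"
mcf_targets = [f" {l}" for l in letters]
```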
+
+ <!-- side-by-side comparison of MCF vs. CF on a specific task -->
+
+ <p>It's important to know that:</p>
+ <ul>
+ <li>The choice of formulation significantly impacts task scores<d-cite key="open-llm-leaderboard-v2"></d-cite>.</li>
+ <li>Both formulations <b>behave very differently during training</b>. As noted by both OLMES<d-cite key="gu2024olmesstandardlanguagemodel"></d-cite> and DataComp-LM<d-cite key="li2024datacomplmsearchgenerationtraining"></d-cite>, when employing MCF, task scores initially show random performance over extended training periods before experiencing a sudden increase. Conversely, with CF, task scores improve right from the beginning but tend to plateau relatively early.</li>
+ </ul>
+
+ <p>Therefore, we decided to use CF for task selection and MCF for the later evaluation of major open-source models, as those models have generally undergone enough training for MCF evaluations to show signal.</p>
+
+ <h3>Metrics</h3>
+ <p>As the targets in the CF of multiple-choice tasks are the choices themselves, each target can have a different number of tokens and characters, and a different unconditional probability (the probability of generating the choice without a context prefix).</p>
+ <aside>Measuring accuracy without normalization would have models prefer answers with fewer tokens, for example.</aside>
+
+ <p>To account for this, we consider the following accuracy variations:</p>
+ <ul>
+ <li><b>Accuracy</b>: <br>
+ <code>acc</code> = <d-math>\underset{i}{\arg\max} \, \ln P(a_i \mid q)</d-math></li>
+ <li><b>Accuracy normalized over character length</b>: <br>
+ <code>acc_char</code> = <d-math>\underset{i}{\arg\max} \, \frac{\ln P(a_i \mid q)}{num\_characters(a_i)}</d-math></li>
+ <li><b>Accuracy normalized over token length</b>: <br>
+ <code>acc_token</code> = <d-math>\underset{i}{\arg\max} \, \frac{\ln P(a_i \mid q)}{num\_tokens(a_i)}</d-math></li>
+ <li><b>PMI Accuracy</b>: <br>
+ <code>acc_pmi</code> = <d-math>\underset{i}{\arg\max} \, \ln \frac{P(a_i \mid q)}{P(a_i \mid u)}</d-math>, where <d-math>u = </d-math>"Answer:"</li>
+ </ul>
+
+ <p>Where <d-math>a_i</d-math> is answer choice <d-math>i</d-math>, <d-math>q</d-math> is the question prompt, and <d-math>P(a_i \mid q)</d-math> is the probability of <d-math>a_i</d-math> following <d-math>q</d-math>. For more details, see <d-cite key="gu2024olmesstandardlanguagemodel"></d-cite> and <d-cite key="biderman2024lessonstrenchesreproducibleevaluation"></d-cite>.</p>
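For one sample, given per-choice log-probabilities, the four variants reduce to different argmax rules. A sketch with hypothetical numbers (see the LightEval metrics for the real implementation):

```python
# The four accuracy variants as argmax rules over per-choice log-probs.
import numpy as np

choices = [" Berlin", " Paris", " Madrid", " Rome"]
logp_cond = np.array([-9.1, -7.8, -9.4, -8.9])    # ln P(a_i | q)
logp_uncond = np.array([-8.0, -6.5, -8.6, -8.1])  # ln P(a_i | u), u = "Answer:"
n_tokens = np.array([2, 2, 3, 2])                 # token lengths of the choices
n_chars = np.array([len(c) for c in choices])     # character lengths

pred = {
    "acc": np.argmax(logp_cond),
    "acc_char": np.argmax(logp_cond / n_chars),
    "acc_token": np.argmax(logp_cond / n_tokens),
    "acc_pmi": np.argmax(logp_cond - logp_uncond),
}
gold = 1  # index of the correct choice
print({k: int(v == gold) for k, v in pred.items()})  # per-sample 0/1 accuracies
```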
196
+
197
+ <aside><code>acc_pmi</code> metric measures how much more likely a model is to predict A_i if provided with question context compared to if there was no context at all. This can be useful if the correct choice contains generally unlikely tokens, making the model less likely to choose such an answer.</aside>
198
+
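+ <p>To make the four variants concrete, here is a minimal sketch of the selection step, assuming the per-choice log-probabilities have already been computed by the evaluation harness (the function and argument names are ours for illustration, not lighteval's API):</p>
+ <pre><code>def pick_answer(choices, logprobs, uncond_logprobs=None, norm=None):
+     """Return the argmax index over the answer choices.
+ 
+     choices:         list of (answer_text, num_tokens) pairs
+     logprobs:        ln P(a_i | q) for each choice
+     uncond_logprobs: ln P(a_i | u) for each choice (PMI only)
+     norm:            None ("acc"), "char", "token", or "pmi"
+     """
+     if norm == "char":    # acc_char: normalize by character length
+         scores = [lp / len(text) for (text, _), lp in zip(choices, logprobs)]
+     elif norm == "token": # acc_token: normalize by token length
+         scores = [lp / n_tok for (_, n_tok), lp in zip(choices, logprobs)]
+     elif norm == "pmi":   # acc_pmi: ln(P(a|q) / P(a|u))
+         scores = [lp - ulp for lp, ulp in zip(logprobs, uncond_logprobs)]
+     else:                 # plain acc
+         scores = list(logprobs)
+     return max(range(len(scores)), key=scores.__getitem__)
+ </code></pre>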
199
+ <p>For our generative tasks on the other hand, we used the following metrics:</p>
200
+ <ul>
201
+ <li><code>prefix_match</code>: Exact match where only the prefix of the answer must match</li>
202
+ <li><code>f1</code>: F1 score computed over predicted/gold words extracted using a word tokenizer</li>
203
+ </ul>
204
+
205
+ <p>For both generative metrics, minor preprocessing is applied to remove articles and punctuation, and lowercase the text.</p>
206
+
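+ <p>As an illustration, a minimal sketch of both generative metrics with this preprocessing is shown below. The article list is English-centric for brevity, and we take <code>prefix_match</code> to mean "the generation must start with the gold answer"; treat this as a sketch of the idea rather than the exact lighteval implementation:</p>
+ <pre><code>import string
+ 
+ def normalize(text):
+     # Lowercase, strip punctuation, drop articles, split into words.
+     text = "".join(ch for ch in text.lower() if ch not in string.punctuation)
+     return [w for w in text.split() if w not in {"a", "an", "the"}]
+ 
+ def word_f1(prediction, gold):
+     pred, ref = normalize(prediction), normalize(gold)
+     common = sum(min(pred.count(w), ref.count(w)) for w in set(pred))
+     if common == 0:
+         return 0.0
+     precision, recall = common / len(pred), common / len(ref)
+     return 2 * precision * recall / (precision + recall)
+ 
+ def prefix_match(prediction, gold):
+     pred, ref = " ".join(normalize(prediction)), " ".join(normalize(gold))
+     return float(pred.startswith(ref))
+ </code></pre>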
207
+ <h2>The Fine selection</h2>
208
+ <p>With our goals and evaluation setup properly defined, we proceeded with <b>task selection</b>!</p>
209
+
210
+ <p>We reviewed tasks one by one, choosing based on the quantified properties. For each language, we aimed to have at least one task for each of the four categories outlined above. Additionally, we wanted at least one generative task for each language.</p>
211
+
212
+ <p>In cases where multiple versions of a task existed (e.g., MMLU with different translation methods or native versions), we <b>prioritized native versions</b> as long as their metrics were reasonable, followed by human translations of English tasks. If no such version was available, we made our selection entirely based on metrics.</p>
213
+
214
+ <p>Thus, <b>after removing about half of the tasks</b>, we arrived at <b>96 final ones</b>, forming "FineTasks."</p>
215
+
216
+ <h3>Explore tasks</h3>
217
+ <p>Use the dropdowns below to navigate the list of tasks and see how different metrics affect them.</p>
218
+ <div id="fine-tasks-results"></div>
219
+
220
+ <p>All tasks from the selection <b>comply with the criteria</b> outlined in previous sections, with the only exception being indicqa_tel, which we chose to include to ensure we had at least one generative task for Telugu. Overall, we managed to cover all task categories for each language (the sole gap being Thai Reasoning, where all candidate tasks were unfortunately too noisy, with too low monotonicity, to be considered).</p>
221
+
222
+ <p>One of the <b>biggest surprises</b> was that some tasks, even when translated using the same method, were <b>reliable in one language but not in others</b>. This was evident with xWinograd, which worked quite well for Russian but did not meet our conditions for French. An even more extreme example was XNLI, which performed well for 6 out of 7 languages, failing to satisfy the reliability properties for Chinese. We had to test four different implementations before finding a reliable version, which, interestingly, was the only one that was created by native speakers and not machine translated.</p>
223
+
224
+ <p>Feel free to use the dropdowns below to explore the evolution of scores over training for all tested tasks and metrics.</p>
225
+ <div class="task-signal-plot" data-language="French" data-task="frenchbench_hellaswag_fra_cf" data-show-controls="true" data-metric="acc_norm_token" data-group-seeds="true" data-title=""></div>
226
+
227
+
228
+ <h3>Metrics recommendation</h3>
229
+ <p>Selecting the best evaluation metrics proved to be a <b>challenging task</b>. Not only is there no single metric that consistently outperforms the rest, but we often encountered situations where one metric had better monotonicity while another had a higher signal-to-noise ratio. In such cases, we typically made our decision based on the metric selected for the same task's implementation in a different language. We are aware that such hand-picking is often not possible and thus offer the following recommendations:</p>
230
+
231
+ <h4>Multichoice Tasks</h4>
232
+ <ul>
233
+ <li>We found <b>base accuracy</b> to perform well for tasks with answer options varying subtly (e.g. Yes/No/Also), particularly NLI tasks. In such cases, where each answer option is often a single token, using the base accuracy is advisable.</li>
234
+ <li>While OLMES<d-cite key="gu2024olmesstandardlanguagemodel"></d-cite> recommends using PMI for tasks with unusual words, we found <b>PMI</b> to be highly effective for "difficult" reasoning and knowledge tasks like AGIEVAL or MMLU. In these cases, PMI provided the best results and was often the only metric delivering performance above random. That said, PMI was, on average, the weakest metric across all other tasks, while also being two times more expensive to compute. We therefore only recommend its use for complex reasoning and knowledge tasks.</li>
235
+ <li>The metrics we found to be <b>most reliable overall</b> were length-normalization metrics (token- or character-based). However, the best choice depended on the language, rather than being consistent for a given task. Because of that, we recommend using the maximum of <code>acc_char</code> and <code>acc_token</code> for the most reliable results.<d-footnote>Note that <code>acc_token</code> is heavily tokenizer-dependent. In our ablations, all models were trained using the same tokenizer.</d-footnote></li>
236
+ </ul>
237
+
238
+ <h4>Generative Tasks</h4>
239
+ <p>For <b>generative metrics</b>, the choice is clearer: we suggest using the F1 score unless exact matching is required, as in math-related tasks. F1 is generally less noisy and more resilient to small changes in the generations.</p>
240
+
241
+
242
+ <h2>Open/Closed Source models tackle FineTasks</h2>
243
+ <p>Since we spent a lot of time and compute on task selection, we were interested in how well major <b>open-source</b> models would do on FineTasks. Given that our evaluation suite primarily targets pretrained models, we focused on these, with a few exceptions for models that don't offer a base (pretrained) version. These exceptions were included mainly out of curiosity, and their results should be interpreted with <b>caution</b>. Such models may significantly outperform other models due to the inclusion of supervised fine-tuning (SFT) data.</p>
244
+
245
+ <p>To assess the multilingual performance disparity between open-source and closed-source models, we expanded our selection by adding a closed-source model: <b>gpt-4o-mini</b>.</p>
246
+
247
+ <p>As outlined in the task formulations, we are using MCF for this evaluation and employing a 5-shot approach, as recommended by OLMES<d-cite key="gu2024olmesstandardlanguagemodel"></d-cite> (and made possible by the large context size of the models).</p>
248
+
249
+ <h3>Computing a global "multilingual" score</h3>
250
+ <p>In the previous sections, we treated each task independently. However, to determine an overall "multilingual" score of a model, we need to <b>aggregate</b> the results from these tasks. We begin by <b>rescaling</b> the individual task scores in line with the OpenLLM leaderboard <d-cite key="open-llm-leaderboard-v2"></d-cite>. Then, for each language separately, we <b>average the scores</b> within each task type (GK, RES, etc.), and take the mean of these task-type averages as the language score.<d-footnote>We first average by task type to properly measure all model capabilities without letting a single category dominate.</d-footnote></p>
251
+
252
+ <p>For the final global "multilingual" score we followed a different approach. Instead of averaging the language scores directly, we <b>ranked the model's performance across languages</b> in comparison to other models and then averaged those rank scores. This method ensures that the result reflects the overall model's performance across all languages, preventing an exceptionally high score in one language from skewing the final outcome.</p>
253
+
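+ <p>The aggregation scheme can be sketched in a few lines of Python; this is our illustration of the procedure described above (the rescaling step is omitted and the variable names are ours):</p>
+ <pre><code>from statistics import mean
+ 
+ def language_score(scores_by_type):
+     # scores_by_type: {"GK": [...], "RES": [...], ...} rescaled task scores
+     # for one model and one language. Averaging within each task type first
+     # keeps a single category from dominating.
+     return mean(mean(v) for v in scores_by_type.values())
+ 
+ def multilingual_score(lang_scores):
+     # lang_scores: {language: [score of model 0, score of model 1, ...]}.
+     # Rank models within each language (rank 1 = best), then average ranks,
+     # so one exceptionally high score in one language cannot skew the result.
+     n_models = len(next(iter(lang_scores.values())))
+     rank_sums = [0.0] * n_models
+     for scores in lang_scores.values():
+         order = sorted(range(n_models), key=lambda m: -scores[m])
+         for rank, m in enumerate(order, start=1):
+             rank_sums[m] += rank
+     return [s / len(lang_scores) for s in rank_sums]  # lower is better
+ </code></pre>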
254
+ <h3>FineTasks Results</h3>
255
+ <p>After spending <b>even more compute</b> 🔥 on evaluating the selected models, we gathered the results in the following table. Here are our insights:</p>
256
+
257
+ <div id="leaderboard-results" class="l-middle" data-caption="Chat models are indicated by 💬 while 🟢 indicates a base model.">
258
+ </div>
259
+
260
+ <h4>Qwen family of models takes both top spots!</h4>
261
+ <p>The Qwen models <b>perform exceptionally well</b>, taking both first and second place with their 72B and 32B versions. Their key strength appears to be in handling high- and mid-resource languages (particularly Chinese), where they consistently ranked first. However, they <b>struggled with lower-resource languages</b>, especially Swahili and Telugu, where their performance lagged.</p>
262
+
263
+ <h4>General Knowledge: The curse of monolingual models</h4>
264
+ <p>The most surprising finding from our evaluation is that models explicitly trained to specialize in a <b>narrow set of languages</b> (like Sarvam-2B-v0.5 for Telugu, or Typhoon-v1.5-8B for Thai) tend to <b>perform exceptionally well on generative tasks</b>, while <b>falling short on reasoning</b> and general knowledge (GK) tasks, oftentimes getting close to random performance. We hypothesize two explanations: either these models haven't undergone enough training to understand the MCF format, or the higher exposure of non-specialized models to various languages, especially English, allows them to perform better at such GK/RES tasks. We note that good generative-task performance does reveal a solid understanding of the target language itself.</p>
265
+
266
+ <p>The only <b>exceptions to this rule</b> are typhoon-v1.5-72b and Yi-1.5-34B, both tackling the RES/GK tasks well and managing to rank in the top 4 for their respective languages. We note that typhoon-v1.5-72b is based on Qwen models, and that Yi also included English in its training data.</p>
267
+
268
+ <h4>A lower-resource winner: Gemma-2</h4>
269
+ <p>Although it didn't take first place, Gemma-2 performed really well in the multilingual domain, especially <b>considering its size</b>. It showed consistent results across all the languages we tested, <b>excelling in low-resource languages</b> like Telugu and Swahili. For anyone working with low-resource languages, we highly recommend Gemma-2 as a strong option.</p>
270
+
271
+ <h4>Is there even a gap between open and closed source models?</h4>
272
+ <p>As mentioned in the beginning, comparing closed-source models requires extra caution. These models often undergo extensive supervised fine-tuning (SFT), employ highly optimized prompting techniques, and may even generate multiple responses and select the best one. <b>Despite these advantages, gpt-4o-mini ranks only just above the medium-sized 27B Gemma-2.</b> Based on this evidence, <b>we believe that the gap between open-source and closed-source models is very narrow, if not entirely negligible.</b></p>
273
+
274
+ <h3>Evaluating on FineTasks</h3>
275
+ <p>If you would like to evaluate your models on FineTasks and expand the above table, we made it easy for you. Simply run the following command with your model of choice:</p>
276
+
277
+ <pre><code>lighteval accelerate \
278
+ --model_args vllm,pretrained=model_name,pairwise_tokenization=True \
279
+ --custom_task lighteval.tasks.multilingual.tasks \
280
+ --tasks 'examples/tasks/finetasks/{cf,mcf}/{ara,fra,rus,tur,swa,hin,tel,tha,zho}' \
281
+ --max_samples '1000'</code></pre>
282
+
283
+ <h2>Can we cover all the languages of the world together?</h2>
284
+ <p>FineTasks is <b>just the beginning</b> of our multilingual journey. As a first step in the creation of the <b>future FineWeb multilingual release</b>, we are using this evaluation setup to curate a high-quality pretraining dataset covering a large number of languages. You can expect more news from us soon! We also plan to continue working to make evaluation in non-English domains as seamless as it is in English, and <b>we need your help to achieve that</b>!</p>
285
+
286
+ <p>LightEval now supports <b>over 550 tasks</b> across various non-English languages, making it the evaluation framework with the best multilingual coverage available. However, there's still much more to do. For many languages, no tasks exist yet, despite our ongoing work. This is where we believe <b>the strong Hugging Face community can make a difference</b>.</p>
287
+
288
+ <p>We've made it <a href="https://github.com/huggingface/lighteval/wiki/Contributing-to-multilingual-evaluations"><b>incredibly easy</b> to contribute new tasks</a>, by developing a templating system which supports most of the popular task types, while maintaining authenticity of native language use, right down to correct punctuation. Even if you aren't able to contribute full evaluation tasks, you can still help. Many languages currently <b>lack translations</b> for anchor words used in evaluation, leaving hundreds of tasks unusable. You can help fill this gap by adding them following <a href="https://github.com/huggingface/lighteval/wiki/Contributing-to-multilingual-evaluations">our mini guide</a>.</p>
289
+
290
+ <p>We're looking forward to revisiting this analysis in the future, not with just 9 languages, but at least 50—thanks to community contributions! Let's level the playing field between English and other languages together! 🤗</p>
291
+ </d-article>
292
+
293
+ <d-appendix>
294
+ <d-bibliography src="bibliography.bib"></d-bibliography>
295
+ <style>
296
+ d-appendix .citation {
297
+ font-size: 11px;
298
+ line-height: 15px;
299
+ border-left: 1px solid rgba(0, 0, 0, 0.1);
300
+ padding-left: 18px;
301
+ border: 1px solid rgba(0,0,0,0.1);
302
+ background: rgba(0, 0, 0, 0.02);
303
+ padding: 10px 18px;
304
+ border-radius: 3px;
305
+ color: rgba(150, 150, 150, 1);
306
+ overflow: hidden;
307
+ margin-top: -12px;
308
+ white-space: pre-wrap;
309
+ word-wrap: break-word;
310
+ }
311
+ </style>
312
+
313
+ <h3 id="citation">Citation</h3>
314
+ <p>For attribution in academic contexts, please cite this work as</p>
315
+ <pre class="citation short">Kydlicek, et al., "FineTasks: Finding signal in a haystack of 200+ multilingual tasks", 2024.</pre>
316
+ <p>BibTeX citation</p>
317
+ <pre class="citation long">@misc{kydlicek2024finetasksmultilingualtasks,
318
+ title={FineTasks: Finding signal in a haystack of 200+ multilingual tasks},
319
+ author={Hynek Kydlíček and Guilherme Penedo and Clémentine Fourrier and Nathan Habib and Thomas Wolf},
320
+ year={2024},
+ url={https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks},
321
+ }</pre>
322
+ </d-appendix>
323
+
324
+ <script>
325
+ const article = document.querySelector('d-article');
326
+ const toc = document.querySelector('d-contents');
327
+ if (toc) {
328
+ const headings = article.querySelectorAll('h2, h3, h4');
329
+ let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
330
+ let prevLevel = 0;
331
+
332
+ for (const el of headings) {
333
+ // should element be included in TOC?
334
+ const isInTitle = el.parentElement.tagName == 'D-TITLE';
335
+ const isException = el.getAttribute('no-toc');
336
+ if (isInTitle || isException) continue;
337
+ el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
338
+ const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
339
+
340
+ const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
341
+ while (prevLevel < level) {
342
+ ToC += '<ul>'
343
+ prevLevel++;
344
+ }
345
+ while (prevLevel > level) {
346
+ ToC += '</ul>'
347
+ prevLevel--;
348
+ }
349
+ if (level === 0)
350
+ ToC += '<div>' + link + '</div>';
351
+ else
352
+ ToC += '<li>' + link + '</li>';
353
+ }
354
+
355
+ while (prevLevel > 0) {
356
+ ToC += '</ul>'
357
+ prevLevel--;
358
+ }
359
+ ToC += '</nav>';
360
+ toc.innerHTML = ToC;
361
+ toc.setAttribute('prerendered', 'true');
362
+ const toc_links = document.querySelectorAll('d-contents > nav a');
363
+
364
+ window.addEventListener('scroll', (_event) => {
365
+ if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
366
+ // Iterate backwards over the headings; highlight the first one whose top has scrolled past, then break
367
+ find_active: {
368
+ for (let i = headings.length - 1; i >= 0; i--) {
369
+ if (headings[i].getBoundingClientRect().top - 50 <= 0) {
370
+ if (!toc_links[i].classList.contains("active")) {
371
+ toc_links.forEach((link, _index) => {
372
+ link.classList.remove("active");
373
+ });
374
+ toc_links[i].classList.add('active');
375
+ }
376
+ break find_active;
377
+ }
378
+ }
379
+ toc_links.forEach((link, _index) => {
380
+ link.classList.remove("active");
381
+ });
382
+ }
383
+ }
384
+ });
385
+ }
386
+ </script>
387
+ </body>
388
+ </html>
app/src/index.js ADDED
@@ -0,0 +1,10 @@
1
+ import { initLeaderboardResults } from './leaderboard_results.js';
2
+ import { initFineTasks } from './fine_tasks.js';
3
+ import { initPlotApplets } from './plot_task.js';
4
+ import 'simple-datatables/dist/style.css';
5
+
6
+ document.addEventListener('DOMContentLoaded', () => {
7
+ initLeaderboardResults('leaderboard-results');
8
+ initFineTasks('fine-tasks-results');
9
+ initPlotApplets();
10
+ }, { once: true });
app/src/leaderboard_results.js ADDED
@@ -0,0 +1,397 @@
1
+ import Papa from 'papaparse';
2
+ import { DataTable } from 'simple-datatables';
3
+
4
+ const languageMap = {
5
+ 'All Languages': 'final_rankings.csv',
6
+ 'Arabic': 'results_ar.csv',
7
+ 'Turkish': 'results_tr.csv',
8
+ 'Swahili': 'results_sw.csv',
9
+ 'Russian': 'results_ru.csv',
10
+ 'Telugu': 'results_te.csv',
11
+ 'Thai': 'results_th.csv',
12
+ 'Chinese': 'results_zh.csv',
13
+ 'French': 'results_fr.csv',
14
+ 'Hindi': 'results_hi.csv',
15
+ };
16
+
17
+ const versionMap = {
18
+ 'v1': 'v1',
19
+ 'v2': 'v2'
20
+ };
21
+
22
+ const versionChangelog = {
23
+ 'v1': 'Initial release of FineTasks Leaderboard',
24
+ 'v2': 'Changes in v2:\n' +
25
+ '• Fixed a bug in the rescaling of scores\n' +
26
+ '• Switched to using Native choice prefixes for Thai/Telugu/Hindi/Arabic\n' +
27
+ '• Added "Options:" anchors before showing options for continuation tasks (e.g. Hellaswag) - consistent improvement in scores\n' +
28
+ '• Removed openai/gpt-4o-mini'
29
+ };
30
+
31
+ const columnNameMap = {
32
+ 'runname': 'Model',
33
+ 'agg_score_macro': 'Score',
34
+ 'agg_score_RES': 'RES Score',
35
+ 'agg_score_RC': 'RC Score',
36
+ 'agg_score_GK': 'GK Score',
37
+ 'agg_score_NLU': 'NLU Score',
38
+ 'avg_rank_macro': 'Multilingual Score',
39
+ 'rank': 'Rank'
40
+ };
41
+
42
+ function createDropdown(options, onChange, initialValue = null) {
43
+ const select = document.createElement('select');
44
+ options.forEach(option => {
45
+ const optionElement = document.createElement('option');
46
+ optionElement.value = option;
47
+ optionElement.textContent = option;
48
+ if (initialValue && option === initialValue) {
49
+ optionElement.selected = true;
50
+ }
51
+ select.appendChild(optionElement);
52
+ });
53
+ select.addEventListener('change', onChange);
54
+ return select;
55
+ }
56
+
57
+ function processTaskName(taskName) {
58
+ const parts = taskName.split('|');
59
+ let processedName = parts.length > 1 ? parts[1] : taskName;
60
+ processedName = processedName.split('_mcf')[0].split('_cf')[0];
61
+ return processedName;
62
+ }
63
+
64
+ function sanitizeColumnName(name) {
65
+ return name.replace(/[^a-zA-Z0-9-_]/g, '_');
66
+ }
67
+
68
+ function createResultsTable(data, extraColumn) {
69
+ const tableWrapper = document.createElement('div');
70
+ tableWrapper.className = 'table-wrapper leaderboard-table-wrapper';
71
+
72
+ const table = document.createElement('table');
73
+ table.className = 'results-table leaderboard-results-table';
74
+
75
+ const columns = extraColumn === 'All Languages'
76
+ ? ['rank', 'runname', 'avg_rank_macro']
77
+ : ['rank', 'runname', 'agg_score_macro', extraColumn].filter(Boolean);
78
+
79
+ const header = table.createTHead();
80
+ const headerRow = header.insertRow();
81
+ columns.forEach(column => {
82
+ const th = document.createElement('th');
83
+ th.textContent = columnNameMap[column] || processTaskName(column);
84
+ th.className = `column-${sanitizeColumnName(column)}`; // Sanitize the column name
85
+ headerRow.appendChild(th);
86
+ });
87
+
88
+ const body = table.createTBody();
89
+ data.forEach((row, index) => {
90
+ if (!row.runname) return; // Skip rows without a model name
91
+ const tr = body.insertRow();
92
+
93
+ // Add gradient background for top 3 positions
94
+ if (index < 3) {
95
+ const opacity = 1 - (index * 0.25); // Creates a fading effect: 1, 0.75, 0.5
96
+ tr.style.backgroundColor = `rgba(255, 165, 0, ${opacity * 0.2})`; // Light orange with fading opacity
97
+ tr.style.fontWeight = 600; // Make text slightly bolder for top 3
98
+ }
99
+
100
+ columns.forEach(column => {
101
+ const td = tr.insertCell();
102
+ td.className = `column-${sanitizeColumnName(column)}`;
103
+
104
+ if (column === 'rank') {
105
+ td.textContent = index + 1;
106
+ // Add special styling for top 3 ranks
107
+ if (index < 3) {
108
+ td.style.fontWeight = 'bold';
109
+ switch(index) {
110
+ case 0:
111
+ td.style.color = '#FFB800'; // Gold
112
+ break;
113
+ case 1:
114
+ td.style.color = '#C0C0C0'; // Silver
115
+ break;
116
+ case 2:
117
+ td.style.color = '#CD7F32'; // Bronze
118
+ break;
119
+ }
120
+ }
121
+ } else if (column === 'runname') {
122
+ const modelName = row[column];
123
+ let displayName;
124
+
125
+ // Check if it's a chat model
126
+ const chatModels = [
127
+ 'CohereForAI/c4ai-command-r-plus-08-2024',
128
+ 'openai/gpt-4o-mini',
129
+ 'silma-ai/SILMA-9B-Instruct-v1.0',
130
+ 'microsoft/Phi-3.5-mini-instruct',
131
+ 'TURKCELL/Turkcell-LLM-7b-v1'
132
+ ];
133
+
134
+ if (chatModels.some(chatModel => modelName.includes(chatModel))) {
135
+ displayName = `💬 ${modelName}`;
136
+ } else {
137
+ displayName = `🟢 ${modelName}`;
138
+ }
139
+
140
+ if (modelName.split("/")[0] !== "openai")
141
+ displayName = `<a href="https://huggingface.co/${modelName}">${displayName}</a>`;
142
+ td.innerHTML = displayName;
143
+ td.title = modelName; // Add full model name as tooltip
144
+ td.style.cursor = 'help'; // Change cursor to indicate hover functionality
145
+ } else {
146
+ const value = row[column];
147
+ td.textContent = typeof value === 'number' ? value.toFixed(2) : value;
148
+ }
149
+ });
150
+ });
151
+
152
+ tableWrapper.appendChild(table);
153
+ return tableWrapper;
154
+ }
155
+
156
+ function createChangelog() {
157
+ const changelogContainer = document.createElement('div');
158
+ changelogContainer.className = 'changelog-container';
159
+
160
+ const changelogHeader = document.createElement('div');
161
+ changelogHeader.className = 'changelog-header';
162
+
163
+ const arrow = document.createElement('span');
164
+ arrow.className = 'changelog-arrow';
165
+ arrow.textContent = '▶';
166
+
167
+ const label = document.createElement('span');
168
+ label.textContent = 'Changelog';
169
+ label.className = 'changelog-label';
170
+
171
+ const content = document.createElement('div');
172
+ content.className = 'changelog-content';
173
+ content.style.display = 'none';
174
+
175
+ changelogHeader.appendChild(arrow);
176
+ changelogHeader.appendChild(label);
177
+ changelogContainer.appendChild(changelogHeader);
178
+ changelogContainer.appendChild(content);
179
+
180
+ // Toggle changelog visibility
181
+ changelogHeader.addEventListener('click', () => {
182
+ const isVisible = content.style.display !== 'none';
183
+ content.style.display = isVisible ? 'none' : 'block';
184
+ arrow.textContent = isVisible ? '▶' : '▼';
185
+ });
186
+
187
+ return { container: changelogContainer, content };
188
+ }
189
+
190
+ export function initLeaderboardResults(containerId) {
191
+ const container = document.getElementById(containerId);
192
+ if (!container) return;
193
+
194
+ const titleElement = document.createElement('h3');
195
+ titleElement.textContent = 'FineTasks Leaderboard';
196
+ titleElement.className = 'leaderboard-title';
197
+
198
+ // Create changelog
199
+ const { container: changelogContainer, content: changelogContent } = createChangelog();
200
+
201
+ const tableContainer = document.createElement('div');
202
+ tableContainer.className = 'table-container';
203
+
204
+ let leaderboardDataTable;
205
+ let currentData = [];
206
+
207
+ // Create caption element
208
+ const captionElement = document.createElement('figcaption');
209
+ captionElement.className = 'table-caption';
210
+ captionElement.textContent = container.dataset.caption || '';
211
+
212
+ // Define update functions first
213
+ async function updateLanguageTable() {
214
+ const selectedVersion = versionDropdown.value;
215
+ const selectedLanguage = languageDropdown.value;
216
+ const csvFile = languageMap[selectedLanguage];
217
+
218
+ try {
219
+ const response = await fetch(`data/os_models/${selectedVersion}/${csvFile}`);
220
+ if (!response.ok) {
221
+ throw new Error(`HTTP error! status: ${response.status}`);
222
+ }
223
+ const csvText = await response.text();
224
+ const results = Papa.parse(csvText, { header: true, dynamicTyping: true }).data;
225
+ currentData = selectedLanguage === 'All Languages'
226
+ ? results.sort((a, b) => a.avg_rank_macro - b.avg_rank_macro)
227
+ : results.sort((a, b) => b.agg_score_macro - a.agg_score_macro);
228
+
229
+ if (selectedLanguage !== 'All Languages') {
230
+ const columnOptions = ['None'].concat(Object.keys(currentData[0]).filter(key =>
231
+ !['runname', 'seed', 'steps', 'agg_score_micro', 'rank', 'avg_rank_macro', ''].includes(key)
232
+ ));
233
+ extraColumnDropdown.innerHTML = '';
234
+ columnOptions.forEach(option => {
235
+ const optionElement = document.createElement('option');
236
+ optionElement.value = option;
237
+ optionElement.textContent = option === 'None' ? 'None' : processTaskName(option);
238
+ extraColumnDropdown.appendChild(optionElement);
239
+ });
240
+
241
+ extraColumnDropdown.value = 'None';
242
+ extraColumnLabel.style.display = 'inline';
243
+ extraColumnDropdown.style.display = 'inline';
244
+ } else {
245
+ extraColumnLabel.style.display = 'none';
246
+ extraColumnDropdown.style.display = 'none';
247
+ }
248
+
249
+ updateTable();
250
+ updateChangelog();
251
+ } catch (error) {
252
+ console.error('Error fetching CSV:', error);
253
+ tableContainer.innerHTML = `<p>Error loading data: ${error.message}</p>`;
254
+ }
255
+ }
256
+
257
+ function updateTable() {
258
+ const extraColumn = languageDropdown.value === 'All Languages' ? 'All Languages' :
259
+ (extraColumnDropdown.value === 'None' ? null : extraColumnDropdown.value);
260
+
261
+ tableContainer.innerHTML = '';
262
+ const tableWrapper = createResultsTable(currentData, extraColumn);
263
+ tableContainer.appendChild(tableWrapper);
264
+
265
+ if (leaderboardDataTable) {
266
+ leaderboardDataTable.destroy();
267
+ }
268
+
269
+ leaderboardDataTable = new DataTable('.leaderboard-results-table', {
270
+ perPage: 10,
271
+ perPageSelect: false,
272
+ searchable: false,
273
+ sortable: true,
274
+ fixedHeight: true,
275
+ labels: {
276
+ info: ''
277
+ }
278
+ });
279
+
280
+ setTimeout(adjustColumnWidths, 0);
281
+ }
282
+
283
+ function updateChangelog() {
284
+ const selectedVersion = versionDropdown.value;
285
+ changelogContent.textContent = versionChangelog[selectedVersion];
286
+ }
287
+
288
+ // Add this function to get URL parameters
289
+ function getUrlParameter(name) {
290
+ const urlParams = new URLSearchParams(window.location.search);
291
+ return urlParams.get(name);
292
+ }
293
+
294
+ // Add this function to set URL parameters
295
+ function updateUrlParameter(key, value) {
296
+ const urlParams = new URLSearchParams(window.location.search);
297
+ if (value) {
298
+ urlParams.set(key, value);
299
+ } else {
300
+ urlParams.delete(key);
301
+ }
302
+ const newUrl = `${window.location.pathname}${urlParams.toString() ? '?' + urlParams.toString() : ''}`;
303
+ window.history.pushState({ path: newUrl }, '', newUrl);
304
+ }
305
+
306
+ // Get initial language from URL
307
+ const urlLanguage = getUrlParameter('language');
308
+ const initialLanguage = urlLanguage && Object.keys(languageMap).includes(urlLanguage)
309
+ ? urlLanguage
310
+ : 'All Languages';
311
+
312
+ // Create dropdowns with initial values
313
+ const languageLabel = document.createElement('label');
314
+ languageLabel.textContent = 'Language: ';
315
+ const languageDropdown = createDropdown(
316
+ Object.keys(languageMap),
317
+ (e) => {
318
+ updateLanguageTable();
319
+ updateUrlParameter('language', e.target.value === 'All Languages' ? null : e.target.value);
320
+ },
321
+ initialLanguage
322
+ );
323
+
324
+ const extraColumnLabel = document.createElement('label');
325
+ extraColumnLabel.textContent = 'Task: ';
326
+ const extraColumnDropdown = createDropdown(['None'], updateTable);
327
+
328
+ const versionLabel = document.createElement('label');
329
+ versionLabel.textContent = 'Version: ';
330
+ const versionDropdown = createDropdown(Object.keys(versionMap), updateLanguageTable);
331
+
332
+ // Create controls
333
+ const controls = document.createElement('div');
334
+ controls.className = 'controls leaderboard-controls fine-tasks-controls';
335
+
336
+ const versionControlGroup = document.createElement('div');
337
+ versionControlGroup.className = 'control-group';
338
+ versionControlGroup.appendChild(versionLabel);
339
+ versionControlGroup.appendChild(versionDropdown);
340
+
341
+ const languageControlGroup = document.createElement('div');
342
+ languageControlGroup.className = 'control-group';
343
+ languageControlGroup.appendChild(languageLabel);
344
+ languageControlGroup.appendChild(languageDropdown);
345
+
346
+ const extraColumnControlGroup = document.createElement('div');
347
+ extraColumnControlGroup.className = 'control-group';
348
+ extraColumnControlGroup.appendChild(extraColumnLabel);
349
+ extraColumnControlGroup.appendChild(extraColumnDropdown);
350
+
351
+ controls.appendChild(versionControlGroup);
352
+ controls.appendChild(languageControlGroup);
353
+ controls.appendChild(extraColumnControlGroup);
354
+
355
+ // Add elements to container in new order
356
+ container.appendChild(titleElement);
357
+ container.appendChild(tableContainer);
358
+ container.appendChild(captionElement);
359
+ container.appendChild(controls);
360
+ container.appendChild(changelogContainer);
361
+
362
+ // Initialize with URL language if present
363
+ versionDropdown.value = 'v2';
364
+ languageDropdown.value = initialLanguage;
365
+ updateLanguageTable();
366
+ }
367
+
368
+ function adjustColumnWidths() {
369
+ const table = document.querySelector('.leaderboard-results-table');
370
+ if (!table) return;
371
+
372
+ const columns = table.querySelectorAll('th');
373
+ columns.forEach((column, index) => {
374
+ const columnClass = column.className;
375
+ const cells = table.querySelectorAll(`td.${columnClass}`);
376
+ let maxWidth = column.offsetWidth;
377
+ cells.forEach(cell => {
378
+ maxWidth = Math.max(maxWidth, cell.offsetWidth);
379
+ });
380
+
381
+ let adjustedWidth;
382
+ if (index === 0) { // Rank column
383
+ adjustedWidth = 50;
384
+ } else if (index === 1) { // Model name column
385
+ adjustedWidth = 200;
386
+ } else if (index === 2) { // Macro score column
387
+ adjustedWidth = 100;
388
+ } else { // Extra column or any other column
389
+ adjustedWidth = Math.min(maxWidth, 150); // Set a maximum width of 150px for other columns
390
+ }
391
+
392
+ column.style.width = `${adjustedWidth}px`;
393
+ cells.forEach(cell => {
394
+ cell.style.width = `${adjustedWidth}px`;
395
+ });
396
+ });
397
+ }
app/src/plot_task.js ADDED
@@ -0,0 +1,619 @@
1
+ import Plotly from 'plotly.js-basic-dist-min';
2
+ import Papa from 'papaparse';
3
+ import _ from 'lodash';
4
+ import { getColor } from './colors.mjs';
5
+
6
+ const languageMap = {
7
+ 'Arabic': 'ar',
8
+ 'Turkish': 'tr',
9
+ 'Swahili': 'sw',
10
+ 'Russian': 'ru',
11
+ 'Telugu': 'te',
12
+ 'Thai': 'th',
13
+ 'Chinese': 'zh',
14
+ 'French': 'fr',
15
+ 'Hindi': 'hi'
16
+ };
17
+
18
+ const runNameMap = {
19
+ "orion": "Dataset-A",
20
+ "helios": "Dataset-B",
21
+ "lynx": "Dataset-C",
22
+ "aquila": "Dataset-D",
23
+ "commoncrawl": "CommonCrawl",
24
+ "baseline": "Baseline"
25
+ };
26
+
27
+ const taskLists = {
28
+ ar: ['acva_ara:_average', 'alfgahafa_mlqa_ara_cf', 'alghafa_arc_ara_cf:easy', 'alghafa_facts_ara_cf', 'alghafa_meta_dialects_ara_cf', 'alghafa_mmlu_ara_cf:_average', 'alghafa_openbookqa_ara_cf', 'alghafa_piqa_ara_cf', 'alghafa_race_ara_cf', 'alghafa_rating_sentiment_ara_cf', 'alghafa_rating_sentiment_no_neutral_ara_cf', 'alghafa_sciqa_ara_cf', 'alghafa_sentiment_ara_cf', 'arcd_ara', 'belebele_arb_Arab_cf', 'boolq_ara', 'exams_ara_cf:_average', 'mkqa_ara:_average', 'mlmm_arc_ara_cf:challenge', 'mlmm_hellaswag_ara_cf', 'mlmm_mmlu_ara_cf:_average', 'mlmm_truthfulqa_ara_cf:mc1', 'mlmm_truthfulqa_ara_cf:mc2', 'mlqa_ara', 'mmlu_ara_cf:_average', 'soqal_ara_cf', 'toxigen_ara_cf', 'tydiqa_ara', 'xcodah_ara_cf', 'xcopa_ara_cf', 'xcsqa_ara_cf', 'xnli2.0_ara_cf', 'xnli_ara_cf', 'xquad_ara', 'xstory_cloze_ara_cf'],
29
+ fr: ['belebele_fra_Latn_cf', 'community_boolq_fra_cf', 'exams_fra_cf:_average', 'fquadv2_fra', 'frenchbench_arc_fra_cf:challenge', 'frenchbench_hellaswag_fra_cf', 'meta_mmlu_fra_cf:_average', 'mintaka_fra', 'mkqa_fra:_average', 'mlmm_arc_fra_cf:challenge', 'mlmm_hellaswag_fra_cf', 'mlmm_mmlu_fra_cf:_average', 'mlmm_truthfulqa_fra_cf:mc1', 'mlmm_truthfulqa_fra_cf:mc2', 'pawsx_fra_cf', 'xcodah_fra_cf', 'xcsqa_fra_cf', 'xnli2.0_fra_cf', 'xwinograd_fra_cf'],
30
+ hi: ['belebele_hin_Deva_cf', 'community_arc_hin_cf:challenge', 'community_arc_hin_cf:easy', 'community_boolq_hin', 'community_hellaswag_hin_cf', 'indicnxnli_hin_cf', 'indicqa_hin', 'indicxcopa_hin_cf', 'meta_mmlu_hin_cf:_average', 'mintaka_hin', 'mlmm_arc_hin_cf:challenge', 'mlmm_hellaswag_hin_cf', 'mlmm_mmlu_hin_cf:_average', 'mlmm_truthfulqa_hin_cf:mc1', 'mlmm_truthfulqa_hin_cf:mc2', 'mlqa_hin', 'xcodah_hin_cf', 'xcsqa_hin_cf', 'xnli2.0_hin_cf', 'xnli_hin_cf', 'xquad_hin', 'xstory_cloze_hin_cf'],
31
+ ru: ['belebele_rus_Cyrl_cf', 'chegeka_rus', 'mathlogic_qa_rus_cf', 'mera_openbookqa_rus_cf', 'mera_worldtree_rus_cf', 'mkqa_rus:_average', 'mlmm_arc_rus_cf:challenge', 'mlmm_hellaswag_rus_cf', 'mlmm_mmlu_rus_cf:_average', 'mlmm_truthfulqa_rus_cf:mc1', 'mlmm_truthfulqa_rus_cf:mc2', 'parus_rus_cf', 'rcb_rus_cf', 'rummlu_rus_cf:_average', 'sber_squad_rus', 'tydiqa_rus', 'xcodah_rus_cf', 'xcsqa_rus_cf', 'xnli2.0_rus_cf', 'xquad_rus', 'xstory_cloze_rus_cf', 'xwinograd_rus_cf'],
32
+ sw: ['afric_mmlu_swa_cf:_average', 'afric_xnli_swa_cf', 'belebele_swh_Latn_cf', 'community_arc_swa_cf:challenge', 'community_arc_swa_cf:easy', 'community_mmlu_swa_cf', 'kenswquad_swa', 'm3exams_swa_cf', 'openai_mmlu_swa_cf:_average', 'tydiqa_swa', 'xcodah_swa_cf', 'xcopa_swa_cf', 'xcsqa_swa_cf', 'xnli2.0_swa_cf', 'xnli_swa_cf', 'xstory_cloze_swa_cf'],
33
+ te: ['belebele_tel_Telu_cf', 'community_hellaswag_tel_cf', 'indicnxnli_tel_cf', 'indicqa_tel', 'indicxcopa_tel_cf', 'mlmm_arc_tel_cf:challenge', 'mlmm_hellaswag_tel_cf', 'mlmm_mmlu_tel_cf:_average', 'mlmm_truthfulqa_tel_cf:mc1', 'mlmm_truthfulqa_tel_cf:mc2', 'tydiqa_tel', 'xstory_cloze_tel_cf'],
34
+ th: ['belebele_tha_Thai_cf', 'community_hellaswag_tha_cf', 'm3exams_tha_cf', 'meta_mmlu_tha_cf:_average', 'mkqa_tha:_average', 'thai_exams_tha_cf:_average', 'thai_exams_tha_cf:tgat', 'thaiqa_tha', 'wsci_tha_cf', 'xcopa_tha_cf', 'xnli2.0_tha_cf', 'xnli_tha_cf', 'xquad_tha'],
35
+ tr: ['belebele_tur_Latn_cf', 'community_arc_tur_cf:easy', 'community_hellaswag_tur_cf', 'community_mmlu_tur_cf:_average', 'community_truthfulqa_tur_cf:mc1', 'community_truthfulqa_tur_cf:mc2', 'community_xwinograd_tur_cf', 'exams_tur_cf:_average', 'mkqa_tur:_average', 'tquadv2_tur', 'xcopa_tur_cf', 'xnli2.0_tur_cf', 'xnli_tur_cf', 'xquad_tur'],
36
+ zh: ['agieval_zho_cf:_average', 'belebele_zho_Hans_cf', 'c3_zho_cf', 'ceval_zho_cf:_average', 'chinese_squad_zho', 'cmath_zho_cf', 'cmmlu_zho_cf:_average', 'cmnli_zho_cf', 'cmrc2018_zho', 'm3exams_zho_cf', 'mkqa_zho:_average', 'mlmm_arc_zho_cf:challenge', 'mlmm_hellaswag_zho_cf', 'mlmm_mmlu_zho_cf:_average', 'mlmm_truthfulqa_zho_cf:mc1', 'mlmm_truthfulqa_zho_cf:mc2', 'ocnli_zho_cf', 'pawsx_zho_cf', 'xcodah_zho_cf', 'xcopa_zho_cf', 'xcsqa_zho_cf', 'xnli2.0_zho_cf', 'xnli_zho_cf', 'xquad_zho', 'xstory_cloze_zho_cf', 'xwinograd_zho_cf']
37
+ };
38
+
39
+ const LINE_SETTINGS = {
40
+ width: 2.5,
41
+ type: "scatter",
42
+ mode: "lines+markers",
43
+ };
44
+
45
+ const DEFAULT_LAYOUT = {
46
+ font: {
47
+ family: "apple-system, Arial, sans-serif",
48
+ },
49
+ title: {
50
+ font: {
51
+ size: 15,
52
+ },
53
+ },
54
+ xaxis: {
55
+ title: {
56
+ text: "Training Tokens (billions)",
57
+ font: {
58
+ size: 14,
59
+ },
60
+ },
61
+ tickfont: {
62
+ size: 12,
63
+ },
64
+ showgrid: false,
65
+ mirror: true,
66
+ ticks: "outside",
67
+ showline: true,
68
+ },
69
+ yaxis: {
70
+ title: {
71
+ font: {
72
+ size: 14,
73
+ },
74
+ standoff: 10,
75
+ },
76
+ showgrid: false,
77
+ mirror: true,
78
+ ticks: "outside",
79
+ showline: true,
80
+ tickfont: {
81
+ size: 12,
82
+ },
83
+ },
84
+ height: 300, // You can adjust this value
85
+ autosize: true,
86
+ legend: {
87
+ orientation: 'h', // Set to 'h' for horizontal legend (required for columns)
88
+ yanchor: 'bottom',
89
+ y: 0, // Position at the bottom
90
+ xanchor: 'right',
91
+ x: 1, // Position at the right
92
+ traceorder: 'normal',
93
+ font: { size: 12 },
94
+ tracegroupgap: 0, // Space between legend items
95
+ bgcolor: 'rgba(255, 255, 255, 0.8)' // White background at 80% opacity
96
+ },
97
+ margin: {
98
+ t: 25,
99
+ b: 60,
100
+ l: 60,
101
+ r: 40,
102
+ },
103
+ };
104
+
105
+ export function initPlotApplets() {
106
+ const plotContainers = document.querySelectorAll('.task-signal-plot');
107
+ plotContainers.forEach(container => {
108
+ initPlotApplet(container);
109
+ });
110
+ }
111
+
112
+ function initPlotApplet(container) {
113
+ const defaultLanguage = container.dataset.language || 'Arabic';
114
+ const defaultTask = container.dataset.task || '';
115
+ const defaultMetric = container.dataset.metric || '';
116
+ const groupSeeds = container.dataset.groupSeeds === 'true';
117
+ const showControls = container.dataset.showControls === 'true';
118
+ const taskMetrics = (container.dataset.taskMetrics || 'monotonicity,snr,ordering,randomness').split(",");
119
+
120
+ const controls = createControls(container, defaultLanguage, defaultTask, defaultMetric, taskMetrics);
121
+ if (!showControls)
122
+ controls.style.display = 'none';
123
+ container.appendChild(controls);
124
+
125
+ const plotContainer = document.createElement('div');
126
+ plotContainer.className = 'plot-container';
127
+ container.appendChild(plotContainer);
128
+
129
+ const statsContainer = document.createElement('div');
130
+ statsContainer.className = 'stats-container';
131
+ container.appendChild(statsContainer);
132
+
133
+
134
+ // Create an initial empty plot
135
+ Plotly.newPlot(plotContainer, []);
136
+
137
+ // Set up the resize function
138
+ const resizePlot = () => {
139
+ const width = container.offsetWidth;
140
+ Plotly.relayout(plotContainer, { width: width });
141
+ };
142
+
143
+ // Add resize listener
144
+ window.addEventListener('resize', resizePlot);
145
+
146
+ // Initial resize
147
+ resizePlot();
148
+
149
+ // Load the initial data
150
+ updateLanguageTasks(container, defaultTask, defaultMetric, groupSeeds, taskMetrics);
151
+ }
152
+
153
+ function createControls(container, defaultLanguage, defaultTask, defaultMetric, taskMetrics) {
154
+ const controls = document.createElement('div');
155
+ controls.className = 'controls';
156
+
157
+ const languageSelect = createSelect('language', Object.keys(languageMap), () => updateLanguageTasks(container, '', '', true, taskMetrics));
158
+ languageSelect.value = defaultLanguage;
159
+
160
+ const taskSelect = createSelect('task', [], () => updateMetrics(container, '', true, taskMetrics));
161
+ const metricSelect = createSelect('metric', [], () => updatePlot(container, taskMetrics));
162
+
163
+ controls.appendChild(createControlGroup('Language:', languageSelect));
164
+ controls.appendChild(createControlGroup('Task:', taskSelect));
165
+ controls.appendChild(createControlGroup('Metric:', metricSelect));
166
+
167
+ return controls;
168
+ }
169
+
170
+ function createSelect(id, options, onChangeHandler) {
171
+ const select = document.createElement('select');
172
+ select.id = id;
173
+ options.forEach(option => {
174
+ const optionElement = document.createElement('option');
175
+ optionElement.value = option;
176
+ optionElement.textContent = option;
177
+ select.appendChild(optionElement);
178
+ });
179
+ select.addEventListener('change', onChangeHandler);
180
+ return select;
181
+ }
182
+
183
+ function createControlGroup(labelText, inputElement) {
184
+ const group = document.createElement('div');
185
+ group.className = 'control-group';
186
+
187
+ const label = document.createElement('label');
188
+ label.textContent = labelText;
189
+ label.className = 'control-label';
190
+
191
+ group.appendChild(label);
192
+ group.appendChild(inputElement);
193
+
194
+ return group;
195
+ }
196
+
197
+ async function updateLanguageTasks(container, defaultTask = '', defaultMetric = '', groupSeeds, taskMetrics) {
198
+ const languageSelect = container.querySelector('#language');
199
+ const taskSelect = container.querySelector('#task');
200
+ const language = languageSelect.value;
201
+ const langCode = languageMap[language];
202
+
203
+ taskSelect.innerHTML = '<option value="">Loading tasks...</option>';
204
+
205
+ try {
206
+ const tasks = await getTasksForLanguage(langCode);
207
+
208
+ taskSelect.innerHTML = '';
209
+ if (tasks.length > 0) {
210
+ tasks.forEach(task => {
211
+ const option = document.createElement('option');
212
+ option.value = task;
213
+ option.textContent = truncateText(task, 25); // Reduced from 30 to 25
214
+ option.title = task; // Set full task name as title for tooltip
215
+ taskSelect.appendChild(option);
216
+ });
217
+
218
+ if (defaultTask && tasks.includes(defaultTask)) {
219
+ taskSelect.value = defaultTask;
220
+ } else {
221
+ taskSelect.selectedIndex = 0;
222
+ }
223
+
224
+ await updateMetrics(container, defaultMetric, groupSeeds, taskMetrics);
225
+ } else {
226
+ taskSelect.innerHTML = '<option value="">No tasks available</option>';
227
+ clearPlot(container);
228
+ }
229
+ } catch (error) {
230
+ console.error('Error fetching tasks:', error);
231
+ taskSelect.innerHTML = '<option value="">Error loading tasks</option>';
232
+ clearPlot(container);
233
+ }
234
+ }
235
+
236
+ async function getTasksForLanguage(langCode) {
237
+ return taskLists[langCode] || [];
238
+ }
239
+
240
+ async function updateMetrics(container, defaultMetric = '', groupSeeds, taskMetrics) {
241
+ const language = container.querySelector('#language').value;
242
+ const task = container.querySelector('#task').value;
243
+ const langCode = languageMap[language];
244
+ const metricSelect = container.querySelector('#metric');
245
+
246
+ metricSelect.innerHTML = '<option value="">Loading metrics...</option>';
247
+
248
+ try {
249
+ const metrics = await getMetricsForTask(langCode, task);
250
+
251
+ metricSelect.innerHTML = '';
252
+ metrics.forEach(metric => {
253
+ const option = document.createElement('option');
254
+ option.value = metric;
255
+ option.textContent = metric;
256
+ metricSelect.appendChild(option);
257
+ });
258
+
259
+ if (defaultMetric && metrics.includes(defaultMetric)) {
260
+ metricSelect.value = defaultMetric;
261
+ } else if (metricSelect.options.length > 0) {
262
+ metricSelect.selectedIndex = 0;
263
+ }
264
+
265
+ await updatePlot(container, taskMetrics);
266
+ } catch (error) {
267
+ console.error('Error fetching metrics:', error);
268
+ metricSelect.innerHTML = '<option value="">Error loading metrics</option>';
269
+ clearPlot(container);
270
+ }
271
+ }
272
+
273
+ async function getMetricsForTask(langCode, task) {
274
+ return new Promise((resolve, reject) => {
275
+ Papa.parse(`data/nanotron_tasks/${langCode}/${task}_stats.csv`, {
276
+ download: true,
277
+ header: true,
278
+ complete: function(results) {
279
+ const metrics = [...new Set(results.data.map(row => row.metric).filter(metric => metric))];
280
+ resolve(metrics);
281
+ },
282
+ error: function(error) {
283
+ console.error('Error fetching metrics:', error);
284
+ reject(error);
285
+ }
286
+ });
287
+ });
288
+ }
289
+
290
+ function updatePlot(container, taskMetrics) {
291
+ const language = container.querySelector('#language').value;
292
+ const task = container.querySelector('#task').value;
293
+ const metric = container.querySelector('#metric').value;
294
+ const title = container.dataset.title;
295
+ const langCode = languageMap[language];
296
+
297
+ if (!langCode || !task || !metric) {
298
+ clearPlot(container);
299
+ return;
300
+ }
301
+
302
+ const dataUrl = `data/nanotron_tasks/${langCode}/${task}_data.csv`;
303
+ const statsUrl = `data/nanotron_tasks/${langCode}/${task}_stats.csv`;
304
+
305
+ Promise.all([
306
+ new Promise((resolve, reject) => {
307
+ Papa.parse(dataUrl, {
308
+ download: true,
309
+ header: true,
310
+ dynamicTyping: true,
311
+ complete: resolve,
312
+ error: reject
313
+ });
314
+ }),
315
+ new Promise((resolve, reject) => {
316
+ Papa.parse(statsUrl, {
317
+ download: true,
318
+ header: true,
319
+ dynamicTyping: true,
320
+ complete: resolve,
321
+ error: reject
322
+ });
323
+ })
324
+ ]).then(([dataResult, statsResult]) => {
325
+ const taskData = dataResult.data;
326
+ const statsData = statsResult.data;
327
+ plotData(container, taskData, statsData, metric, title, taskMetrics);
328
+ }).catch(error => {
329
+ console.error('Error parsing CSV:', error);
330
+ clearPlot(container);
331
+ });
332
+ }
333
+
334
+ function plotData(container, data, stats, metric, title, taskMetrics) {
335
+ const groupSeeds = container.dataset.groupSeeds === 'true';
336
+ const sortedData = sortDataByTokens(data);
337
+ const groupedData = groupDataByRunname(sortedData, groupSeeds, metric);
338
+ const interpolatedData = interpolateData(groupedData, metric);
339
+ const smoothedData = smoothData(interpolatedData, metric);
340
+ const traces = createTraces(smoothedData, metric);
341
+
342
+ const plotContainer = container.querySelector('.plot-container');
343
+
344
+ const layout = _.merge({}, DEFAULT_LAYOUT, {
345
+ title: { text: `${title}` },
346
+ xaxis: {
347
+ title: { text: 'Training Tokens (billions)' },
348
+ tickvals: [0, 5, 10, 15, 20, 25],
349
+ ticktext: ['0', '5B', '10B', '15B', '20B', '25B'],
350
+ tickangle: 45,
351
+ range: [0, 30], // Set the range to start from 0 and end at 30B
352
+ },
353
+ yaxis: {
354
+ title: { text: 'Score' },
355
+ range: [Math.min(...traces.flatMap(trace => trace.y)) * 0.95, Math.max(...traces.flatMap(trace => trace.y)) * 1.05], // Add 5% padding to the top and bottom
356
+ },
357
+ width: container.offsetWidth,
358
+ });
359
+
360
+ Plotly.newPlot(plotContainer, traces, layout, {responsive: true});
361
+
362
+ // Display statistics
363
+ displayStatistics(container, stats, metric, taskMetrics);
364
+ }
365
+
366
+ function displayStatistics(container, stats, metric, taskMetrics) {
367
+ const statsContainer = container.querySelector('.stats-container');
368
+ const metricStats = stats.find(stat => stat.metric === metric);
369
+ if (metricStats) {
370
+ statsContainer.innerHTML = `
371
+ <div class="compact-stats${taskMetrics.length === 1 ? '-single' : ''}">
372
+ ${taskMetrics.includes('monotonicity') ? '<span title="Average Spearman Correlation">Monotonicity: ' + metricStats.avg_spearman.toFixed(2) + '</span>' : ''}
373
+ ${taskMetrics.includes('snr') ? '<span title="Average Signal-to-Noise Ratio">Signal-to-Noise: ' + metricStats.avg_snr.toFixed(2) + '</span>' : ''}
374
+ ${taskMetrics.includes('ordering') ? '<span title="Average Kendall Tau-a">Ordering Consistency: ' + metricStats.avg_kendall_tau_a.toFixed(2) + '</span>' : ''}
375
+ ${taskMetrics.includes('randomness') ? '<span title="Max N Standard Deviations">Non-Randomness: ' + metricStats.max_n_std.toFixed(2) + '</span>' : ''}
376
+ </div>
377
+ `;
378
+ } else {
379
+ statsContainer.innerHTML = '<p>No statistics available for this metric.</p>';
380
+ }
381
+ }
382
+
383
+ function getReducedTickValues(tokens) {
384
+ const uniqueTokens = [...new Set(tokens)].sort((a, b) => a - b);
385
+ const tokenCount = uniqueTokens.length;
386
+ const targetTickCount = 10; // Adjust this value to increase/decrease the number of ticks
387
+
388
+ if (tokenCount <= targetTickCount) {
389
+ return uniqueTokens;
390
+ }
391
+
392
+ const stride = Math.ceil(tokenCount / targetTickCount);
393
+ return uniqueTokens.filter((_, index) => index % stride === 0);
394
+ }
395
+
396
+ function formatTickLabel(value) {
397
+ if (value >= 1e9) {
398
+ return (value / 1e9).toFixed(1) + 'B';
399
+ } else if (value >= 1e6) {
400
+ return (value / 1e6).toFixed(1) + 'M';
401
+ } else if (value >= 1e3) {
402
+ return (value / 1e3).toFixed(1) + 'K';
403
+ }
404
+ return value.toString();
405
+ }
406
+
407
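+ // Recompute the four reliability statistics shown in the UI (monotonicity via Spearman, ordering via Kendall tau-a, signal-to-noise, and non-randomness vs. the baseline) from raw run data.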
+ function computeStatistics(data, metric) {
408
+ const stats = {
409
+ avg_spearman: 0,
410
+ avg_kendall_tau_a: 0,
411
+ avg_snr: 0,
412
+ max_n_std: 0
413
+ };
414
+
415
+ const baselineRun = Object.keys(data).find(key => key.toLowerCase().includes('baseline'));
416
+ const nonBaselineRuns = Object.keys(data).filter(key => key !== baselineRun);
417
+
418
+ // Compute statistics for each non-baseline run
419
+ nonBaselineRuns.forEach(run => {
420
+ const runData = data[run];
421
+ const tokens = runData.map(row => row.tokens);
422
+ const scores = runData.map(row => row[metric]);
423
+
424
+ // Spearman correlation
425
+ stats.avg_spearman += spearmanCorrelation(tokens, scores);
426
+
427
+ // Kendall Tau-a
428
+ const lastHalf = Math.floor(runData.length / 2);
429
+ const kendallTauValues = [];
430
+ for (let i = lastHalf; i < runData.length - 1; i++) {
431
+ kendallTauValues.push(kendallTauA(scores.slice(0, i + 1), scores.slice(0, i + 2)));
432
+ }
433
+ stats.avg_kendall_tau_a += _.mean(kendallTauValues);
434
+
435
+ // SNR and max_n_std
436
+ if (baselineRun) {
437
+ const baselineScores = data[baselineRun].map(row => row[metric]);
438
+ const stdDev = standardDeviation(scores);
439
+ stats.avg_snr += _.mean(scores) / stdDev;
440
+ stats.max_n_std = Math.max(stats.max_n_std, (_.max(scores) - _.mean(baselineScores)) / stdDev);
441
+ }
442
+ });
443
+
444
+ // Average the statistics
445
+ const numRuns = nonBaselineRuns.length;
446
+ stats.avg_spearman /= numRuns;
447
+ stats.avg_kendall_tau_a /= numRuns;
448
+ stats.avg_snr /= numRuns;
449
+
450
+ return stats;
451
+ }
452
+
453
+ function spearmanCorrelation(x, y) {
454
+ const n = x.length;
455
+ const rankX = rankData(x);
456
+ const rankY = rankData(y);
457
+
458
+ let sum_d_squared = 0;
459
+ for (let i = 0; i < n; i++) {
460
+ const d = rankX[i] - rankY[i];
461
+ sum_d_squared += d * d;
462
+ }
463
+
464
+ return 1 - (6 * sum_d_squared) / (n * (n * n - 1));
465
+ }
466
+
467
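+ // Assign 1-based ranks; tied values all receive the rank of their first occurrence in the sorted array (simple O(n^2) ranking).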
+ function rankData(data) {
468
+ const sorted = [...data].sort((a, b) => a - b);
469
+ return data.map(x => sorted.indexOf(x) + 1);
470
+ }
471
+
472
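+ // Kendall tau-a: pairwise concordance between two score sequences, without tie correction; used above to measure ordering consistency.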
+ function kendallTauA(x, y) {
473
+ const n = x.length;
474
+ let concordant = 0;
475
+ let discordant = 0;
476
+
477
+ for (let i = 0; i < n; i++) {
478
+ for (let j = i + 1; j < n; j++) {
479
+ const sign_x = Math.sign(x[j] - x[i]);
480
+ const sign_y = Math.sign(y[j] - y[i]);
481
+ if (sign_x * sign_y > 0) concordant++;
482
+ else if (sign_x * sign_y < 0) discordant++;
483
+ }
484
+ }
485
+
486
+ return (concordant - discordant) / (n * (n - 1) / 2);
487
+ }
488
+
489
+ function standardDeviation(values) {
490
+ const mean = _.mean(values);
491
+ const squareDiffs = values.map(value => {
492
+ const diff = value - mean;
493
+ return diff * diff;
494
+ });
495
+ const avgSquareDiff = _.mean(squareDiffs);
496
+ return Math.sqrt(avgSquareDiff);
497
+ }
498
+
499
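+ // Linearly interpolate each run onto the union of all token counts, so every run can be compared at the same x positions.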
+ function interpolateData(data, metric) {
500
+ return _.mapValues(data, (rows) => {
501
+ const sortedRows = _.sortBy(rows, 'tokens');
502
+ const allTokens = _.uniq(_.flatMap(Object.values(data), rows => rows.map(r => r.tokens))).sort((a, b) => a - b);
503
+
504
+ return allTokens.map(token => {
505
+ const exactMatch = _.find(sortedRows, { tokens: token });
506
+ if (exactMatch) return exactMatch;
507
+
508
+ const lowerRow = _.findLast(sortedRows, r => r.tokens < token);
509
+ const upperRow = _.find(sortedRows, r => r.tokens > token);
510
+
511
+ if (!lowerRow) return { ...upperRow, tokens: token };
512
+ if (!upperRow) return { ...lowerRow, tokens: token };
513
+
514
+ const ratio = (token - lowerRow.tokens) / (upperRow.tokens - lowerRow.tokens);
515
+ const interpolatedMetric = lowerRow[metric] + (upperRow[metric] - lowerRow[metric]) * ratio;
516
+
517
+ return {
518
+ ...lowerRow,
519
+ tokens: token,
520
+ [metric]: interpolatedMetric
521
+ };
522
+ });
523
+ });
524
+ }
525
+
526
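+ // Trailing moving average over up to windowSize points, reducing checkpoint-to-checkpoint noise before plotting.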
+ function smoothData(data, metric, windowSize = 3) {
527
+ return _.mapValues(data, (rows) => {
528
+ return rows.map((row, index, array) => {
529
+ const window = array.slice(Math.max(0, index - windowSize + 1), index + 1);
530
+ const smoothedMetric = _.meanBy(window, r => r[metric]);
531
+ return { ...row, [metric]: smoothedMetric };
532
+ });
533
+ });
534
+ }
535
+
536
+ function sortDataByTokens(data) {
537
+ return _.sortBy(data, 'tokens');
538
+ }
539
+
540
+ function groupDataByRunname(data, groupSeeds, metric) {
541
+ // Remove null or undefined runs
542
+ data = data.filter(row => row.runname != null && row.runname !== 'null_undefined');
543
+
544
+ if (!groupSeeds) {
545
+ return _.groupBy(data, row => `${processRunName(row.runname)}_${row.seed}`);
546
+ }
547
+
548
+ const grouped = _.groupBy(data, row => processRunName(row.runname));
549
+
550
+ return _.mapValues(grouped, (rows) => {
551
+ const stepGroups = _.groupBy(rows, 'tokens');
552
+ return _.map(stepGroups, (stepRows) => {
553
+ const meanMetric = _.meanBy(stepRows, row => parseFloat(row[metric]) || 0);
554
+ return {
555
+ ...stepRows[0],
556
+ [metric]: meanMetric
557
+ };
558
+ });
559
+ });
560
+ }
561
+
562
+ function processRunName(runname) {
563
+ for (const [key, value] of Object.entries(runNameMap)) {
564
+ if (runname.includes(key)) {
565
+ return value;
566
+ }
567
+ }
568
+ return runname;
569
+ }
570
+
571
+ function createTraces(groupedData, metric) {
572
+ const colorsMapping = new Map();
573
+ const sortedRunnames = Object.keys(groupedData).sort((a, b) => {
574
+ if (a.includes('baseline')) return 1;
575
+ if (b.includes('baseline')) return -1;
576
+ return a.localeCompare(b);
577
+ });
578
+
579
+ return sortedRunnames.map((runname, index) => {
580
+ const color = getColorForTrace(runname, colorsMapping, index);
581
+ return {
582
+ x: groupedData[runname].map(row => row.tokens),
583
+ y: groupedData[runname].map(row => row[metric]),
584
+ name: runname,
585
+ line: {
586
+ color: color,
587
+ shape: 'spline',
588
+ ...LINE_SETTINGS
589
+ },
590
+ marker: {
591
+ color: color,
592
+ size: 6,
593
+ },
594
+ mode: 'lines+markers',
595
+ };
596
+ });
597
+ }
598
+
599
+ function getColorForTrace(traceName, colorsMapping, index) {
600
+ const reusedColor = colorsMapping.get(traceName);
601
+ if (reusedColor) {
602
+ return reusedColor;
603
+ }
604
+
605
+ const color = getColor(index);
606
+ colorsMapping.set(traceName, color);
607
+ return color;
608
+ }
609
+
610
+ function clearPlot(container) {
611
+ const plotContainer = container.querySelector('.plot-container');
612
+ Plotly.purge(plotContainer);
613
+ }
614
+
615
+ function truncateText(text, maxLength) {
616
+ if (text.length <= maxLength) return text;
617
+ return text.substr(0, maxLength - 2) + '..';
618
+ }
619
+
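Taken together, the helpers above form a small plotting pipeline: rows are grouped per run (optionally averaging seeds), aligned on a shared token axis, smoothed, and converted into Plotly traces. A minimal sketch of the interpolation and smoothing steps on made-up toy data, assuming lodash is in scope as `_` just as in the functions above (the run names, token counts, and `acc` metric are hypothetical):

// Hypothetical toy data: two runs measured at different token counts.
const toyData = {
  run_a: [
    { runname: 'run_a', tokens: 1, acc: 0.30 },
    { runname: 'run_a', tokens: 3, acc: 0.38 },
  ],
  run_b: [
    { runname: 'run_b', tokens: 2, acc: 0.33 },
    { runname: 'run_b', tokens: 3, acc: 0.41 },
  ],
};

// Align both runs on the union of token counts {1, 2, 3}: run_a gets a
// linearly interpolated point at tokens=2 (acc = 0.34), while run_b is
// clamped to its first measurement for tokens=1.
const aligned = interpolateData(toyData, 'acc');

// Smooth each curve with the trailing 3-point moving average.
const smoothed = smoothData(aligned, 'acc');
console.log(smoothed.run_a.map(r => r.acc));

With real data, the output of smoothData would then feed createTraces and Plotly.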
app/src/stats.js ADDED
File without changes
app/src/style.css ADDED
@@ -0,0 +1,428 @@
/* style.css */
/* Define colors */
:root {
  --distill-gray: rgb(107, 114, 128);
  --distill-gray-light: rgb(185, 185, 185);
  --distill-gray-lighter: rgb(228, 228, 228);
  --distill-gray-lightest: rgb(245, 245, 245);
  --distill-blue: #007BFF;
}

d-byline .byline {
  grid-template-columns: 1fr;
  grid-column: text;
  font-size: 0.9rem;
  line-height: 1.8em;
}

@media (min-width: 768px) {
  d-byline .byline {
    grid-template-columns: 5fr 1fr 1fr;
  }
}

d-contents > nav a.active {
  text-decoration: underline;
}

@media (max-width: 1199px) {
  d-contents {
    display: none;
    justify-self: start;
    align-self: start;
    padding-bottom: 0.5em;
    margin-bottom: 1em;
    padding-left: 0.25em;
    border-bottom: 1px solid rgba(0, 0, 0, 0.1);
  }
}

d-contents a:hover {
  border-bottom: none;
}

@media (min-width: 1200px) {
  d-article {
    /* Ensure d-article does not prevent sticky positioning */
    overflow: visible;
  }

  d-contents {
    align-self: start;
    grid-column-start: 1 !important;
    grid-column-end: 4 !important;
    grid-row: auto / span 6;
    justify-self: end;
    margin-top: 0em;
    padding-right: 3em;
    padding-left: 2em;
    /*border-right: 1px solid rgba(0, 0, 0, 0.1);*/
    position: -webkit-sticky; /* For Safari */
    position: sticky;
    top: 10px; /* Adjust this value if needed */
    z-index: -1;
  }
}

d-contents nav h3 {
  margin-top: 0;
  margin-bottom: 1em;
}

d-contents nav div {
  color: rgba(0, 0, 0, 0.8);
  font-weight: bold;
}

d-contents nav a {
  color: rgba(0, 0, 0, 0.8);
  border-bottom: none;
  text-decoration: none;
}

d-contents li {
  list-style-type: none;
}

d-contents ul, d-article d-contents ul {
  padding-left: 1em;
}

d-contents nav ul li {
  margin-bottom: .25em;
}

d-contents nav a:hover {
  text-decoration: underline solid rgba(0, 0, 0, 0.6);
}

d-contents nav ul {
  margin-top: 0;
  margin-bottom: 6px;
}

d-contents nav > div {
  display: block;
  outline: none;
  margin-bottom: 0.5em;
}

d-contents nav > div > a {
  font-size: 13px;
  font-weight: 600;
}

d-article aside {
  display: none;
}

@media (min-width: 768px) {
  d-article aside {
    display: block;
    height: 0px;
    overflow: visible;
    margin-bottom: 0;
    z-index: 1000;
  }
}

d-contents nav > div > a:hover,
d-contents nav > ul > li > a:hover {
  text-decoration: none;
}

/* Controls for the plotting applet */
.controls {
  display: flex;
  flex-wrap: nowrap;
  gap: 10px;
  justify-content: center;
}

.control-group {
  display: flex;
  flex-direction: column;
  align-items: center;
}

.controls select {
  padding: 2px 4px;
  line-height: 1.5em;
  text-align: center;
  border-radius: 4px;
  font-size: 0.7em;
  background-color: var(--distill-gray-lightest);
  outline: none;
}

.controls label {
  font-size: 0.8em;
  font-weight: bold;
}

/* Specific style for the task dropdown */
#task {
  max-width: 180px;
}

.controls select option {
  max-width: 300px;
  overflow: visible;
}

.task-signal-plot {
  width: 100%;
  max-width: 500px; /* Adjust this value as needed */
  margin: 0 auto;
}

.stats-container {
  margin-bottom: 5px;
}

.compact-stats {
  display: grid;
  place-items: center;
  grid-template-columns: 1fr 1fr;
  gap: 5px;
  font-weight: bold;
  font-size: 12px;
}

.compact-stats-single {
  display: grid;
  place-items: center;
  font-weight: bold;
  font-size: 12px;
}

.fine-tasks-controls,
.leaderboard-controls {
  margin-bottom: 20px;
}

.fine-tasks-table-wrapper {
  margin-top: 20px;
}

.fine-tasks-results-table {
  width: 100%;
  border-collapse: separate;
  border-spacing: 0;
  table-layout: fixed; /* Ensures the table respects column widths */
}

.fine-tasks-results-table th,
.fine-tasks-results-table td {
  border: 1px solid #ddd;
  padding: 8px;
  text-align: left;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.fine-tasks-results-table th {
  background-color: #f9f9f9;
  font-weight: bold;
}

.fine-tasks-results-table tr:nth-child(even) {
  background-color: #f2f2f2;
}

.fine-tasks-results-table tr:nth-child(odd) {
  background-color: #fff;
}

.fine-tasks-results-table tr:hover {
  background-color: #ddd;
}

/* Remove the horizontal line above the pagination */
.fine-tasks-table-wrapper .datatable-bottom {
  border-top: none;
}

/* Hide the "Showing X to Y of Z entries" text */
.fine-tasks-table-wrapper .datatable-info {
  display: none;
}

.fine-tasks-title {
  text-align: center;
  margin-top: 20px;
  margin-bottom: 20px;
}

.fine-tasks-results-table td[title] {
  cursor: help;
}

.leaderboard-title {
  text-align: center;
  margin-top: 20px;
  margin-bottom: 20px;
}

.leaderboard-results-table {
  width: 100%;
  border-collapse: separate;
  border-spacing: 0;
  table-layout: fixed;
}

.leaderboard-results-table th,
.leaderboard-results-table td {
  border: 1px solid #ddd;
  padding: 8px;
  text-align: left;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.leaderboard-results-table th:first-child,
.leaderboard-results-table td:first-child {
  width: 50px; /* Rank column */
}

.leaderboard-results-table th:nth-child(2),
.leaderboard-results-table td:nth-child(2) {
  width: 200px; /* Model name column */
}

.leaderboard-results-table th:nth-child(3),
.leaderboard-results-table td:nth-child(3) {
  width: 100px; /* Macro score column */
}

.leaderboard-results-table th:nth-child(4),
.leaderboard-results-table td:nth-child(4) {
  width: 150px; /* Extra column */
}

.leaderboard-results-table th {
  background-color: #f9f9f9;
  font-weight: bold;
}

.leaderboard-results-table tr:nth-child(even) {
  background-color: #f2f2f2;
}

.leaderboard-results-table tr:hover {
  background-color: #ddd;
}

/* Remove the horizontal line above the pagination */
.leaderboard-table-wrapper .datatable-bottom {
  border-top: none;
}

/* Hide the "Showing X to Y of Z entries" text */
.leaderboard-table-wrapper .datatable-info {
  display: none;
}

.leaderboard-results-table td[title] {
  cursor: help;
}

/* Tooltip styles */
.leaderboard-results-table td[title]:hover::after {
  content: attr(title);
  position: absolute;
  left: 0;
  top: 100%;
  background-color: #f9f9f9;
  color: #000;
  padding: 5px;
  border: 1px solid #ddd;
  border-radius: 4px;
  z-index: 1000;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 300px;
}

.table-caption {
  text-align: center;
  margin-bottom: 10px;
  font-style: italic;
  color: #666;
  width: 100%; /* Ensure the figcaption takes full width */
  display: block; /* Make it a block element */
}

html, body {
  height: 100%;
  overflow-y: auto;
  -webkit-overflow-scrolling: touch; /* Smoother scrolling on iOS */
}

.main-container {
  -webkit-overflow-scrolling: touch;
}

body {
  -webkit-text-size-adjust: 100%;
}

/* Changelog styles */

.changelog-container {
  margin-top: 20px;
  font-size: 14px;
}

.changelog-header {
  cursor: pointer;
  display: flex;
  align-items: center;
  gap: 12px; /* Space between arrow and label */
  padding: 8px;
  background-color: #f5f5f5;
  border-radius: 4px;
}

.changelog-header:hover {
  background-color: #ebebeb;
}

.changelog-arrow {
  font-size: 12px;
  color: #666;
}

.changelog-label {
  font-weight: 600;
  color: #333;
}

.changelog-content {
  margin-top: 8px;
  padding: 12px;
  background-color: #fff;
  border: 1px solid #eee;
  border-radius: 4px;
  line-height: 1.4;
  white-space: pre-line;
}
app/webpack.config.js ADDED
@@ -0,0 +1,111 @@
const path = require("path");
const { CleanWebpackPlugin } = require("clean-webpack-plugin");
const CopyPlugin = require("copy-webpack-plugin");
// Imported for optional bundle inspection; not enabled in the config below.
const BundleAnalyzerPlugin = require("webpack-bundle-analyzer").BundleAnalyzerPlugin;

const COLOR_KEYS = ["color", "bgColor", "fillcolor"];

// Rewrites named colors (values like "<name> <opacity>") in JSON files to
// concrete color values via getNamedColor. Note: defined here but not
// referenced in the exported config below.
const transformDataColors = async (data, filePath) => {
  const { getNamedColor } = await import('./src/colors.mjs');
  // If not a JSON file, return the data unchanged
  if (!filePath.endsWith(".json")) {
    return data;
  }
  const parsedData = JSON.parse(data);
  // Change the color of the data
  const deepIterateAndSetColor = (key, val) => {
    if (val === null) {
      return null;
    }
    if (val === undefined) {
      return undefined;
    }
    if (Array.isArray(val)) {
      return val.map(item => deepIterateAndSetColor(key, item));
    }
    if (typeof val === "object") {
      return Object.entries(val).reduce((newObj, [childKey, childVal]) => {
        newObj[childKey] = deepIterateAndSetColor(childKey, childVal);
        return newObj;
      }, {});
    }
    if (COLOR_KEYS.includes(key)) {
      const [colorName, opacity, ...rest] = val.trim().split(/\s+/);
      const floatOpacity = parseFloat(opacity);
      const newColor = getNamedColor(colorName, floatOpacity);
      if (newColor !== undefined && rest.length === 0 && !isNaN(floatOpacity)) {
        console.log(`key: ${key} in file ${filePath} changed from ${val} to ${newColor}`);
        return newColor;
      } else {
        return val;
      }
    }
    return val;
  };
  return JSON.stringify(deepIterateAndSetColor(undefined, parsedData));
};

module.exports = {
  entry: {
    distill: "./src/distill.js",
    main: "./src/index.js",
  },
  output: {
    filename: "[name].bundle.js",
    path: path.resolve(__dirname, "dist"),
  },
  module: {
    rules: [
      {
        test: /\.(js|mjs)$/,
        exclude: /node_modules/,
        use: {
          loader: "babel-loader",
          options: {
            presets: ["@babel/preset-env"],
          },
        },
      },
      {
        test: /\.css$/,
        use: ['style-loader', 'css-loader'],
      },
    ],
  },
  plugins: [
    new CleanWebpackPlugin(),
    new CopyPlugin({
      patterns: [
        {
          from: "assets",
          to: "assets",
        },
        { from: "src/style.css", to: "style.css" },
        { from: "src/bibliography.bib", to: "bibliography.bib" },
        { from: "src/index.html", to: "index.html" },
        {
          from: "../analysis/data",
          to: "data",
          globOptions: {
            ignore: ["**/*.json"],
          },
        },
      ],
    }),
  ],
  devtool: process.env.NODE_ENV === 'production' ? 'source-map' : 'eval-source-map',
  devServer: {
    static: "./dist",
    open: process.env.NODE_ENV !== 'production',
    hot: process.env.NODE_ENV !== 'production',
    liveReload: process.env.NODE_ENV !== 'production',
    headers: {
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, PATCH, OPTIONS",
      "Access-Control-Allow-Headers": "X-Requested-With, content-type, Authorization"
    }
  },
  mode: process.env.NODE_ENV === 'production' ? 'production' : 'development',
};

// Log the active build mode when the config is loaded
console.log(process.env.NODE_ENV);
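transformDataColors has the shape of a copy-webpack-plugin transform hook (content, path), though nothing in this config wires it up. A minimal sketch of calling it directly; the "blue 0.5" input value and the exact output of getNamedColor are assumptions, since src/colors.mjs is not part of this excerpt:

// Sketch only: feed the transform a JSON payload by hand. Assumes
// getNamedColor('blue', 0.5) resolves to a concrete color string in
// src/colors.mjs (not shown in this commit).
const input = JSON.stringify({ line: { color: "blue 0.5", width: 2 } });
transformDataColors(input, "example.json").then(output => {
  // Only keys listed in COLOR_KEYS ("color", "bgColor", "fillcolor")
  // are rewritten; "width" passes through untouched.
  console.log(output);
});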
app/yarn.lock ADDED
The diff for this file is too large to render. See raw diff
 
nginx.conf ADDED
@@ -0,0 +1,31 @@
worker_processes auto;
pid /tmp/nginx.pid;

events {
    worker_connections 1024;
    use epoll;
    multi_accept on;
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    access_log /tmp/access.log;
    error_log /tmp/error.log;

    sendfile on;
    keepalive_timeout 65;

    server {
        listen 8080;
        server_name localhost;

        root /usr/share/nginx/html;
        index index.html;

        # Single-page-app fallback: serve index.html for unknown paths
        location / {
            try_files $uri $uri/ /index.html;
        }
    }
}