Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -178,7 +178,7 @@ def main():
|
|
| 178 |
new_dataset_comparison1 = pd.DataFrame(
|
| 179 |
{
|
| 180 |
"Data Source": [
|
| 181 |
-
"CommonCrawl",
|
| 182 |
"Papers",
|
| 183 |
"Wikipedia",
|
| 184 |
"FreeLaw",
|
|
@@ -193,7 +193,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 193 |
|
| 194 |
],
|
| 195 |
"TxT360": [
|
| 196 |
-
"99
|
| 197 |
"5 Sources",
|
| 198 |
"310+ Languages",
|
| 199 |
"Included",
|
|
@@ -207,7 +207,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 207 |
"**",
|
| 208 |
],
|
| 209 |
"FineWeb": [
|
| 210 |
-
"96
|
| 211 |
"-",
|
| 212 |
"-",
|
| 213 |
"-",
|
|
@@ -221,7 +221,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 221 |
"-",
|
| 222 |
],
|
| 223 |
"RefinedWeb": [
|
| 224 |
-
"90
|
| 225 |
"-",
|
| 226 |
"-",
|
| 227 |
"-",
|
|
@@ -234,8 +234,8 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 234 |
"-",
|
| 235 |
"-",
|
| 236 |
],
|
| 237 |
-
"
|
| 238 |
-
"84
|
| 239 |
"-",
|
| 240 |
"-",
|
| 241 |
"-",
|
|
@@ -249,7 +249,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 249 |
"-",
|
| 250 |
],
|
| 251 |
"C4": [
|
| 252 |
-
"1
|
| 253 |
"-",
|
| 254 |
"-",
|
| 255 |
"-",
|
|
@@ -263,7 +263,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 263 |
"-",
|
| 264 |
],
|
| 265 |
"Dolma": [
|
| 266 |
-
"24
|
| 267 |
"1 Source",
|
| 268 |
"checkmark",
|
| 269 |
"-",
|
|
@@ -276,8 +276,8 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 276 |
"-",
|
| 277 |
"Included",
|
| 278 |
],
|
| 279 |
-
"
|
| 280 |
-
"5
|
| 281 |
"1 Source",
|
| 282 |
"checkmark",
|
| 283 |
"",
|
|
@@ -291,7 +291,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
| 291 |
"Included",
|
| 292 |
],
|
| 293 |
"The Pile": [
|
| 294 |
-
"0.6% of 74
|
| 295 |
"4 Sources",
|
| 296 |
"English Only",
|
| 297 |
"Included",
|
|
@@ -636,8 +636,8 @@ def intro():
|
|
| 636 |
"TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
|
| 637 |
),
|
| 638 |
new_table_div_1,
|
| 639 |
-
table_div_1,
|
| 640 |
-
table_div_2,
|
| 641 |
P(
|
| 642 |
"In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
|
| 643 |
),
|
|
|
|
| 178 |
new_dataset_comparison1 = pd.DataFrame(
|
| 179 |
{
|
| 180 |
"Data Source": [
|
| 181 |
+
"CommonCrawl Snapshots",
|
| 182 |
"Papers",
|
| 183 |
"Wikipedia",
|
| 184 |
"FreeLaw",
|
|
|
|
| 193 |
|
| 194 |
],
|
| 195 |
"TxT360": [
|
| 196 |
+
"99",
|
| 197 |
"5 Sources",
|
| 198 |
"310+ Languages",
|
| 199 |
"Included",
|
|
|
|
| 207 |
"**",
|
| 208 |
],
|
| 209 |
"FineWeb": [
|
| 210 |
+
"96",
|
| 211 |
"-",
|
| 212 |
"-",
|
| 213 |
"-",
|
|
|
|
| 221 |
"-",
|
| 222 |
],
|
| 223 |
"RefinedWeb": [
|
| 224 |
+
"90",
|
| 225 |
"-",
|
| 226 |
"-",
|
| 227 |
"-",
|
|
|
|
| 234 |
"-",
|
| 235 |
"-",
|
| 236 |
],
|
| 237 |
+
"RedPajamaV2": [
|
| 238 |
+
"84",
|
| 239 |
"-",
|
| 240 |
"-",
|
| 241 |
"-",
|
|
|
|
| 249 |
"-",
|
| 250 |
],
|
| 251 |
"C4": [
|
| 252 |
+
"1",
|
| 253 |
"-",
|
| 254 |
"-",
|
| 255 |
"-",
|
|
|
|
| 263 |
"-",
|
| 264 |
],
|
| 265 |
"Dolma": [
|
| 266 |
+
"24",
|
| 267 |
"1 Source",
|
| 268 |
"checkmark",
|
| 269 |
"-",
|
|
|
|
| 276 |
"-",
|
| 277 |
"Included",
|
| 278 |
],
|
| 279 |
+
"RedPajamaV1": [
|
| 280 |
+
"5",
|
| 281 |
"1 Source",
|
| 282 |
"checkmark",
|
| 283 |
"",
|
|
|
|
| 291 |
"Included",
|
| 292 |
],
|
| 293 |
"The Pile": [
|
| 294 |
+
"0.6% of 74",
|
| 295 |
"4 Sources",
|
| 296 |
"English Only",
|
| 297 |
"Included",
|
|
|
|
| 636 |
"TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
|
| 637 |
),
|
| 638 |
new_table_div_1,
|
| 639 |
+
#table_div_1,
|
| 640 |
+
#table_div_2,
|
| 641 |
P(
|
| 642 |
"In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
|
| 643 |
),
|