Update curated.py
Browse files

curated.py  CHANGED  +25 -1
@@ -458,6 +458,7 @@ filtering_process = Div(
         ),
     ),
     Section(
+        Div(
         H3("ArXiv"),
         H4("Download and Extraction"),
         P("All the data was downloaded in its original LaTeX format from ArXiv's official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We attempt to decode the downloaded files as UTF-8, falling back to encoding detection with the chardet library. Pandoc was then used to extract the content of the LaTeX files and save it in markdown format ", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdown files were combined to create jsonl files."),
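The chardet-plus-pandoc step described in the ArXiv hunk above could look roughly like the sketch below; the paths and helper names are illustrative, not taken from curated.py.

import subprocess
from pathlib import Path

import chardet


def normalize_to_utf8(tex_path: Path) -> None:
    """Re-encode a LaTeX file as UTF-8, guessing the source encoding with chardet."""
    raw = tex_path.read_bytes()
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError:
        guess = chardet.detect(raw)["encoding"] or "latin-1"  # fall back to a permissive codec
        text = raw.decode(guess, errors="replace")
    tex_path.write_text(text, encoding="utf-8")


def tex_to_markdown(tex_path: Path, out_dir: Path) -> Path:
    """Run the pandoc command quoted in the hunk above and return the markdown path."""
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{tex_path.stem}.md"
    subprocess.run(
        ["pandoc", "-s", str(tex_path), "-o", str(out_path), "--wrap=none"],
        check=True,
    )
    return out_path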
@@ -474,8 +475,10 @@ filtering_process = Div(
             Li("Local dedup was done with all papers combined."),
         ),
         table_div_arx,
+        ),
     ),
     Section(
+        Div(
         H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
         H4("Download and Extraction"),
         Ol(
@@ -509,8 +512,10 @@ filtering_process = Div(
             Li("This data was part of the papers domain; after local dedup, minhashes were generated and the data was deduped together with all the other datasets."),
         ),
         table_div_s2o,
+        ),
     ),
     Section(
+        Div(
         H3("PubMed - need to update with abstract vs central"),
         H4("Download and Extraction"),
         Ol(
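The "local dedup, then global minhash dedup across all datasets" flow referred to in the Li(...) above can be sketched with the datasketch library; the 3-token shingles and the 0.8 threshold are illustrative assumptions, not values from this repo.

from datasketch import MinHash, MinHashLSH


def doc_minhash(text: str, num_perm: int = 128) -> MinHash:
    """Build a MinHash signature from 3-token shingles of the document."""
    m = MinHash(num_perm=num_perm)
    tokens = text.split()
    for i in range(len(tokens) - 2):
        m.update(" ".join(tokens[i : i + 3]).encode("utf-8"))
    return m


def dedup(docs: dict[str, str], threshold: float = 0.8) -> list[str]:
    """Return the ids of documents kept after near-duplicate removal."""
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    kept = []
    for doc_id, text in docs.items():
        mh = doc_minhash(text)
        if lsh.query(mh):  # near-duplicate of an already-kept document
            continue
        lsh.insert(doc_id, mh)
        kept.append(doc_id)
    return kept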
@@ -538,8 +543,10 @@ filtering_process = Div(
             Li("This data was part of the papers domain; after local dedup, minhashes were generated and the data was deduped together with all the other datasets."),
         ),
         table_div_med,
+        ),
     ),
     Section(
+        Div(
         H3("Phil Papers"),
         H4("Download and Extraction"),
         P("Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After conversion to text, the language was detected and added using the langdetect library (citation needed)."),
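A minimal sketch of the Phil Papers step above (pdfbox ExtractText via java, then langdetect), assuming the jar path quoted in the hunk; the record fields are illustrative.

import subprocess
from pathlib import Path

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

PDFBOX_JAR = "../philpapers_resources/src/pdfbox-app-2.0.21.jar"  # path quoted in the hunk above


def pdf_to_record(pdf_path: Path, txt_path: Path) -> dict:
    """Extract text from one PDF with pdfbox and attach a detected language code."""
    subprocess.run(
        ["java", "-jar", PDFBOX_JAR, "ExtractText", str(pdf_path), str(txt_path)],
        check=True,
    )
    text = txt_path.read_text(encoding="utf-8", errors="replace")
    try:
        language = detect(text)
    except LangDetectException:  # empty or undetectable text
        language = "unknown"
    return {"text": text, "language": language}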
@@ -552,8 +559,10 @@ filtering_process = Div(
             Li("Local dedup was done with all papers combined."),
         ),
         table_div_phil,
+        ),
     ),
     Section(
+        Div(
         H3("Europarl"),
         H4("Download and Extraction"),
         P("The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to jsonl lines for filtering."),
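The Europarl conversion is essentially "untar, then write one JSON object per file"; a minimal sketch, where the *.txt glob and the field names are assumptions rather than the repo's schema.

import json
import tarfile
from pathlib import Path


def europarl_to_jsonl(tgz_path: str, out_path: str) -> None:
    """Unpack europarl.tgz and emit one jsonl record per extracted text file."""
    extract_dir = Path("europarl_raw")
    with tarfile.open(tgz_path, "r:gz") as tar:
        tar.extractall(extract_dir)
    with open(out_path, "w", encoding="utf-8") as out:
        for txt_file in sorted(extract_dir.rglob("*.txt")):
            record = {
                "filename": txt_file.name,
                "text": txt_file.read_text(encoding="utf-8", errors="replace"),
            }
            out.write(json.dumps(record, ensure_ascii=False) + "\n")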
@@ -565,7 +574,9 @@ filtering_process = Div(
         ),
         table_div_up,
     ),
+        ),
     Section(
+        Div(
         H3("HackerNews"),
         H4("Download and Extraction"),
         P("The dataset was downloaded from the HackerNews API here: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for each Story ID was pinged; if the ID returned an error, it was removed. Each request was given a 2 second wait to account for network time."),
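A sketch of the HackerNews crawl described above: iterate Story IDs, request the item URL, drop IDs that return an error, and wait 2 seconds between requests. The base URL and ID range come from the text; everything else is illustrative.

import time

import requests

BASE_URL = "https://hacker-news.firebaseio.com/v0/item/"


def fetch_items(start_id: int = 1, end_id: int = 37_500_000) -> list[dict]:
    """Fetch each item by Story ID, skipping IDs that return an error."""
    items = []
    for story_id in range(start_id, end_id + 1):
        try:
            resp = requests.get(f"{BASE_URL}{story_id}.json", timeout=10)
            resp.raise_for_status()
            data = resp.json()
            if data is not None:  # the API returns null for missing items
                items.append(data)
        except requests.RequestException:
            pass  # this ID returned an error, so it is dropped
        time.sleep(2)  # 2 second wait per request, as described above
    return items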
@@ -581,8 +592,10 @@ filtering_process = Div(
             Li("Local dedup was done within hackernews itself"),
         ),
         table_div_hn,
+        ),
     ),
     Section(
+        Div(
         H3("USPTO"),
         H4("Download and Extraction"),
         P("Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"), ". There were three different formats that needed three different functions to download and extract the data based on year: pre_2002, 2002_to_2004, and post_2004."),
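The USPTO note mentions three year-dependent formats handled by three functions; the sketch below shows only the dispatch, with hypothetical parser stubs standing in for the real extraction code.

def parse_pre_2002(raw):
    ...  # hypothetical parser for the pre-2002 format


def parse_2002_to_2004(raw):
    ...  # hypothetical parser for the 2002-2004 format


def parse_post_2004(raw):
    ...  # hypothetical parser for the post-2004 format


def parse_grant(raw, year: int):
    """Pick the extraction routine based on the grant year."""
    if year < 2002:
        return parse_pre_2002(raw)
    if year <= 2004:
        return parse_2002_to_2004(raw)
    return parse_post_2004(raw)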
@@ -597,8 +610,10 @@ filtering_process = Div(
             Li("Local dedup was done within USPTO itself"),
         ),
         table_div_uspto,
+        ),
     ),
     Section(
+        Div(
         H3("FreeLaw"),
         H4("Download and Extraction"),
         #P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), )#". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
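The commented-out FreeLaw note describes pulling HTML out of several CSV columns with html2text; a minimal sketch of that idea, where the column names are assumptions about the CourtListener CSVs rather than values taken from this repo.

import csv

import html2text

converter = html2text.HTML2Text()
converter.ignore_links = True

# Columns that may hold the opinion body (illustrative names only).
TEXT_COLUMNS = ["html", "html_with_citations", "plain_text"]


def extract_rows(csv_path: str) -> list[str]:
    """Take the first non-empty text column per row and strip HTML markup."""
    texts = []
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            for column in TEXT_COLUMNS:
                value = (row.get(column) or "").strip()
                if value:
                    texts.append(converter.handle(value))
                    break
    return texts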
@@ -623,8 +638,10 @@ filtering_process = Div(
             Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
         ),
         table_div_freelaw,
+        ),
     ),
     Section(
+        Div(
         H3("StackExchange"),
         H4("Download and Extraction"),
         P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs, including: ", A("math.stackexchange.com", href="math.stackexchange.com"), ". Raw data was extracted in XML format, and only two files, Posts.xml and Comments.xml, were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
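A sketch of the Posts.xml / Comments.xml linking described above, using ElementTree; the attribute names follow the public StackExchange dump schema, and the grouping into question threads is an assumption about this repo's approach.

import xml.etree.ElementTree as ET


def build_threads(posts_xml: str, comments_xml: str) -> dict[str, dict]:
    """Group answers and comments under their question, keyed by question id."""
    threads: dict[str, dict] = {}
    answer_parent: dict[str, str] = {}  # answer id -> question id

    for _, row in ET.iterparse(posts_xml):
        if row.tag != "row":
            continue
        post_id = row.attrib["Id"]
        if row.attrib.get("PostTypeId") == "1":    # question
            threads[post_id] = {"question": row.attrib.get("Body", ""),
                                "answers": [], "comments": []}
        elif row.attrib.get("PostTypeId") == "2":  # answer
            parent = row.attrib.get("ParentId")
            answer_parent[post_id] = parent
            if parent in threads:
                threads[parent]["answers"].append(row.attrib.get("Body", ""))
        row.clear()

    for _, row in ET.iterparse(comments_xml):
        if row.tag != "row":
            continue
        target = row.attrib.get("PostId")
        question_id = answer_parent.get(target, target)  # comment on an answer or on the question
        if question_id in threads:
            threads[question_id]["comments"].append(row.attrib.get("Text", ""))
        row.clear()

    return threads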
@@ -648,8 +665,10 @@ filtering_process = Div(
             Li("Local dedup was done within stackexchange itself"),
         ),
         table_div_se,
+        ),
     ),
     Section(
+        Div(
         H3("Ubuntu IRC"),
         H4("Download and Extraction"),
         P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
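The Ubuntu IRC URL quoted above is a date-formatted template; the sketch below walks a date range and fetches one channel log per day. The %23{channel}.txt filename pattern and the channel name are assumptions, not taken from curated.py.

from datetime import date, timedelta

import requests


def download_channel_logs(channel: str, start: date, end: date) -> dict[str, str]:
    """Fetch one day of IRC logs per date from irclogs.ubuntu.com."""
    logs = {}
    day = start
    while day <= end:
        url = (f"https://irclogs.ubuntu.com/"
               f"{day.year}/{day.month:02d}/{day.day:02d}/%23{channel}.txt")
        resp = requests.get(url, timeout=30)
        if resp.ok:  # some days simply have no log file
            logs[day.isoformat()] = resp.text
        day += timedelta(days=1)
    return logs


# Example: logs = download_channel_logs("ubuntu", date(2010, 1, 1), date(2010, 1, 7))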
@@ -675,9 +694,11 @@ filtering_process = Div(
             Li("Local dedup was done within Ubuntu IRC itself"),
         ),
         table_div_uirc,
+        ),
     ),
     Section(
-
+        Div(
+        H3("DM Math"),
         H4("Download and Extraction"),
         P("The dataset was downloaded directly from the Hugging Face repo: ", A("https://huggingface.co/datasets/deepmind/math_dataset", href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where each line is represented as:"),
         D_code("""
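A minimal sketch of the DM Math conversion: load one config of the Hugging Face dataset and rewrite it as jsonl. The config name and the question/answer field names are assumptions for illustration, not the repo's exact schema, and loading assumes a datasets version that can run this script-based dataset.

import json

from datasets import load_dataset


def dm_math_to_jsonl(config: str, out_path: str) -> None:
    """Write each question/answer pair of one DM Math config as a jsonl line."""
    ds = load_dataset("deepmind/math_dataset", config, split="train")
    with open(out_path, "w", encoding="utf-8") as out:
        for example in ds:
            line = {"question": example["question"], "answer": example["answer"]}
            out.write(json.dumps(line, ensure_ascii=False) + "\n")


# Example: dm_math_to_jsonl("algebra__linear_1d", "dm_math_algebra__linear_1d.jsonl")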
@@ -692,8 +713,10 @@ filtering_process = Div(
             Li("None"),
         ),
         table_div_dmm,
+        ),
     ),
     Section(
+        Div(
         H3("PG19"),
         H4("Download and Extraction"),
         Ol(
@@ -710,6 +733,7 @@ filtering_process = Div(
             Li("Local dedup was done within PG19 itself"),
         ),
         table_div_pg19,
+        ),
     ),
 )