davanstrien HF staff committed on
Commit
bae8f11
1 Parent(s): b5f94b5
Files changed (1) hide show
  1. prep_viewer_data.py +3 -0
prep_viewer_data.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import list_datasets
7
  from tqdm import tqdm
8
  from tqdm.asyncio import tqdm_asyncio
9
 
 
10
  # Initialize the HTTP client
11
  client = httpx.AsyncClient(timeout=60, http2=True)
12
 
@@ -115,6 +116,8 @@ async def prep_data(sample_size=200_000, min_likes=1):
115
  df = pl.read_parquet(
116
  "hf://datasets/davanstrien/dataset-viewer-descriptions-processed/data/train-00000-of-00001.parquet"
117
  )
 
 
118
  in_train_or_test = set(df["dataset_id"].unique().to_list())
119
 
120
  # Get all datasets
 
7
  from tqdm import tqdm
8
  from tqdm.asyncio import tqdm_asyncio
9
 
10
+
11
  # Initialize the HTTP client
12
  client = httpx.AsyncClient(timeout=60, http2=True)
13
 
 
116
  df = pl.read_parquet(
117
  "hf://datasets/davanstrien/dataset-viewer-descriptions-processed/data/train-00000-of-00001.parquet"
118
  )
119
+ # Remove datasets that are already in the train or test set; we can remove this later once the model works okay.
120
+
121
  in_train_or_test = set(df["dataset_id"].unique().to_list())
122
 
123
  # Get all datasets