# Spaces: Running / Running  (web-page header captured with this file; not part of the module)
# Tokenization/label_tokens.py
# Domain tags: map a domain name to the bracketed tag that marks it.
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    # NOTE(review): "[GEN]" is also the "general" routing tag in ROUTING_TAGS,
    # so the two are indistinguishable once concatenated -- confirm intended.
    "education": "[GEN]",
}

# Task tags
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Section tags (for further granularity, e.g., for long-context or future models)
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    # NOTE(review): "method" and "experiment" reuse the TASK_TAGS values, so a
    # section tag and a task tag can be identical -- confirm intended.
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing tags
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}

# Token/word limits for validation and filtering
MIN_WORDS = 8
MAX_TOKENS = 1024
# 327,680,000 == 320,000 * 1,024 (about 327.7M tokens). This is slightly above
# the 325M "default" entry in TOKEN_TARGETS; the value is kept as-is.
MAX_TOTAL_TOKENS = 327_680_000

# Token targets for different corpus types
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}
def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None,
) -> str:
    """
    Build the concatenated tag prefix for a sample.

    Tags are emitted in a fixed order -- domain, task, section, routing --
    with no separator, e.g. ``[PHYS][HYP][GEN]`` or
    ``[BIO][MTH][SPEC:Genomics]``.

    Args:
        domain: Key into ``DOMAIN_TAGS``; skipped when not a known key.
        task: Optional key into ``TASK_TAGS``; skipped when ``None`` or unknown.
        section: Optional key into ``SECTION_TAGS``; skipped when ``None`` or
            unknown.
        routing: ``"general"`` appends the general routing tag; ``"specific"``
            appends ``[SPEC:<subdomain>]`` when ``subdomain`` is truthy and
            the plain specific routing tag otherwise. Any other value adds no
            routing tag.
        subdomain: Free-form label embedded in the ``[SPEC:...]`` tag; only
            used when ``routing == "specific"``.

    Returns:
        The tags joined into one string; empty when nothing matched.
    """
    # Look each key up in its table, in output order; unknown or None keys
    # contribute nothing.
    lookups = (
        (DOMAIN_TAGS, domain),
        (TASK_TAGS, task),
        (SECTION_TAGS, section),
    )
    tags = [table[key] for table, key in lookups if key in table]

    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific":
        if subdomain:
            tags.append(f"[SPEC:{subdomain}]")
        else:
            # Without a subdomain, fall back to the bare specific tag so a
            # "specific" sample is never left without a routing tag.
            tags.append(ROUTING_TAGS["specific"])
    return "".join(tags)