# Nexa_Data_Studio / Tokenization / Label_tokens.py
# Repository page metadata: Allanatrix, "Upload 50 files", commit ef4c8c3 (verified)
# Tokenization/label_tokens.py
# ---------------------------------------------------------------------------
# Tag vocabularies: each table maps a lowercase key to its bracketed token.
# ---------------------------------------------------------------------------

# Domain of the sample.
# NOTE(review): "education" shares the "[GEN]" token with the "general"
# routing tag below -- confirm this overlap is intended.
DOMAIN_TAGS = {
    "physics": "[PHYS]",
    "biology": "[BIO]",
    "materials": "[MAT]",
    "education": "[GEN]",
}

# Kind of task the sample represents.
TASK_TAGS = {
    "hypothesis": "[HYP]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Paper section the sample comes from (finer granularity, e.g. for
# long-context or future models). The "method" and "experiment" entries use
# the same tokens as the corresponding TASK_TAGS entries.
SECTION_TAGS = {
    "abstract": "[ABSTRACT]",
    "introduction": "[INTRO]",
    "results": "[RESULTS]",
    "discussion": "[DISCUSSION]",
    "conclusion": "[CONCLUSION]",
    "method": "[MTH]",
    "experiment": "[EXP]",
}

# Routing of the sample: general vs. subdomain-specific.
ROUTING_TAGS = {
    "general": "[GEN]",
    "specific": "[SPEC]",
}
# ---------------------------------------------------------------------------
# Size limits used for validation and filtering.
# ---------------------------------------------------------------------------
MIN_WORDS = 8
MAX_TOKENS = 1024
# 327,680,000 tokens, i.e. 320,000 * 1024 (~327.7M). This is slightly above
# the 325M "default" target listed in TOKEN_TARGETS.
MAX_TOTAL_TOKENS = 327_680_000

# Token budget per corpus type.
# NOTE(review): "default" equals warm_start + scientific (100M + 225M);
# presumably intentional -- confirm.
TOKEN_TARGETS = {
    "warm_start": 100_000_000,
    "scientific": 225_000_000,
    "instruction": 30_000_000,
    "default": 325_000_000,
}
def build_tag_string(
    domain: str,
    task: str = None,
    section: str = None,
    routing: str = "general",
    subdomain: str = None,
) -> str:
    """
    Build the tag prefix for a sample.

    Tags are concatenated without a separator in a fixed order: domain,
    task, section, routing. Examples: [PHYS][HYP][GEN] or
    [BIO][MTH][SPEC:Genomics].

    Args:
        domain: Key into DOMAIN_TAGS. An unknown key contributes no tag.
        task: Key into TASK_TAGS. None or an unknown key contributes no tag.
        section: Key into SECTION_TAGS. None or an unknown key contributes
            no tag.
        routing: "general" emits ROUTING_TAGS["general"]. "specific" emits
            "[SPEC:<subdomain>]" when ``subdomain`` is non-empty, otherwise
            the plain ROUTING_TAGS["specific"] tag. Any other value emits
            no routing tag.
        subdomain: Free-form label embedded in the specific routing tag.
            Only consulted when ``routing == "specific"``.

    Returns:
        The concatenated tag string; empty if no tag applies.
    """
    tags = []

    # Domain, task and section follow the same rule: emit the mapped tag
    # when the key is known, otherwise skip silently.
    lookups = (
        (DOMAIN_TAGS, domain),
        (TASK_TAGS, task),
        (SECTION_TAGS, section),
    )
    for table, key in lookups:
        if key in table:
            tags.append(table[key])

    if routing == "general":
        tags.append(ROUTING_TAGS["general"])
    elif routing == "specific":
        if subdomain:
            tags.append(f"[SPEC:{subdomain}]")
        else:
            # Without a subdomain the sample is still "specific": fall back
            # to the bare tag instead of dropping the routing marker.
            tags.append(ROUTING_TAGS["specific"])

    return "".join(tags)