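"""Streamlit app for browsing Katsukiai datasets on the Hugging Face Hub,
exporting them in common formats, and running small fine-tuning jobs with
BERT or DeepSeek-V3."""
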
import os

import pandas as pd
import streamlit as st
import torch
from datasets import load_dataset
from huggingface_hub import HfApi, login, snapshot_download
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

st.set_page_config(page_title="Katsukiai Dataset Trainer", layout="wide")

st.sidebar.title("Navigation") |
|
tabs = ["Train", "Train with DeepSeek-V3", "Select Dataset and Format", "About", "Settings"] |
|
selected_tab = st.sidebar.radio("Select Tab", tabs) |
|
|
|
|
|
if "settings" not in st.session_state: |
|
st.session_state.settings = { |
|
"token": "", |
|
"username": "", |
|
"use_torch": False, |
|
"use_bert": False |
|
} |
|
|
|
|
|
def load_katsukiai_dataset(dataset_name):
    # Pass the saved token only when one is configured, so public datasets still load.
    token = st.session_state.settings["token"] or None
    return load_dataset(f"Katsukiai/{dataset_name}", token=token)


def train_with_bert(dataset, model_name="bert-base-uncased"):
    # Assumes the dataset provides "text" and "labels" columns and a "train" split.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    training_args = TrainingArguments(
        output_dir=f"./converted/results_{st.session_state.settings['username']}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        # Evaluate only when the dataset actually ships a "test" split.
        eval_dataset=tokenized_dataset["test"] if "test" in tokenized_dataset else None,
    )
    trainer.train()
    return "BERT Training Complete"


def train_with_deepseek(dataset, model_name="deepseek-ai/DeepSeek-V3"):
    # Assumes the dataset provides a "text" column and a "train" split.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal LM tokenizers often ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

    training_args = TrainingArguments(
        output_dir=f"./deepseek/results_{st.session_state.settings['username']}",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        # The collator copies input_ids into labels so the causal LM can compute a loss.
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    trainer.train()
    return "DeepSeek-V3 Training Complete"


if selected_tab == "Train": |
|
st.title("Train Katsukiai Dataset") |
|
api = HfApi() |
|
datasets_list = [d.id.split("/")[-1] for d in api.list_datasets(author="katsukiai")] |
|
dataset_name = st.selectbox("Select Dataset", datasets_list) |
|
if st.button("Start Training"): |
|
dataset = load_katsukiai_dataset(dataset_name) |
|
if st.session_state.settings["use_bert"]: |
|
result = train_with_bert(dataset) |
|
st.success(result) |
|
elif st.session_state.settings["use_torch"]: |
|
st.write("Training with Torch (custom implementation required)") |
|
else: |
|
st.write("Basic training (no specific model selected)") |
|
|
|
elif selected_tab == "Train with DeepSeek-V3": |
|
st.title("Train with DeepSeek-V3") |
|
dataset_name = st.selectbox("Select Dataset", [d.id.split("/")[-1] for d in api.list_datasets(author="katsukiai")]) |
|
if st.button("Train with DeepSeek"): |
|
if st.session_state.settings["token"]: |
|
login(st.session_state.settings["token"]) |
|
dataset = load_katsukiai_dataset(dataset_name) |
|
result = train_with_deepseek(dataset) |
|
st.success(result) |
|
else: |
|
st.error("Please set Hugging Face token in Settings") |
|
|
|
elif selected_tab == "Select Dataset and Format": |
|
st.title("Select Dataset and Format") |
|
api = HfApi() |
|
datasets_list = [d.id.split("/")[-1] for d in api.list_datasets(author="katsukiai")] |
|
dataset_name = st.selectbox("Select Dataset", datasets_list) |
|
format_option = st.selectbox("Select Format", ["csv", "json", "parquet"]) |
|
if st.button("Load Dataset"): |
|
dataset = load_katsukiai_dataset(dataset_name) |
|
df = pd.DataFrame(dataset["train"]) |
|
if format_option == "csv": |
|
st.download_button("Download CSV", df.to_csv(index=False), "dataset.csv") |
|
elif format_option == "json": |
|
st.download_button("Download JSON", df.to_json(), "dataset.json") |
|
else: |
|
st.download_button("Download Parquet", df.to_parquet(), "dataset.parquet") |
|
|
|
elif selected_tab == "About": |
|
st.title("About") |
|
st.write("This app trains models on Katsukiai datasets from Hugging Face.") |
|
st.write("Features:") |
|
st.write("- Train with BERT or custom Torch models") |
|
st.write("- Train using DeepSeek-V3 from Hugging Face") |
|
st.write("- Dataset selection and format conversion") |
|
st.write("Built with Streamlit, Hugging Face Hub, and PyTorch.") |
|
|
|
elif selected_tab == "Settings": |
|
st.title("Settings") |
|
token = st.text_input("Hugging Face Token", value=st.session_state.settings["token"]) |
|
username = st.text_input("Username (for output folder)", value=st.session_state.settings["username"]) |
|
use_torch = st.checkbox("Use Torch", value=st.session_state.settings["use_torch"]) |
|
use_bert = st.checkbox("Use BERT & Tokenizer", value=st.session_state.settings["use_bert"]) |
|
|
|
if st.button("Save Settings"): |
|
st.session_state.settings.update({ |
|
"token": token, |
|
"username": username, |
|
"use_torch": use_torch, |
|
"use_bert": use_bert |
|
}) |
|
if username and not os.path.exists(f"./converted/results_{username}"): |
|
os.makedirs(f"./converted/results_{username}") |
|
st.success("Settings saved!") |