"""Streamlit app for fine-tuning models on Katsukiai datasets from the Hugging Face Hub.

Tabs: generic training (BERT / Torch), DeepSeek-V3 training, dataset
download/format conversion, an About page, and persistent Settings.
"""
import os

import pandas as pd
import torch
import streamlit as st
from datasets import load_dataset
from huggingface_hub import login, HfApi, snapshot_download
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# Streamlit app configuration
st.set_page_config(page_title="Katsukiai Dataset Trainer", layout="wide")

# Sidebar for navigation
st.sidebar.title("Navigation")
tabs = ["Train", "Train with DeepSeek-V3", "Select Dataset and Format", "About", "Settings"]
selected_tab = st.sidebar.radio("Select Tab", tabs)

# Settings persist across Streamlit reruns via session_state.
if "settings" not in st.session_state:
    st.session_state.settings = {
        "token": "",
        "username": "",
        "use_torch": False,
        "use_bert": False,
    }


# Functions
def list_katsukiai_datasets():
    """Return dataset names published under the 'katsukiai' author on the Hub.

    FIX: centralised here so every tab gets a valid HfApi instance. The
    original code only assigned ``api = HfApi()`` inside the "Train"
    branch, so the "Train with DeepSeek-V3" branch raised NameError when
    it called ``api.list_datasets``.
    """
    api = HfApi()
    return [d.id.split("/")[-1] for d in api.list_datasets(author="katsukiai")]


def load_katsukiai_dataset(dataset_name):
    """Load ``Katsukiai/<dataset_name>``, using the stored HF token if one is set."""
    return load_dataset(
        f"Katsukiai/{dataset_name}",
        token=st.session_state.settings["token"] if st.session_state.settings["token"] else None,
    )


def train_with_bert(dataset, model_name="bert-base-uncased"):
    """Fine-tune a BERT sequence classifier on *dataset*.

    Assumes the dataset has "text" and "labels" columns and a "train"
    split (TODO confirm against the actual Katsukiai datasets).

    Returns a human-readable completion message for the UI.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    training_args = TrainingArguments(
        output_dir=f"./converted/results_{st.session_state.settings['username']}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        # FIX: the original indexed tokenized_dataset["test"] unconditionally,
        # which raises KeyError for datasets with no test split.
        eval_dataset=tokenized_dataset["test"] if "test" in tokenized_dataset else None,
    )
    trainer.train()
    return "BERT Training Complete"


def train_with_deepseek(dataset, model_name="deepseek-ai/DeepSeek-V3"):
    """Fine-tune a causal language model (DeepSeek-V3) on *dataset*.

    Assumes the dataset has a "text" column and a "train" split
    (TODO confirm against the actual Katsukiai datasets).

    Returns a human-readable completion message for the UI.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

    def tokenize_function(examples):
        tokens = tokenizer(
            examples["text"], padding="max_length", truncation=True, max_length=512
        )
        # FIX: causal-LM fine-tuning needs labels or Trainer has no loss to
        # optimize; the standard approach is labels == input_ids (the model
        # shifts them internally).
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    training_args = TrainingArguments(
        output_dir=f"./deepseek/results_{st.session_state.settings['username']}",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        fp16=True,  # Mixed precision for efficiency
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
    )
    trainer.train()
    return "DeepSeek-V3 Training Complete"


# Tab content
if selected_tab == "Train":
    st.title("Train Katsukiai Dataset")
    datasets_list = list_katsukiai_datasets()
    dataset_name = st.selectbox("Select Dataset", datasets_list)
    if st.button("Start Training"):
        dataset = load_katsukiai_dataset(dataset_name)
        if st.session_state.settings["use_bert"]:
            result = train_with_bert(dataset)
            st.success(result)
        elif st.session_state.settings["use_torch"]:
            st.write("Training with Torch (custom implementation required)")
        else:
            st.write("Basic training (no specific model selected)")

elif selected_tab == "Train with DeepSeek-V3":
    st.title("Train with DeepSeek-V3")
    # Uses the shared helper (the original referenced an undefined `api` here).
    dataset_name = st.selectbox("Select Dataset", list_katsukiai_datasets())
    if st.button("Train with DeepSeek"):
        if st.session_state.settings["token"]:
            login(st.session_state.settings["token"])
            dataset = load_katsukiai_dataset(dataset_name)
            result = train_with_deepseek(dataset)
            st.success(result)
        else:
            st.error("Please set Hugging Face token in Settings")

elif selected_tab == "Select Dataset and Format":
    st.title("Select Dataset and Format")
    datasets_list = list_katsukiai_datasets()
    dataset_name = st.selectbox("Select Dataset", datasets_list)
    format_option = st.selectbox("Select Format", ["csv", "json", "parquet"])
    if st.button("Load Dataset"):
        dataset = load_katsukiai_dataset(dataset_name)
        df = pd.DataFrame(dataset["train"])
        if format_option == "csv":
            st.download_button("Download CSV", df.to_csv(index=False), "dataset.csv")
        elif format_option == "json":
            st.download_button("Download JSON", df.to_json(), "dataset.json")
        else:
            st.download_button("Download Parquet", df.to_parquet(), "dataset.parquet")

elif selected_tab == "About":
    st.title("About")
    st.write("This app trains models on Katsukiai datasets from Hugging Face.")
    st.write("Features:")
    st.write("- Train with BERT or custom Torch models")
    st.write("- Train using DeepSeek-V3 from Hugging Face")
    st.write("- Dataset selection and format conversion")
    st.write("Built with Streamlit, Hugging Face Hub, and PyTorch.")

elif selected_tab == "Settings":
    st.title("Settings")
    token = st.text_input("Hugging Face Token", value=st.session_state.settings["token"])
    username = st.text_input(
        "Username (for output folder)", value=st.session_state.settings["username"]
    )
    use_torch = st.checkbox("Use Torch", value=st.session_state.settings["use_torch"])
    use_bert = st.checkbox("Use BERT & Tokenizer", value=st.session_state.settings["use_bert"])
    if st.button("Save Settings"):
        st.session_state.settings.update({
            "token": token,
            "username": username,
            "use_torch": use_torch,
            "use_bert": use_bert,
        })
        if username:
            # exist_ok avoids the check-then-create race of the original
            # os.path.exists + os.makedirs pair.
            os.makedirs(f"./converted/results_{username}", exist_ok=True)
        st.success("Settings saved!")