Spaces:

katsukiai
/

Trains

Sleeping

App Files Files Community

katsukiai committed on Mar 5

Commit

8a4e856

verified ·

1 Parent(s): 37323c0

Create app.py

Browse files

Files changed (1) hide show

app.py +155 -0

app.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import streamlit as st
+from huggingface_hub import login, HfApi, snapshot_download
+from datasets import load_dataset
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+from transformers import BertTokenizer, BertForSequenceClassification
+import pandas as pd
+import os
+# Streamlit app configuration
+st.set_page_config(page_title="Katsukiai Dataset Trainer", layout="wide")
+# Sidebar for navigation
+st.sidebar.title("Navigation")
+tabs = ["Train", "Train with DeepSeek-V3", "Select Dataset and Format", "About", "Settings"]
+selected_tab = st.sidebar.radio("Select Tab", tabs)
+# Settings state
+if "settings" not in st.session_state:
+    st.session_state.settings = {
+        "token": "",
+        "username": "",
+        "use_torch": False,
+        "use_bert": False
+    }
+# Functions
+def load_katsukiai_dataset(dataset_name):
+    return load_dataset(f"Katsukiai/{dataset_name}", token=st.session_state.settings["token"] if st.session_state.settings["token"] else None)
+def train_with_bert(dataset, model_name="bert-base-uncased"):
+    tokenizer = BertTokenizer.from_pretrained(model_name)
+    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
+    def tokenize_function(examples):
+        return tokenizer(examples["text"], padding="max_length", truncation=True)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
+    training_args = TrainingArguments(
+        output_dir=f"./converted/results_{st.session_state.settings['username']}",
+        num_train_epochs=3,
+        per_device_train_batch_size=8,
+        save_steps=10_000,
+        save_total_limit=2,
+    )
+KILL    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset["train"],
+        eval_dataset=tokenized_dataset["test"]
+    )
+    trainer.train()
+    return "BERT Training Complete"
+def train_with_deepseek(dataset, model_name="deepseek-ai/DeepSeek-V3"):
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+    def tokenize_function(examples):
+        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
+    training_args = TrainingArguments(
+        output_dir=f"./deepseek/results_{st.session_state.settings['username']}",
+        num_train_epochs=3,
+        per_device_train_batch_size=4,
+        gradient_accumulation_steps=2,
+        save_steps=10_000,
+        save_total_limit=2,
+        fp16=True  # Mixed precision for efficiency
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset["train"],
+    )
+    trainer.train()
+    return "DeepSeek-V3 Training Complete"
+# Tab content
+if selected_tab == "Train":
+    st.title("Train Katsukiai Dataset")
+    api = HfApi()
+    datasets_list = [d.id.split("/")[-1] for d in api.list_datasets(author="Katsukiai")]
+    dataset_name = st.selectbox("Select Dataset", datasets_list)
+    if st.button("Start Training"):
+        dataset = load_katsukiai_dataset(dataset_name)
+        if st.session_state.settings["use_bert"]:
+            result = train_with_bert(dataset)
+            st.success(result)
+        elif st.session_state.settings["use_torch"]:
+            st.write("Training with Torch (custom implementation required)")
+        else:
+            st.write("Basic training (no specific model selected)")
+elif selected_tab == "Train with DeepSeek-V3":
+    st.title("Train with DeepSeek-V3")
+    dataset_name = st.selectbox("Select Dataset", [d.id.split("/")[-1] for d in api.list_datasets(author="Katsukiai")])
+    if st.button("Train with DeepSeek"):
+        if st.session_state.settings["token"]:
+            login(st.session_state.settings["token"])
+            dataset = load_katsukiai_dataset(dataset_name)
+            result = train_with_deepseek(dataset)
+            st.success(result)
+        else:
+            st.error("Please set Hugging Face token in Settings")
+elif selected_tab == "Select Dataset and Format":
+    st.title("Select Dataset and Format")
+    api = HfApi()
+    datasets_list = [d.id.split("/")[-1] for d in api.list_datasets(author="katsukiai")]
+    dataset_name = st.selectbox("Select Dataset", datasets_list)
+    format_option = st.selectbox("Select Format", ["csv", "json", "parquet"])
+    if st.button("Load Dataset"):
+        dataset = load_katsukiai_dataset(dataset_name)
+        df = pd.DataFrame(dataset["train"])
+        if format_option == "csv":
+            st.download_button("Download CSV", df.to_csv(index=False), "dataset.csv")
+        elif format_option == "json":
+            st.download_button("Download JSON", df.to_json(), "dataset.json")
+        else:
+            st.download_button("Download Parquet", df.to_parquet(), "dataset.parquet")
+elif selected_tab == "About":
+    st.title("About")
+    st.write("This app trains models on Katsukiai datasets from Hugging Face.")
+    st.write("Features:")
+    st.write("- Train with BERT or custom Torch models")
+    st.write("- Train using DeepSeek-V3 from Hugging Face")
+    st.write("- Dataset selection and format conversion")
+    st.write("Built with Streamlit, Hugging Face Hub, and PyTorch.")
+elif selected_tab == "Settings":
+    st.title("Settings")
+    token = st.text_input("Hugging Face Token", value=st.session_state.settings["token"])
+    username = st.text_input("Username (for output folder)", value=st.session_state.settings["username"])
+    use_torch = st.checkbox("Use Torch", value=st.session_state.settings["use_torch"])
+    use_bert = st.checkbox("Use BERT & Tokenizer", value=st.session_state.settings["use_bert"])
+    if st.button("Save Settings"):
+        st.session_state.settings.update({
+            "token": token,
+            "username": username,
+            "use_torch": use_torch,
+            "use_bert": use_bert
+        })
+        if username and not os.path.exists(f"./results_{username}"):
+            os.makedirs(f"./results_{username}")
+        st.success("Settings saved!")