Spaces:

scikit-learn
/

baseline-trainer

Paused

App Files Files Community

baseline-trainer / app.py

merve HF Staff

Update app.py

c557583 over 3 years ago

raw

history blame

7.65 kB

	import gradio as gr
	import pandas as pd
	from huggingface_hub.hf_api import create_repo, upload_folder, upload_file, HfApi
	from huggingface_hub.repository import Repository
	import subprocess
	import os
	import tempfile
	from uuid import uuid4
	import pickle
	import sweetviz as sv
	import dabl
	import re


	def analyze_datasets(dataset, token, column=None, pairwise="off"):
	    """Build a Sweetviz report for an uploaded CSV and publish it as a static Space.

	    Args:
	        dataset: Uploaded file object; only its ``name`` attribute (the path
	            of the CSV on disk) is read.
	        token: Hugging Face Hub token used to look up the user name and to
	            create and populate the Space.
	        column: Optional target column to compare the other features
	            against. ``None`` or an empty string means "no target".
	        pairwise: Forwarded to Sweetviz as ``pairwise_analysis``; a falsy
	            value falls back to ``"off"``.

	    Returns:
	        A message containing the URL of the created Space.
	    """
	    df = pd.read_csv(dataset.name)
	    username = HfApi().whoami(token=token)["name"]

	    # The UI feeds ``column`` from a text box labelled "(Optional)", which
	    # presumably yields "" rather than None when left blank, so test for
	    # truthiness instead of ``is not None``.
	    pairwise_mode = pairwise or "off"
	    if column:
	        analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise_mode)
	    else:
	        analyze_report = sv.analyze(df, pairwise_analysis=pairwise_mode)

	    # str.strip(".csv") removes any of the characters ".", "c", "s", "v" from
	    # both ends ("cars.csv" -> "ar"); drop the extension properly instead.
	    dataset_name = os.path.splitext(os.path.basename(dataset.name))[0]
	    repo_id = f"{username}/{dataset_name}-report"

	    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

	    # Work in a private temporary directory so that simultaneous requests do
	    # not overwrite each other's index.html / README.md in the working dir.
	    with tempfile.TemporaryDirectory() as workdir:
	        report_path = os.path.join(workdir, "index.html")
	        readme_path = os.path.join(workdir, "README.md")

	        analyze_report.show_html(report_path, open_browser=False)

	        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)

	        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

	        # The README contains an emoji, so the encoding must be explicit.
	        with open(readme_path, "w", encoding="utf-8") as f:
	            f.write(readme)
	        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

	    return f"Your dataset report will be ready at {repo_url}"


	from sklearn.utils import estimator_html_repr


	def extract_estimator_config(model):
	    """Render an estimator's hyperparameters as a two-column Markdown table.

	    Args:
	        model: Any object exposing ``get_params(deep=True)`` that returns a
	            mapping of hyperparameter name to value.

	    Returns:
	        A string holding the table header, the alignment row and one row per
	        hyperparameter, each line terminated by a newline.
	    """
	    # Plain "|" delimiters: "\|" is an invalid escape sequence in a normal
	    # string literal and leaves literal backslashes in the output, which
	    # stops Markdown from rendering the rows as a table.
	    rows = ["| Hyperparameters | Value |", "| :-- | :-- |"]
	    rows.extend(
	        f"| {hyperparameter} | {value} |"
	        for hyperparameter, value in model.get_params(deep=True).items()
	    )
	    return "\n".join(rows) + "\n"

	def detect_training(df, column):
	    """Pick a dabl baseline estimator based on the detected type of ``column``.

	    Args:
	        df: DataFrame containing the target column.
	        column: Name of the target column.

	    Returns:
	        A ``(trainer, task)`` tuple: ``dabl.SimpleRegressor()`` with
	        ``"regression"`` when the column is detected as continuous or dirty
	        float, ``dabl.SimpleClassifier()`` with ``"classification"`` when it
	        is detected as categorical, low-cardinality integer or free string.

	    Raises:
	        KeyError: If ``column`` is not among the detected columns.
	        ValueError: If the detected type maps to neither task.
	    """
	    # Run type detection once and reuse the row for the target column; the
	    # result does not change between the individual checks.
	    column_types = dabl.detect_types(df).loc[column]

	    if column_types["continuous"] or column_types["dirty_float"]:
	        return dabl.SimpleRegressor(), "regression"

	    if column_types["categorical"] or column_types["low_card_int"] or column_types["free_string"]:
	        return dabl.SimpleClassifier(), "classification"

	    # Previously this path fell through to ``return trainer, task`` with both
	    # names unbound; fail with an actionable message instead.
	    raise ValueError(
	        f"Could not determine whether column {column!r} is a regression or "
	        "classification target."
	    )

	def edit_types(df):
	    """Clean ``df`` with dabl, overriding two of its detected column types.

	    Columns detected as low-cardinality integers are hinted as
	    ``"categorical"`` and columns detected as dirty floats are hinted as
	    ``"continuous"`` before the frame is passed to ``dabl.clean``.

	    Args:
	        df: DataFrame to clean.

	    Returns:
	        The result of ``dabl.clean`` called with those type hints.
	    """
	    detected = dabl.detect_types(df)

	    # Map each detection flag to the hint its columns should receive; the
	    # low-cardinality hints are applied first, then the dirty-float ones.
	    overrides = (("low_card_int", "categorical"), ("dirty_float", "continuous"))

	    hints = {}
	    for flag, hint in overrides:
	        flagged = detected.index[detected[flag].eq(True)].tolist()
	        hints.update({name: hint for name in flagged})

	    return dabl.clean(df, type_hints=hints)

	def train_baseline(dataset, token, column):
	    """Train a dabl baseline model on an uploaded CSV and push it to the Hub.

	    The training output, a generated model card (README.md) and the pickled
	    estimator (clf.pkl) are written to a temporary directory, which is then
	    uploaded as a model repository.

	    Args:
	        dataset: Uploaded file object; only its ``name`` attribute (the path
	            of the CSV on disk) is read.
	        token: Hugging Face Hub token used to look up the user name and to
	            create and populate the model repository.
	        column: Name of the target column.

	    Returns:
	        A message containing the URL of the created model repository.
	    """
	    from contextlib import redirect_stdout

	    # Resolve the account first so a bad token fails before any training
	    # time is spent.
	    username = HfApi().whoami(token=token)["name"]

	    df = pd.read_csv(dataset.name)
	    # str.strip(".csv") removes any of the characters ".", "c", "s", "v" from
	    # both ends ("cars.csv" -> "ar"); drop the extension properly instead.
	    dataset_name = os.path.splitext(os.path.basename(dataset.name))[0]

	    df_clean = edit_types(df)
	    fc, task = detect_training(df_clean, column)
	    X = df_clean.drop(column, axis=1)
	    y = df_clean[column]

	    repo_id = f"{username}/{dataset_name}-{column}-{task}"
	    task_metadata = "tabular-regression" if task == "regression" else "tabular-classification"

	    with tempfile.TemporaryDirectory() as tmpdirname:
	        # Anything the trainer prints while fitting ends up in logs.txt,
	        # which is uploaded together with the model.
	        with open(os.path.join(tmpdirname, "logs.txt"), "w", encoding="utf-8") as log_file:
	            with redirect_stdout(log_file):
	                print('Logging training')
	                fc.fit(X, y)

	        repo_url = create_repo(repo_id=repo_id, token=token)

	        readme_parts = [
	            f"---\nlicense: apache-2.0\nlibrary_name: sklearn\ntags:\n- {task_metadata}\n- baseline-trainer\n---\n\n",
	            f"## Baseline Model trained on {dataset_name} to apply {task} on {column}\n\n",
	            "Metrics of the best model:\n\n",
	        ]
	        # One paragraph per line of the best-result summary.
	        readme_parts.extend(f"{elem}\n\n" for elem in str(fc.current_best_).split("\n"))
	        readme_parts.append("\n\nSee model plot below:\n\n")
	        # Remove each newline together with the whitespace that follows it
	        # from the estimator HTML before embedding it in the card.
	        readme_parts.append(re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_))))
	        readme_parts.append("\n\nDisclaimer: This model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n")
	        readme_parts.append("Logs of training including the models tried in the process can be found in logs.txt")

	        # Explicit encoding: the card embeds generated HTML, so do not rely
	        # on the platform default.
	        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as readme_file:
	            readme_file.write("".join(readme_parts))

	        with open(os.path.join(tmpdirname, "clf.pkl"), mode="bw") as model_file:
	            pickle.dump(fc, file=model_file)

	        # Upload while the temporary directory still exists.
	        upload_folder(repo_id=repo_id, folder_path=tmpdirname, repo_type="model", token=token, path_in_repo="./")

	    return f"Your model will be ready at {repo_url}"



	# Gradio UI: a Blocks app with two tabs, "Baseline Trainer" and "Analyze".
	# Each tab collects a CSV file, a column name and a Hub token, and wires a
	# button to one of the functions defined above.
	with gr.Blocks() as demo:
	main_title = gr.Markdown("""# Baseline Trainer 🪄🌟✨""")
	main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card. For better results, use [AutoTrain](https://huggingface.co/autotrain).""")


	with gr.Tabs():
	# --- Tab 1: train a baseline model and push it to the Hub ---
	with gr.TabItem("Baseline Trainer") as baseline_trainer:
	with gr.Row():
	with gr.Column():
	title = gr.Markdown(""" ## Train a supervised baseline model 🪄""")
	description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
	dataset = gr.File(label = "CSV Dataset")
	column = gr.Text(label = "Enter target variable:")
	pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token. You can find your token [here](https://huggingface.co/settings/tokens)")
	# NOTE(review): the token is entered in a plain Textbox — consider whether it should be masked.
	token = gr.Textbox(label = "Your Hugging Face Token")
	inference_run = gr.Button("Train")
	# NOTE(review): gr.StatusTracker and gr.outputs.Textbox look like APIs of an
	# older Gradio release — confirm against the pinned Gradio version.
	inference_progress = gr.StatusTracker(cover_container=True)

	outcome = gr.outputs.Textbox(label = "Progress")
	# Input order must match train_baseline(dataset, token, column); the
	# returned message is shown in `outcome`.
	inference_run.click(
	train_baseline,
	inputs=[dataset, token, column],
	outputs=outcome,
	status_tracker=inference_progress,
	)
	# --- Tab 2: generate a dataset report and push it as a Space ---
	# The names title, description, dataset, column, token, pushing_desc,
	# inference_run, inference_progress and outcome are rebound below; the click
	# handler above was already registered with the first tab's components.
	with gr.TabItem("Analyze") as analyze:
	with gr.Row():
	with gr.Column():
	title = gr.Markdown(""" ## Analyze Dataset 🪄""")
	description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
	dataset = gr.File(label = "CSV Dataset")
	column = gr.Text(label = "Compare dataset against a target variable (Optional)")
	# No default value is passed to the Radio here.
	pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
	token = gr.Textbox(label = "Your Hugging Face Token")
	pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token. You can find your token [here](https://huggingface.co/settings/tokens)")
	inference_run = gr.Button("Infer")
	inference_progress = gr.StatusTracker(cover_container=True)
	outcome = gr.outputs.Textbox()
	# Input order must match analyze_datasets(dataset, token, column, pairwise).
	inference_run.click(
	analyze_datasets,
	inputs=[dataset, token, column, pairwise],
	outputs=outcome,
	status_tracker=inference_progress,
	)

	# Start the app with debug mode enabled.
	demo.launch(debug=True)