jzou19950715's picture
Update app.py
5293476 verified
raw
history blame
5.8 kB
from transformers import Tool
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn import preprocessing, decomposition, metrics
# 1. Data Loading and Preprocessing Tool
class DataPreprocessingTool(Tool):
    """Agent tool that applies a single preprocessing step to tabular data.

    The input dictionary is turned into a ``pandas.DataFrame``, one operation
    is applied, and the result is returned through ``DataFrame.to_dict()``.
    """

    name = "data_preprocessor"
    description = "Handles data loading, cleaning, and preprocessing tasks"
    # NOTE(review): some transformers releases validate "type" against a fixed
    # list of names -- confirm that "dict" is accepted by the installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "operation": {"type": "string", "description": "Operation to perform: clean/encode/normalize/impute"}
    }
    output_type = "dict"

    def forward(self, data: dict, operation: str) -> dict:
        """Run one preprocessing operation and return the transformed data.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            operation: One of:
                ``"clean"``     - drop duplicate rows, then fill missing
                                  numeric values with the column mean.
                ``"impute"``    - fill missing numeric values with the column
                                  mean and any remaining gaps with the
                                  column's most frequent value.
                ``"encode"``    - label-encode every object-dtype column.
                ``"normalize"`` - standardize every numeric column with
                                  ``StandardScaler``.

        Returns:
            The processed frame as produced by ``DataFrame.to_dict()``.

        Raises:
            ValueError: If ``operation`` is not one of the names above.
        """
        df = pd.DataFrame(data)

        if operation == "clean":
            df = df.drop_duplicates()
            df = df.fillna(df.mean(numeric_only=True))
        elif operation == "impute":
            # Numeric gaps get the column mean first ...
            df = df.fillna(df.mean(numeric_only=True))
            # ... whatever is still missing (non-numeric columns) gets the mode.
            for col in df.columns[df.isna().any()]:
                modes = df[col].mode(dropna=True)
                # An entirely empty column has no mode; leave it untouched.
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])
        elif operation == "encode":
            for col in df.select_dtypes(include=['object']).columns:
                # A fresh encoder per column keeps the fitted state independent.
                encoder = preprocessing.LabelEncoder()
                df[col] = encoder.fit_transform(df[col].astype(str))
        elif operation == "normalize":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            # The scaler cannot be fitted on a frame with no columns or no
            # rows, so there is nothing to normalize in that case.
            if len(numeric_cols) > 0 and len(df) > 0:
                scaler = preprocessing.StandardScaler()
                df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        else:
            raise ValueError(
                f"Unknown operation {operation!r}; "
                "expected one of: clean, encode, normalize, impute"
            )

        return df.to_dict()
# 2. Statistical Analysis Tool
class StatisticalAnalysisTool(Tool):
    """Agent tool that computes summary statistics, normality tests, or
    pairwise correlations for tabular data."""

    name = "statistical_analyzer"
    description = "Performs statistical analysis on data"
    # NOTE(review): some transformers releases validate "type" against a fixed
    # list of names -- confirm that "dict" is accepted by the installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "analysis_type": {"type": "string", "description": "Type of analysis: descriptive/inferential/correlation"}
    }
    output_type = "dict"

    # scipy documents that the skewness test underlying ``normaltest`` needs
    # at least 8 observations.
    _MIN_NORMALTEST_SAMPLES = 8

    def forward(self, data: dict, analysis_type: str) -> dict:
        """Analyze ``data`` according to ``analysis_type``.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            analysis_type:
                ``"descriptive"`` - ``describe()`` output plus skewness and
                                    kurtosis of the numeric columns.
                ``"inferential"`` - ``scipy.stats.normaltest`` per numeric
                                    column (missing values dropped first).
                anything else     - correlation matrix of the numeric columns.

        Returns:
            A nested dictionary whose layout depends on ``analysis_type``.
            For ``"inferential"``, columns with fewer than 8 non-missing
            values report NaN for both ``statistic`` and ``p_value``.
        """
        df = pd.DataFrame(data)

        if analysis_type == "descriptive":
            # numeric_only=True keeps mixed-type frames from raising on
            # text columns.
            return {
                "summary": df.describe().to_dict(),
                "skewness": df.skew(numeric_only=True).to_dict(),
                "kurtosis": df.kurtosis(numeric_only=True).to_dict()
            }

        if analysis_type == "inferential":
            from scipy import stats  # imported once, not once per column

            results = {}
            for col in df.select_dtypes(include=[np.number]).columns:
                sample = df[col].dropna()
                if len(sample) < self._MIN_NORMALTEST_SAMPLES:
                    # Too few observations for the test; report NaN rather
                    # than aborting the whole analysis.
                    results[col] = {"statistic": float("nan"), "p_value": float("nan")}
                    continue
                stat, p_value = stats.normaltest(sample)
                # Plain floats keep the result free of NumPy scalar types.
                results[col] = {"statistic": float(stat), "p_value": float(p_value)}
            return results

        return df.corr(numeric_only=True).to_dict()
# 3. Advanced Visualization Tool
class AdvancedVisualizationTool(Tool):
    """Agent tool that builds Plotly figures from PCA projections or KMeans
    cluster assignments and returns them as dictionaries."""

    name = "advanced_visualizer"
    description = "Creates advanced statistical and ML visualizations"
    # NOTE(review): some transformers releases validate "type" against a fixed
    # list of names -- confirm that "dict" is accepted by the installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "viz_type": {"type": "string", "description": "Type of visualization"},
        "params": {"type": "dict", "description": "Additional parameters"}
    }
    output_type = "dict"

    def forward(self, data: dict, viz_type: str, params: dict) -> dict:
        """Create the requested visualization.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            viz_type: ``"pca"`` for a 2-component PCA scatter plot, or
                ``"cluster"`` for a KMeans-coloured scatter plot.
            params: Optional settings; ``None`` is treated as empty.
                For ``"cluster"``: ``n_clusters`` (default 3) and the column
                names ``x`` / ``y`` (default: the first two numeric columns,
                or the only numeric column for both axes).

        Returns:
            ``{"plot": <figure dict>}`` for a known ``viz_type``,
            otherwise an empty dictionary.

        Raises:
            ValueError: If there are too few numeric columns or complete rows
                for the requested visualization.
        """
        params = params or {}
        df = pd.DataFrame(data)
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if viz_type == "pca":
            # Rows with missing numeric values are excluded before fitting.
            features = df[numeric_cols].dropna()
            if features.shape[0] < 2 or features.shape[1] < 2:
                raise ValueError(
                    "PCA visualization needs at least 2 numeric columns "
                    "and 2 rows without missing values"
                )
            pca = decomposition.PCA(n_components=2)
            pca_result = pca.fit_transform(features)
            fig = px.scatter(x=pca_result[:, 0], y=pca_result[:, 1],
                             title='PCA Visualization')
            return {"plot": fig.to_dict()}

        if viz_type == "cluster":
            from sklearn.cluster import KMeans

            n_clusters = params.get("n_clusters", 3)
            if numeric_cols.empty:
                raise ValueError("Cluster visualization needs at least 1 numeric column")

            # Keep only rows with complete numeric data, and reset the index
            # so the positional cluster labels line up with the plotted rows.
            complete = df.loc[df[numeric_cols].notna().all(axis=1)].reset_index(drop=True)
            if len(complete) < n_clusters:
                raise ValueError(
                    f"Cluster visualization needs at least {n_clusters} rows "
                    "without missing numeric values"
                )

            kmeans = KMeans(n_clusters=n_clusters)
            clusters = kmeans.fit_predict(complete[numeric_cols])

            # Fall back to the first numeric columns when no axes are given.
            x_col = params.get("x") or numeric_cols[0]
            y_col = params.get("y") or numeric_cols[min(1, len(numeric_cols) - 1)]
            fig = px.scatter(complete, x=x_col, y=y_col,
                             color=clusters, title='Cluster Visualization')
            return {"plot": fig.to_dict()}

        return {}
# 4. Machine Learning Tool
class MLModelTool(Tool):
    """Agent tool that fits a simple model on tabular data and reports
    hold-out metrics."""

    name = "ml_modeler"
    description = "Trains and evaluates machine learning models"
    # NOTE(review): some transformers releases validate "type" against a fixed
    # list of names -- confirm that "dict" is accepted by the installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "target": {"type": "string", "description": "Target column name"},
        "model_type": {"type": "string", "description": "Type of model to train"}
    }
    output_type = "dict"

    def forward(self, data: dict, target: str, model_type: str) -> dict:
        """Train a model on an 80/20 split and return evaluation results.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            target: Name of the column to predict.
            model_type: ``"regression"`` (``LinearRegression``) or
                ``"classification"`` (``RandomForestClassifier``).

        Returns:
            For regression: ``mse``, ``r2`` and ``coefficients`` per feature.
            For classification: ``accuracy`` and ``feature_importance`` per
            feature. An empty dictionary for any other ``model_type``.
            Feature names are those of the one-hot encoded feature frame.

        Raises:
            KeyError: If ``target`` is not a column of ``data``.
        """
        df = pd.DataFrame(data)
        if target not in df.columns:
            raise KeyError(f"Target column {target!r} not found in data")

        # Nothing to train for an unknown model type, so skip the split.
        if model_type not in ("regression", "classification"):
            return {}

        from sklearn.model_selection import train_test_split

        # Rows without a target value cannot be used for fitting or scoring.
        df = df.dropna(subset=[target])
        # One-hot encode text/categorical features so both estimators receive
        # numeric input; frames that are already numeric pass through as-is.
        X = pd.get_dummies(df.drop(columns=[target]), dtype=float)
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        if model_type == "regression":
            from sklearn.linear_model import LinearRegression

            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            return {
                "mse": float(metrics.mean_squared_error(y_test, y_pred)),
                "r2": float(model.score(X_test, y_test)),
                "coefficients": {
                    col: float(coef) for col, coef in zip(X.columns, model.coef_)
                }
            }

        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {
            "accuracy": float(metrics.accuracy_score(y_test, y_pred)),
            "feature_importance": {
                col: float(weight)
                for col, weight in zip(X.columns, model.feature_importances_)
            }
        }