|
from transformers import Tool |
|
import pandas as pd |
|
import numpy as np |
|
import plotly.express as px |
|
import seaborn as sns |
|
from sklearn import preprocessing, decomposition, metrics |
|
|
|
|
|
class DataPreprocessingTool(Tool):
    """Tool that applies a single preprocessing operation to tabular data.

    The input dictionary is loaded into a ``pandas.DataFrame``, the requested
    operation is applied, and the result is returned via
    ``DataFrame.to_dict()``.
    """

    name = "data_preprocessor"
    description = "Handles data loading, cleaning, and preprocessing tasks"

    # NOTE(review): "dict" may not be among the type strings the Tool base
    # class accepts in every transformers release -- confirm against the
    # installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "operation": {
            "type": "string",
            "description": "Operation to perform: clean/encode/normalize/impute",
        },
    }
    output_type = "dict"

    def forward(self, data: dict, operation: str) -> dict:
        """Apply ``operation`` to ``data`` and return the processed table.

        Supported operations:

        * ``"clean"``: drop duplicate rows, then fill missing numeric values
          with the column mean.
        * ``"encode"``: label-encode every ``object`` column (values are cast
          to ``str`` first, so missing values become their own category).
        * ``"normalize"``: standardize numeric columns with
          ``StandardScaler``; skipped when there is nothing to scale.
        * ``"impute"``: fill missing numeric values with the column mean and
          any remaining missing values with the column's most frequent value.
          Rows are kept as-is (no de-duplication).

        Any other value leaves the data unchanged.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            operation: Name of the operation to perform.

        Returns:
            The processed frame as produced by ``DataFrame.to_dict()``.
        """
        df = pd.DataFrame(data)

        if operation == "clean":
            df = df.drop_duplicates()
            df = df.fillna(df.mean(numeric_only=True))
        elif operation == "encode":
            for col in df.select_dtypes(include=["object"]).columns:
                # A fresh encoder per column keeps each mapping independent.
                encoder = preprocessing.LabelEncoder()
                df[col] = encoder.fit_transform(df[col].astype(str))
        elif operation == "normalize":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            # StandardScaler rejects input with zero columns or zero rows, so
            # only scale when there is actually something to scale.
            if len(numeric_cols) > 0 and len(df) > 0:
                scaler = preprocessing.StandardScaler()
                df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        elif operation == "impute":
            # Numeric gaps use the same mean-fill strategy as "clean".
            df = df.fillna(df.mean(numeric_only=True))
            # Whatever is still missing (non-numeric columns) falls back to
            # the most frequent value; all-missing columns have no mode and
            # are left untouched.
            still_missing = df.columns[df.isna().any().to_numpy()]
            for col in still_missing:
                modes = df[col].mode(dropna=True)
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])

        return df.to_dict()
|
|
|
|
|
class StatisticalAnalysisTool(Tool):
    """Tool that computes summary statistics, normality tests or correlations.

    The input dictionary is loaded into a ``pandas.DataFrame`` and analysed
    according to ``analysis_type``.
    """

    name = "statistical_analyzer"
    description = "Performs statistical analysis on data"

    # NOTE(review): "dict" may not be among the type strings the Tool base
    # class accepts in every transformers release -- confirm against the
    # installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "analysis_type": {
            "type": "string",
            "description": "Type of analysis: descriptive/inferential/correlation",
        },
    }
    output_type = "dict"

    def forward(self, data: dict, analysis_type: str) -> dict:
        """Run the requested analysis on ``data``.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            analysis_type: ``"descriptive"`` returns ``describe()`` output plus
                skewness and kurtosis; ``"inferential"`` runs
                ``scipy.stats.normaltest`` on every numeric column; any other
                value (including ``"correlation"``) returns the correlation
                matrix.

        Returns:
            A dictionary whose layout depends on ``analysis_type``:
            ``{"summary", "skewness", "kurtosis"}`` for descriptive,
            ``{column: {"statistic", "p_value"}}`` for inferential, and the
            ``DataFrame.corr().to_dict()`` mapping otherwise.
        """
        df = pd.DataFrame(data)

        if analysis_type == "descriptive":
            # numeric_only=True: since pandas 2.0 these reductions raise on
            # non-numeric columns instead of silently dropping them.
            return {
                "summary": df.describe().to_dict(),
                "skewness": df.skew(numeric_only=True).to_dict(),
                "kurtosis": df.kurtosis(numeric_only=True).to_dict(),
            }

        if analysis_type == "inferential":
            # Imported once per call rather than once per column.
            from scipy import stats

            # normaltest builds on skewtest, which is not defined for fewer
            # than 8 observations; report NaN instead of failing the whole
            # analysis because of one short column.
            min_samples = 8

            results = {}
            for col in df.select_dtypes(include=[np.number]).columns:
                values = df[col].dropna()
                if len(values) < min_samples:
                    results[col] = {
                        "statistic": float("nan"),
                        "p_value": float("nan"),
                    }
                    continue
                stat, p_value = stats.normaltest(values)
                results[col] = {
                    "statistic": float(stat),
                    "p_value": float(p_value),
                }
            return results

        # Fallback, also used for "correlation". numeric_only=True for the
        # same pandas >= 2.0 reason as above.
        return df.corr(numeric_only=True).to_dict()
|
|
|
|
|
class AdvancedVisualizationTool(Tool):
    """Tool that builds Plotly figures from PCA projections or KMeans clusters.

    Figures are returned as plain dictionaries via ``Figure.to_dict()``.
    """

    name = "advanced_visualizer"
    description = "Creates advanced statistical and ML visualizations"

    # NOTE(review): "dict" may not be among the type strings the Tool base
    # class accepts in every transformers release -- confirm against the
    # installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "viz_type": {"type": "string", "description": "Type of visualization"},
        "params": {"type": "dict", "description": "Additional parameters"},
    }
    output_type = "dict"

    def forward(self, data: dict, viz_type: str, params: dict) -> dict:
        """Create the visualization selected by ``viz_type``.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            viz_type: ``"pca"`` for a 2-component PCA scatter of the numeric
                columns, ``"cluster"`` for a KMeans-coloured scatter.
            params: Extra options; ``None`` is treated as empty. The
                ``"cluster"`` branch reads ``"n_clusters"`` (default 3),
                ``"x"`` and ``"y"`` (columns to plot) and the optional
                ``"random_state"`` (default ``None``) to seed KMeans.

        Returns:
            ``{"plot": <figure dict>}`` for a supported ``viz_type``, otherwise
            an empty dictionary.
        """
        # Tolerate a missing parameter mapping instead of failing on .get().
        params = params or {}
        df = pd.DataFrame(data)

        if viz_type == "pca":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            pca = decomposition.PCA(n_components=2)
            pca_result = pca.fit_transform(df[numeric_cols])
            fig = px.scatter(
                x=pca_result[:, 0],
                y=pca_result[:, 1],
                title='PCA Visualization',
            )
            return {"plot": fig.to_dict()}

        if viz_type == "cluster":
            from sklearn.cluster import KMeans

            numeric_cols = df.select_dtypes(include=[np.number]).columns
            kmeans = KMeans(
                n_clusters=params.get("n_clusters", 3),
                # None keeps the unseeded behaviour; callers may pass a seed
                # to get reproducible cluster assignments.
                random_state=params.get("random_state"),
            )
            clusters = kmeans.fit_predict(df[numeric_cols])
            fig = px.scatter(
                df,
                x=params.get("x"),
                y=params.get("y"),
                color=clusters,
                title='Cluster Visualization',
            )
            return {"plot": fig.to_dict()}

        return {}
|
|
|
|
|
class MLModelTool(Tool):
    """Tool that fits a simple scikit-learn model and reports hold-out metrics.

    ``"regression"`` fits a ``LinearRegression``; ``"classification"`` fits a
    ``RandomForestClassifier``. Both are evaluated on a 20% hold-out split.
    """

    name = "ml_modeler"
    description = "Trains and evaluates machine learning models"

    # NOTE(review): "dict" may not be among the type strings the Tool base
    # class accepts in every transformers release -- confirm against the
    # installed version.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "target": {"type": "string", "description": "Target column name"},
        "model_type": {"type": "string", "description": "Type of model to train"},
    }
    output_type = "dict"

    def forward(self, data: dict, target: str, model_type: str) -> dict:
        """Train the requested model on ``data`` and return its metrics.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            target: Name of the column to predict; every other column is used
                as a feature.
            model_type: ``"regression"`` or ``"classification"``.

        Returns:
            For regression: ``{"mse", "r2", "coefficients"}``. For
            classification: ``{"accuracy", "feature_importance"}``. An empty
            dictionary for any other ``model_type``.
        """
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error, accuracy_score

        frame = pd.DataFrame(data)
        features = frame.drop(columns=[target])
        labels = frame[target]

        # The split is performed before the model type is inspected, so an
        # invalid table fails here regardless of the requested model.
        split = train_test_split(features, labels, test_size=0.2)
        feat_train, feat_test, lab_train, lab_test = split

        if model_type not in ("regression", "classification"):
            return {}

        report = {}

        if model_type == "regression":
            from sklearn.linear_model import LinearRegression

            estimator = LinearRegression()
            estimator.fit(feat_train, lab_train)
            predicted = estimator.predict(feat_test)

            report["mse"] = mean_squared_error(lab_test, predicted)
            report["r2"] = estimator.score(feat_test, lab_test)
            report["coefficients"] = dict(zip(features.columns, estimator.coef_))
            return report

        from sklearn.ensemble import RandomForestClassifier

        estimator = RandomForestClassifier()
        estimator.fit(feat_train, lab_train)
        predicted = estimator.predict(feat_test)

        report["accuracy"] = accuracy_score(lab_test, predicted)
        report["feature_importance"] = dict(
            zip(features.columns, estimator.feature_importances_)
        )
        return report