# Spaces status (text captured from the hosting page): Build error
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import preprocessing, decomposition, metrics
from transformers import Tool
# 1. Data Loading and Preprocessing Tool
class DataPreprocessingTool(Tool):
    """Agent tool that applies one preprocessing step to tabular data.

    The input dictionary is turned into a ``pandas.DataFrame``, the requested
    operation is applied, and the frame is returned via ``DataFrame.to_dict()``.

    Supported operations (matched case-insensitively):

    * ``"clean"``     - drop duplicate rows, then fill missing numeric values
      with the column mean.
    * ``"encode"``    - label-encode every object-dtype column; values are
      cast to ``str`` first, so missing values become their own label.
    * ``"normalize"`` - standardize numeric columns with ``StandardScaler``.
    * ``"impute"``    - fill missing numeric values with the column median and
      missing values in the remaining columns with the most frequent value.
    """

    name = "data_preprocessor"
    description = "Handles data loading, cleaning, and preprocessing tasks"
    # NOTE(review): the transformers agents ``Tool`` validator appears to accept
    # only a fixed list of type names that does not include "dict"; "any" is
    # used for dictionary payloads instead. Confirm against the installed
    # transformers version.
    inputs = {
        "data": {"type": "any", "description": "Input data dictionary"},
        "operation": {"type": "string", "description": "Operation to perform: clean/encode/normalize/impute"}
    }
    output_type = "any"

    def forward(self, data: dict, operation: str) -> dict:
        """Apply ``operation`` to ``data`` and return the processed table.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            operation: One of ``clean``, ``encode``, ``normalize``, ``impute``.

        Returns:
            The processed frame as produced by ``DataFrame.to_dict()``.

        Raises:
            ValueError: If ``operation`` is not one of the supported names.
        """
        df = pd.DataFrame(data)
        op = operation.strip().lower() if isinstance(operation, str) else operation

        if op == "clean":
            # Handle duplicates, then missing numeric values.
            df = df.drop_duplicates()
            df = df.fillna(df.mean(numeric_only=True))
        elif op == "encode":
            # Encode categorical variables; the encoder is refit per column.
            for col in df.select_dtypes(include=["object"]).columns:
                encoder = preprocessing.LabelEncoder()
                df[col] = encoder.fit_transform(df[col].astype(str))
        elif op == "normalize":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            # Skip scaling when there is nothing to scale instead of handing
            # an empty selection to the scaler.
            if len(numeric_cols) > 0 and len(df) > 0:
                scaler = preprocessing.StandardScaler()
                df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        elif op == "impute":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
            for col in df.columns.difference(numeric_cols):
                modes = df[col].mode(dropna=True)
                # A column that is entirely missing has no mode; leave it as is.
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])
        else:
            raise ValueError(
                f"Unsupported operation {operation!r}; "
                "expected one of: clean, encode, normalize, impute"
            )

        return df.to_dict()
# 2. Statistical Analysis Tool
class StatisticalAnalysisTool(Tool):
    """Agent tool that computes summary statistics for tabular data.

    ``analysis_type`` selects the computation:

    * ``"descriptive"`` - ``describe()`` summary plus skewness and kurtosis of
      the numeric columns.
    * ``"inferential"`` - ``scipy.stats.normaltest`` on each numeric column.
    * anything else     - pairwise correlation of the numeric columns.
    """

    name = "statistical_analyzer"
    description = "Performs statistical analysis on data"
    # NOTE(review): the transformers agents ``Tool`` validator appears to accept
    # only a fixed list of type names that does not include "dict"; "any" is
    # used for dictionary payloads instead. Confirm against the installed
    # transformers version.
    inputs = {
        "data": {"type": "any", "description": "Input data dictionary"},
        "analysis_type": {"type": "string", "description": "Type of analysis: descriptive/inferential/correlation"}
    }
    output_type = "any"

    def forward(self, data: dict, analysis_type: str) -> dict:
        """Run the requested analysis on ``data``.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            analysis_type: ``descriptive``, ``inferential`` or ``correlation``.
                Any value other than the first two falls through to the
                correlation matrix.

        Returns:
            ``descriptive``: dict with keys ``summary``, ``skewness`` and
            ``kurtosis``.
            ``inferential``: ``{column: {"statistic": ..., "p_value": ...}}``;
            both values are ``None`` for columns with too few observations.
            otherwise: the correlation matrix as a nested dict.
        """
        df = pd.DataFrame(data)

        if analysis_type == "descriptive":
            # numeric_only=True keeps text columns from raising in the
            # reductions on recent pandas releases.
            return {
                "summary": df.describe().to_dict(),
                "skewness": df.skew(numeric_only=True).to_dict(),
                "kurtosis": df.kurtosis(numeric_only=True).to_dict(),
            }

        if analysis_type == "inferential":
            from scipy import stats

            # normaltest builds on skewtest, which is documented to need at
            # least 8 observations; smaller samples are reported as None
            # rather than aborting the whole analysis.
            min_samples = 8
            results = {}
            for col in df.select_dtypes(include=[np.number]).columns:
                sample = df[col].dropna()
                if len(sample) < min_samples:
                    results[col] = {"statistic": None, "p_value": None}
                    continue
                stat, p_value = stats.normaltest(sample)
                # Plain floats keep the result serializable.
                results[col] = {"statistic": float(stat), "p_value": float(p_value)}
            return results

        return df.corr(numeric_only=True).to_dict()
# 3. Advanced Visualization Tool
class AdvancedVisualizationTool(Tool):
    """Agent tool that builds plotly figures from tabular data.

    ``viz_type`` selects the figure:

    * ``"pca"``     - scatter of the first two principal components of the
      numeric columns.
    * ``"cluster"`` - scatter of two columns coloured by KMeans cluster label.
    * anything else - an empty dict is returned.

    Recognised ``params`` keys (all optional, used by ``"cluster"``):
    ``n_clusters`` (default 3), ``x`` and ``y`` (default: the first two numeric
    columns), ``random_state`` (default ``None``, i.e. unseeded).
    """

    name = "advanced_visualizer"
    description = "Creates advanced statistical and ML visualizations"
    # NOTE(review): the transformers agents ``Tool`` validator appears to accept
    # only a fixed list of type names that does not include "dict"; "any" is
    # used for dictionary payloads instead. Confirm against the installed
    # transformers version.
    inputs = {
        "data": {"type": "any", "description": "Input data dictionary"},
        "viz_type": {"type": "string", "description": "Type of visualization"},
        "params": {"type": "any", "description": "Additional parameters"}
    }
    output_type = "any"

    def forward(self, data: dict, viz_type: str, params: dict) -> dict:
        """Create the requested figure.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            viz_type: ``pca`` or ``cluster``.
            params: Extra options (see the class docstring); ``None`` is
                treated as an empty dict.

        Returns:
            ``{"plot": fig.to_dict()}`` for a supported ``viz_type``,
            otherwise ``{}``.

        Raises:
            ValueError: If the data has too few numeric columns or complete
                rows for the requested figure.
        """
        df = pd.DataFrame(data)
        params = params or {}
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if viz_type == "pca":
            # Rows with missing numeric values are dropped before fitting.
            features = df[numeric_cols].dropna()
            if len(numeric_cols) < 2 or len(features) < 2:
                raise ValueError(
                    "PCA visualization needs at least two numeric columns "
                    "and two rows without missing values"
                )
            pca = decomposition.PCA(n_components=2)
            pca_result = pca.fit_transform(features)
            fig = px.scatter(x=pca_result[:, 0], y=pca_result[:, 1],
                             title='PCA Visualization')
            return {"plot": fig.to_dict()}

        if viz_type == "cluster":
            from sklearn.cluster import KMeans

            n_clusters = int(params.get("n_clusters", 3))
            # Cluster and plot the same rows so labels line up with points.
            complete = df.dropna(subset=list(numeric_cols))
            if len(numeric_cols) == 0 or len(complete) < n_clusters:
                raise ValueError(
                    "Cluster visualization needs at least one numeric column "
                    "and at least n_clusters rows without missing values"
                )
            kmeans = KMeans(n_clusters=n_clusters,
                            random_state=params.get("random_state"))
            clusters = kmeans.fit_predict(complete[numeric_cols])
            # Fall back to the first numeric columns when no axes are given.
            x_col = params.get("x") or numeric_cols[0]
            y_col = params.get("y") or numeric_cols[min(1, len(numeric_cols) - 1)]
            fig = px.scatter(complete, x=x_col, y=y_col,
                             color=clusters, title='Cluster Visualization')
            return {"plot": fig.to_dict()}

        return {}
# 4. Machine Learning Tool
class MLModelTool(Tool):
    """Agent tool that fits a baseline model and reports hold-out metrics.

    ``model_type`` selects the estimator:

    * ``"regression"``     - ``LinearRegression``; reports MSE, R^2 and the
      fitted coefficients.
    * ``"classification"`` - ``RandomForestClassifier``; reports accuracy and
      feature importances.
    * anything else        - an empty dict is returned.

    Before fitting, rows with a missing target or missing feature values are
    dropped and non-numeric feature columns are one-hot encoded, so the keys
    of ``coefficients`` / ``feature_importance`` are the encoded column names.
    """

    name = "ml_modeler"
    description = "Trains and evaluates machine learning models"
    # NOTE(review): the transformers agents ``Tool`` validator appears to accept
    # only a fixed list of type names that does not include "dict"; "any" is
    # used for dictionary payloads instead. Confirm against the installed
    # transformers version.
    inputs = {
        "data": {"type": "any", "description": "Input data dictionary"},
        "target": {"type": "string", "description": "Target column name"},
        "model_type": {"type": "string", "description": "Type of model to train"}
    }
    output_type = "any"

    def forward(self, data: dict, target: str, model_type: str) -> dict:
        """Train the requested model on an 80/20 split and return its metrics.

        Args:
            data: Mapping accepted by ``pandas.DataFrame(data)``.
            target: Name of the column to predict.
            model_type: ``regression`` or ``classification``.

        Returns:
            ``regression``: dict with ``mse``, ``r2`` and ``coefficients``.
            ``classification``: dict with ``accuracy`` and
            ``feature_importance``.
            otherwise: ``{}``.

        Raises:
            KeyError: If ``target`` is not a column of ``data``.
            ValueError: If no feature columns or fewer than two usable rows
                remain after preparation.
        """
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error, accuracy_score

        df = pd.DataFrame(data)
        if target not in df.columns:
            raise KeyError(f"Target column {target!r} not found in data")
        if model_type not in ("regression", "classification"):
            return {}

        # Prepare a fully numeric, gap-free feature matrix for the estimators.
        df = df.dropna(subset=[target])
        X = pd.get_dummies(df.drop(columns=[target]))
        keep = X.notna().all(axis=1)
        X = X[keep].astype(float)
        y = df.loc[keep, target]
        if X.shape[1] == 0:
            raise ValueError("No feature columns left besides the target")
        if len(X) < 2:
            raise ValueError("At least two rows without missing values are required")

        # Fixed seed so repeated calls on the same data report the same metrics.
        seed = 42
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed
        )

        if model_type == "regression":
            from sklearn.linear_model import LinearRegression

            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Plain floats keep the result serializable.
            return {
                "mse": float(mean_squared_error(y_test, y_pred)),
                "r2": float(model.score(X_test, y_test)),
                "coefficients": {
                    col: float(coef)
                    for col, coef in zip(X.columns, np.ravel(model.coef_))
                },
            }

        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier(random_state=seed)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {
            "accuracy": float(accuracy_score(y_test, y_pred)),
            "feature_importance": {
                col: float(weight)
                for col, weight in zip(X.columns, model.feature_importances_)
            },
        }