from transformers import Tool
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn import preprocessing, decomposition, metrics


# 1. Data Loading and Preprocessing Tool
class DataPreprocessingTool(Tool):
    """Agent tool that cleans, encodes, normalizes, or imputes tabular data.

    The input dictionary is turned into a frame with ``pd.DataFrame(data)``
    and the processed frame is returned via ``DataFrame.to_dict()``.
    """

    name = "data_preprocessor"
    description = "Handles data loading, cleaning, and preprocessing tasks"
    # NOTE(review): some transformers releases validate the "type" strings
    # below (and `output_type`) against a fixed list that may not include
    # "dict" -- confirm against the installed version. The same applies to
    # the other tools in this file.
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "operation": {
            "type": "string",
            "description": "Operation to perform: clean/encode/normalize/impute",
        },
    }
    output_type = "dict"

    def forward(self, data: dict, operation: str) -> dict:
        """Apply one preprocessing operation and return the resulting data.

        Args:
            data: Mapping accepted by ``pd.DataFrame``.
            operation: One of ``"clean"``, ``"encode"``, ``"normalize"`` or
                ``"impute"``. Any other value leaves the data unchanged.

        Returns:
            The processed frame as produced by ``DataFrame.to_dict()``.
        """
        df = pd.DataFrame(data)

        if operation == "clean":
            # Drop duplicate rows, then fill numeric gaps with the column mean.
            df = df.drop_duplicates()
            df = df.fillna(df.mean(numeric_only=True))
        elif operation == "encode":
            # Replace every object-dtype column with integer labels. Values
            # are stringified first, so missing values get their own label.
            le = preprocessing.LabelEncoder()
            for col in df.select_dtypes(include=['object']).columns:
                df[col] = le.fit_transform(df[col].astype(str))
        elif operation == "normalize":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            # StandardScaler rejects input with zero columns or zero rows;
            # in that case there is nothing to scale.
            if not numeric_cols.empty and not df.empty:
                scaler = preprocessing.StandardScaler()
                df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        elif operation == "impute":
            # Numeric gaps get the column mean (same rule as "clean");
            # any column that still has gaps gets its most frequent value.
            df = df.fillna(df.mean(numeric_only=True))
            for col in df.columns[df.isna().any()]:
                modes = df[col].mode(dropna=True)
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])

        return df.to_dict()


# 2. Statistical Analysis Tool
class StatisticalAnalysisTool(Tool):
    """Agent tool that computes summary statistics, normality tests, or
    correlations for tabular data."""

    name = "statistical_analyzer"
    description = "Performs statistical analysis on data"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "analysis_type": {
            "type": "string",
            "description": "Type of analysis: descriptive/inferential/correlation",
        },
    }
    output_type = "dict"

    def forward(self, data: dict, analysis_type: str) -> dict:
        """Run the requested analysis.

        Args:
            data: Mapping accepted by ``pd.DataFrame``.
            analysis_type: ``"descriptive"`` or ``"inferential"``. Every
                other value (including ``"correlation"``) yields the
                correlation matrix.

        Returns:
            * ``"descriptive"``: dict with ``"summary"``, ``"skewness"`` and
              ``"kurtosis"`` entries.
            * ``"inferential"``: per numeric column, the ``"statistic"`` and
              ``"p_value"`` of ``scipy.stats.normaltest``.
            * otherwise: the correlation matrix as a nested dict.
        """
        df = pd.DataFrame(data)

        if analysis_type == "descriptive":
            # numeric_only=True keeps text columns from raising TypeError
            # on pandas >= 2.0.
            return {
                "summary": df.describe().to_dict(),
                "skewness": df.skew(numeric_only=True).to_dict(),
                "kurtosis": df.kurtosis(numeric_only=True).to_dict(),
            }

        if analysis_type == "inferential":
            # Imported here, once, so scipy is only needed for this branch.
            from scipy import stats

            results = {}
            for col in df.select_dtypes(include=[np.number]).columns:
                stat, p_value = stats.normaltest(df[col].dropna())
                # Plain floats print and serialize more cleanly than
                # NumPy scalars.
                results[col] = {
                    "statistic": float(stat),
                    "p_value": float(p_value),
                }
            return results

        return df.corr(numeric_only=True).to_dict()


# 3. Advanced Visualization Tool
class AdvancedVisualizationTool(Tool):
    """Agent tool that builds PCA or k-means scatter plots with Plotly."""

    name = "advanced_visualizer"
    description = "Creates advanced statistical and ML visualizations"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "viz_type": {"type": "string", "description": "Type of visualization"},
        "params": {"type": "dict", "description": "Additional parameters"},
    }
    output_type = "dict"

    def forward(self, data: dict, viz_type: str, params: dict) -> dict:
        """Build the requested figure.

        Args:
            data: Mapping accepted by ``pd.DataFrame``.
            viz_type: ``"pca"`` or ``"cluster"``; anything else returns ``{}``.
            params: Optional settings, used only by ``"cluster"``:
                ``"n_clusters"`` (default 3), ``"x"`` and ``"y"`` (columns to
                plot) and ``"random_state"`` (seed for KMeans, default
                ``None``). ``None`` is treated as an empty dict.

        Returns:
            ``{"plot": <figure as dict>}`` for a known ``viz_type``,
            otherwise an empty dict.
        """
        df = pd.DataFrame(data)
        params = params or {}
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if viz_type == "pca":
            # Project the numeric columns onto the first two components.
            pca = decomposition.PCA(n_components=2)
            pca_result = pca.fit_transform(df[numeric_cols])
            fig = px.scatter(
                x=pca_result[:, 0],
                y=pca_result[:, 1],
                title='PCA Visualization',
            )
            return {"plot": fig.to_dict()}

        if viz_type == "cluster":
            from sklearn.cluster import KMeans

            kmeans = KMeans(
                n_clusters=params.get("n_clusters", 3),
                random_state=params.get("random_state"),
            )
            # Clusters are fitted on all numeric columns, not just x and y.
            clusters = kmeans.fit_predict(df[numeric_cols])
            fig = px.scatter(
                df,
                x=params.get("x"),
                y=params.get("y"),
                color=clusters,
                title='Cluster Visualization',
            )
            return {"plot": fig.to_dict()}

        return {}


# 4. Machine Learning Tool
class MLModelTool(Tool):
    """Agent tool that fits a simple model and reports hold-out metrics."""

    name = "ml_modeler"
    description = "Trains and evaluates machine learning models"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "target": {"type": "string", "description": "Target column name"},
        "model_type": {"type": "string", "description": "Type of model to train"},
    }
    output_type = "dict"

    def forward(self, data: dict, target: str, model_type: str) -> dict:
        """Train a model on an 80/20 split and return evaluation metrics.

        Args:
            data: Mapping accepted by ``pd.DataFrame``.
            target: Name of the column to predict; all other columns are
                used as features.
            model_type: ``"regression"`` (``LinearRegression``) or
                ``"classification"`` (``RandomForestClassifier``); anything
                else returns ``{}``.

        Returns:
            * regression: ``"mse"``, ``"r2"`` and per-feature
              ``"coefficients"``.
            * classification: ``"accuracy"`` and per-feature
              ``"feature_importance"``.

        Note:
            The split is not seeded, so metrics vary between calls.
        """
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error, accuracy_score

        df = pd.DataFrame(data)
        X = df.drop(columns=[target])
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        if model_type == "regression":
            from sklearn.linear_model import LinearRegression

            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Plain floats print and serialize more cleanly than NumPy scalars.
            return {
                "mse": float(mean_squared_error(y_test, y_pred)),
                "r2": float(model.score(X_test, y_test)),
                "coefficients": {
                    col: float(coef) for col, coef in zip(X.columns, model.coef_)
                },
            }

        if model_type == "classification":
            from sklearn.ensemble import RandomForestClassifier

            model = RandomForestClassifier()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            return {
                "accuracy": float(accuracy_score(y_test, y_pred)),
                "feature_importance": {
                    col: float(weight)
                    for col, weight in zip(X.columns, model.feature_importances_)
                },
            }

        return {}