File size: 5,795 Bytes
5293476 bcd9ccf e279441 ad9e004 5293476 e279441 5293476 dbaa1d5 ad9e004 5293476 cedb0a7 5293476 e279441 5293476 ad9e004 5293476 ad9e004 5293476 ad9e004 5293476 ad9e004 5293476 ad9e004 5293476 ad9e004 5293476 ad9e004 5293476 e279441 5293476 ad9e004 5293476 e279441 5293476 e279441 5293476 e279441 5293476 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
from transformers import Tool
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn import preprocessing, decomposition, metrics
# 1. Data Loading and Preprocessing Tool
class DataPreprocessingTool(Tool):
    """Apply a single preprocessing step to tabular data.

    The input dictionary is loaded into a ``pandas.DataFrame``, the requested
    operation is applied, and the result is returned via
    ``DataFrame.to_dict()``.

    Supported operations:
        clean: drop duplicate rows, then fill missing numeric values with the
            column mean.
        encode: label-encode every ``object``-dtype column (values are cast
            to ``str`` first, so missing values become their own label).
        normalize: standardize all numeric columns with ``StandardScaler``.
        impute: fill missing numeric values with the column mean and any
            remaining missing values with the column's most frequent value.

    Any other operation name leaves the data unchanged.
    """

    name = "data_preprocessor"
    description = "Handles data loading, cleaning, and preprocessing tasks"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "operation": {"type": "string", "description": "Operation to perform: clean/encode/normalize/impute"}
    }
    output_type = "dict"

    def forward(self, data: dict, operation: str) -> dict:
        """Run ``operation`` on ``data`` and return the processed table.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            operation: One of ``clean``, ``encode``, ``normalize``, ``impute``.

        Returns:
            The processed frame as produced by ``DataFrame.to_dict()``.
        """
        df = pd.DataFrame(data)

        if operation == "clean":
            df = df.drop_duplicates()
            df = df.fillna(df.mean(numeric_only=True))
        elif operation == "encode":
            le = preprocessing.LabelEncoder()
            for col in df.select_dtypes(include=['object']).columns:
                # fit_transform refits the encoder, so reusing one instance
                # across columns does not leak labels between them.
                df[col] = le.fit_transform(df[col].astype(str))
        elif operation == "normalize":
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            # StandardScaler rejects input with zero features or zero rows;
            # in that case there is nothing to scale, so leave the frame as is.
            if len(numeric_cols) > 0 and len(df) > 0:
                scaler = preprocessing.StandardScaler()
                df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        elif operation == "impute":
            # Numeric gaps: column mean (same rule as "clean").
            df = df.fillna(df.mean(numeric_only=True))
            # Whatever is still missing is non-numeric (or an all-missing
            # numeric column): use the most frequent value when one exists.
            for col in df.columns[df.isna().any()]:
                modes = df[col].mode()
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])

        return df.to_dict()
# 2. Statistical Analysis Tool
class StatisticalAnalysisTool(Tool):
    """Compute summary statistics, normality tests, or correlations.

    The input dictionary is loaded into a ``pandas.DataFrame``. Only numeric
    columns take part in skewness, kurtosis, normality tests and correlation,
    so mixed-type tables do not raise.
    """

    name = "statistical_analyzer"
    description = "Performs statistical analysis on data"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "analysis_type": {"type": "string", "description": "Type of analysis: descriptive/inferential/correlation"}
    }
    output_type = "dict"

    def forward(self, data: dict, analysis_type: str) -> dict:
        """Run the requested analysis.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            analysis_type: ``descriptive`` or ``inferential``; any other
                value (including ``correlation``) yields the correlation
                matrix.

        Returns:
            descriptive: ``{"summary", "skewness", "kurtosis"}`` dictionaries.
            inferential: per numeric column, the ``statistic`` and ``p_value``
                of ``scipy.stats.normaltest`` on the non-missing values.
            otherwise: the pairwise correlation matrix as nested dicts.
        """
        df = pd.DataFrame(data)

        if analysis_type == "descriptive":
            return {
                "summary": df.describe().to_dict(),
                # numeric_only=True: without it pandas >= 2.0 raises on
                # non-numeric columns instead of silently skipping them.
                "skewness": df.skew(numeric_only=True).to_dict(),
                "kurtosis": df.kurtosis(numeric_only=True).to_dict()
            }

        if analysis_type == "inferential":
            # Imported once here rather than on every loop iteration.
            from scipy import stats

            results = {}
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                stat, p_value = stats.normaltest(df[col].dropna())
                # Plain floats keep the result free of NumPy scalar types.
                results[col] = {"statistic": float(stat), "p_value": float(p_value)}
            return results

        return df.corr(numeric_only=True).to_dict()
# 3. Advanced Visualization Tool
class AdvancedVisualizationTool(Tool):
    """Build Plotly figures for PCA projections and k-means clusterings.

    Figures are returned as dictionaries (``Figure.to_dict()``) under the
    ``"plot"`` key. Unsupported ``viz_type`` values yield an empty dict.
    """

    name = "advanced_visualizer"
    description = "Creates advanced statistical and ML visualizations"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "viz_type": {"type": "string", "description": "Type of visualization"},
        "params": {"type": "dict", "description": "Additional parameters"}
    }
    output_type = "dict"

    def forward(self, data: dict, viz_type: str, params: dict) -> dict:
        """Create the requested visualization.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            viz_type: ``pca`` or ``cluster``.
            params: Extra options; ``None`` is treated as empty. Keys read
                for ``cluster``: ``n_clusters`` (default 3), ``x`` and ``y``
                (column names for the scatter axes) and the optional
                ``random_state`` (seed forwarded to ``KMeans``; omitted
                means unseeded, as before).

        Returns:
            ``{"plot": <figure dict>}`` for a supported ``viz_type``,
            otherwise ``{}``.
        """
        # Tolerate a missing params mapping so the cluster branch does not
        # fail on ``None.get``.
        params = params or {}
        df = pd.DataFrame(data)

        if viz_type == "pca":
            # Project all numeric columns onto the first two components.
            pca = decomposition.PCA(n_components=2)
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            pca_result = pca.fit_transform(df[numeric_cols])
            fig = px.scatter(x=pca_result[:, 0], y=pca_result[:, 1],
                             title='PCA Visualization')
            return {"plot": fig.to_dict()}

        if viz_type == "cluster":
            from sklearn.cluster import KMeans

            kmeans = KMeans(
                n_clusters=params.get("n_clusters", 3),
                random_state=params.get("random_state"),
            )
            # Clusters are fitted on every numeric column, while the plot
            # shows only the two columns named by params["x"] / params["y"].
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            clusters = kmeans.fit_predict(df[numeric_cols])
            fig = px.scatter(df, x=params.get("x"), y=params.get("y"),
                             color=clusters, title='Cluster Visualization')
            return {"plot": fig.to_dict()}

        return {}
# 4. Machine Learning Tool
class MLModelTool(Tool):
    """Fit a simple model on tabular data and report hold-out metrics.

    The table is split 80/20 into train and test partitions. A linear
    regression or a random-forest classifier is then fitted on the training
    part and scored on the test part.
    """

    name = "ml_modeler"
    description = "Trains and evaluates machine learning models"
    inputs = {
        "data": {"type": "dict", "description": "Input data dictionary"},
        "target": {"type": "string", "description": "Target column name"},
        "model_type": {"type": "string", "description": "Type of model to train"}
    }
    output_type = "dict"

    def forward(self, data: dict, target: str, model_type: str) -> dict:
        """Train the requested model and return its evaluation.

        Args:
            data: Mapping accepted by the ``pandas.DataFrame`` constructor.
            target: Name of the column to predict; all other columns are
                used as features.
            model_type: ``regression`` or ``classification``.

        Returns:
            regression: ``mse``, ``r2`` and per-feature ``coefficients``.
            classification: ``accuracy`` and per-feature
                ``feature_importance``.
            any other ``model_type``: an empty dict.
        """
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error, accuracy_score

        frame = pd.DataFrame(data)
        features = frame.drop(columns=[target])
        labels = frame[target]
        feat_train, feat_test, label_train, label_test = train_test_split(
            features, labels, test_size=0.2
        )

        if model_type == "regression":
            from sklearn.linear_model import LinearRegression

            regressor = LinearRegression()
            regressor.fit(feat_train, label_train)
            predicted = regressor.predict(feat_test)

            report = {}
            report["mse"] = mean_squared_error(label_test, predicted)
            report["r2"] = regressor.score(feat_test, label_test)
            report["coefficients"] = dict(zip(features.columns, regressor.coef_))
            return report

        if model_type == "classification":
            from sklearn.ensemble import RandomForestClassifier

            classifier = RandomForestClassifier()
            classifier.fit(feat_train, label_train)
            predicted = classifier.predict(feat_test)

            report = {}
            report["accuracy"] = accuracy_score(label_test, predicted)
            report["feature_importance"] = dict(
                zip(features.columns, classifier.feature_importances_)
            )
            return report

        # Unrecognised model types produce no metrics.
        return {}