import streamlit as st
import pandas as pd
import numpy as np


def process_with_smolagents(dataset, operation, custom_code=None):
    """
    Process the dataset with the selected operation.

    Args:
        dataset: Pandas DataFrame to process
        operation: Type of processing operation
        custom_code: Custom code to execute (for custom processing)

    Returns:
        Processed pandas DataFrame
    """
    if dataset is None:
        raise ValueError("No dataset provided")

    # Create a copy to avoid modifying the original
    processed_df = dataset.copy()

    try:
        if operation == "Data Cleaning":
            processed_df = clean_dataset(processed_df)
        elif operation == "Feature Engineering":
            processed_df = engineer_features(processed_df)
        elif operation == "Data Transformation":
            processed_df = transform_dataset(processed_df)
        elif operation == "Custom Processing" and custom_code:
            # Execute custom code.
            # Note: exec() on user input is a security risk in a real
            # application and should be replaced with a sandboxed approach.
            local_vars = {"df": processed_df}
            exec(custom_code, {"pd": pd, "np": np}, local_vars)
            processed_df = local_vars["df"]
        else:
            raise ValueError(f"Unsupported operation: {operation}")

        return processed_df
    except Exception as e:
        st.error(f"Error during processing: {str(e)}")
        raise


def clean_dataset(df):
    """
    Clean the dataset by handling missing values, duplicates, and outliers.

    Args:
        df: Pandas DataFrame to clean

    Returns:
        Cleaned pandas DataFrame
    """
    # Create a copy to avoid modifying the original
    cleaned_df = df.copy()

    # Remove duplicate rows
    cleaned_df = cleaned_df.drop_duplicates()

    # Handle missing values
    for col in cleaned_df.columns:
        # For numeric columns
        if pd.api.types.is_numeric_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            # Otherwise impute with the median
            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
        # For categorical columns
        elif pd.api.types.is_object_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            # Otherwise impute with the mode
            mode_series = cleaned_df[col].mode()
            mode_value = mode_series.iloc[0] if not mode_series.empty else "Unknown"
            cleaned_df[col] = cleaned_df[col].fillna(mode_value)

    # Handle outliers in numeric columns
    for col in cleaned_df.select_dtypes(include=[np.number]).columns:
        # Skip if too many missing values
        if cleaned_df[col].isna().mean() > 0.1:
            continue

        # Calculate IQR
        q1 = cleaned_df[col].quantile(0.25)
        q3 = cleaned_df[col].quantile(0.75)
        iqr = q3 - q1

        # Define bounds
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Cap outliers instead of removing them
        cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)

    return cleaned_df
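
# --- Usage sketch (illustrative, not part of the app) ------------------------
# A minimal example of the "Custom Processing" path above. The toy frame and
# the snippet (column names "price", "qty", "total") are hypothetical; in the
# real app the custom code comes from the user at runtime, and exec() should
# be sandboxed before anything like this ships.
def _example_custom_processing():
    demo = pd.DataFrame({"price": [1.0, 2.0, None], "qty": [3, 0, 5]})
    snippet = "df['total'] = df['price'].fillna(0) * df['qty']"
    return process_with_smolagents(demo, "Custom Processing", custom_code=snippet)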
def engineer_features(df):
    """
    Perform basic feature engineering on the dataset.

    Args:
        df: Pandas DataFrame to process

    Returns:
        DataFrame with engineered features
    """
    # Create a copy to avoid modifying the original
    engineered_df = df.copy()

    # Get numeric columns
    numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns

    # Only build interactions when there are at least 2 numeric columns
    if len(numeric_cols) >= 2:
        # Create interaction features for pairs of numeric columns.
        # Limit to the first 5 columns to avoid feature explosion.
        for i, col1 in enumerate(numeric_cols[:5]):
            for col2 in numeric_cols[i + 1:5]:
                # Product interaction
                engineered_df[f"{col1}_{col2}_product"] = (
                    engineered_df[col1] * engineered_df[col2]
                )
                # Ratio interaction (avoid division by zero)
                denominator = engineered_df[col2].replace(0, np.nan)
                engineered_df[f"{col1}_{col2}_ratio"] = engineered_df[col1] / denominator

    # Create binary features from categorical columns
    cat_cols = engineered_df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        # Skip if too many unique values (>10)
        if engineered_df[col].nunique() > 10:
            continue

        # One-hot encode
        dummies = pd.get_dummies(engineered_df[col], prefix=col, drop_first=True)
        engineered_df = pd.concat([engineered_df, dummies], axis=1)

    # Create aggregated features
    if len(numeric_cols) >= 3:
        # Sum of all numeric features
        engineered_df['sum_numeric'] = engineered_df[numeric_cols].sum(axis=1)
        # Mean of all numeric features
        engineered_df['mean_numeric'] = engineered_df[numeric_cols].mean(axis=1)
        # Standard deviation of numeric features
        engineered_df['std_numeric'] = engineered_df[numeric_cols].std(axis=1)

    return engineered_df


def transform_dataset(df):
    """
    Perform data transformations on the dataset.

    Args:
        df: Pandas DataFrame to transform

    Returns:
        Transformed pandas DataFrame
    """
    # Lazy import so scikit-learn is only required for this operation
    from sklearn.preprocessing import StandardScaler, MinMaxScaler

    # Create a copy to avoid modifying the original
    transformed_df = df.copy()

    # Get numeric columns
    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        # Standard scaling (z-score)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(transformed_df[numeric_cols])
        scaled_df = pd.DataFrame(
            scaled_data,
            columns=[f"{col}_scaled" for col in numeric_cols],
            index=transformed_df.index
        )

        # Min-max scaling (0-1 range)
        minmax_scaler = MinMaxScaler()
        minmax_data = minmax_scaler.fit_transform(transformed_df[numeric_cols])
        minmax_df = pd.DataFrame(
            minmax_data,
            columns=[f"{col}_normalized" for col in numeric_cols],
            index=transformed_df.index
        )

        # Log transform (for strictly positive columns only)
        for col in numeric_cols:
            if (transformed_df[col] > 0).all():
                transformed_df[f"{col}_log"] = np.log(transformed_df[col])

        # Combine all transformations
        transformed_df = pd.concat([transformed_df, scaled_df, minmax_df], axis=1)

    # One-hot encode categorical columns
    cat_cols = transformed_df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        transformed_df = pd.get_dummies(transformed_df, columns=cat_cols, drop_first=False)

    return transformed_df
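

if __name__ == "__main__":
    # Minimal smoke test with synthetic data. The column names and values
    # below are made up for illustration and are not part of the app itself;
    # "Data Transformation" additionally requires scikit-learn.
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "a": rng.normal(size=20),
        "b": rng.normal(size=20),
        "c": rng.integers(1, 5, size=20).astype(float),
        "label": rng.choice(["x", "y"], size=20),
    })
    demo.loc[0, "a"] = np.nan  # seed a missing value for the cleaning path

    cleaned = process_with_smolagents(demo, "Data Cleaning")
    print("Data Cleaning ->", cleaned.shape)

    # Feed the cleaned frame to the other operations (the scalers reject NaNs)
    for op in ("Feature Engineering", "Data Transformation"):
        out = process_with_smolagents(cleaned, op)
        print(op, "->", out.shape)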