# Spaces: Running
import sys

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
def load_data(file_path) -> pd.DataFrame:
    """Load dataset from a CSV file.

    Args:
        file_path: Location of the CSV data; handed unchanged to
            ``pandas.read_csv``, so anything that function accepts works
            (a path string, a path object, or an open file-like object).

    Returns:
        The parsed contents as a ``DataFrame``, using the default
        ``pandas.read_csv`` settings.
    """
    return pd.read_csv(file_path)
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Handle missing values in the dataset.

    Numerical columns (``float64`` / ``int64``) have their gaps filled with
    the column median; categorical (``object``) columns are filled with their
    most frequent value, the smallest value winning a tie.  A column that has
    no observed value at all is left untouched, since there is nothing to
    impute from.  Frames lacking numerical or categorical columns are handled
    without error.

    Args:
        df: Frame to clean.  It is modified in place.

    Returns:
        The same ``df`` object, so calls can be chained.
    """
    # Impute numerical columns with the median
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_cols:
        # Cast first so integer columns also come back as float64, the dtype
        # a scikit-learn median imputer would hand back for every column.
        df[col] = df[col].astype('float64')
        median = df[col].median()  # NaN when the column holds no value
        if not pd.isna(median):
            df.loc[df[col].isna(), col] = median

    # Impute categorical columns with the most frequent value
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        modes = df[col].mode()  # sorted; missing values are not counted
        if not modes.empty:
            # Assign through .loc so the column keeps its object dtype.
            df.loc[df[col].isna(), col] = modes.iloc[0]
    return df
def encode_categorical_variables(df: pd.DataFrame) -> pd.DataFrame:
    """Encode categorical variables using Label Encoding.

    Every ``object`` column is replaced by integer codes that number the
    column's distinct values in sorted order, starting at 0.  Missing values
    receive the code ``-1``.  Columns of other dtypes are left alone.

    Args:
        df: Frame whose categorical columns should be encoded.  It is
            modified in place.

    Returns:
        The same ``df`` object, so calls can be chained.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        # sort=True numbers the sorted distinct values, i.e. the classic
        # label-encoding order; the distinct values themselves are not kept.
        codes, _ = pd.factorize(df[col], sort=True)
        df[col] = codes
    return df
def preprocess_data(file_path) -> pd.DataFrame:
    """Load, preprocess, and return the dataset.

    Runs the full pipeline in order: read the CSV with ``load_data``, fill
    gaps with ``handle_missing_values``, then turn categorical columns into
    integer codes with ``encode_categorical_variables``.

    Args:
        file_path: Location of the CSV data, passed on to ``load_data``.

    Returns:
        The loaded frame after imputation and encoding.
    """
    df = load_data(file_path)
    # Impute before encoding so the encoder sees complete columns.
    df = handle_missing_values(df)
    df = encode_categorical_variables(df)
    return df
if __name__ == "__main__":
    # Optional command-line overrides: script.py [input_csv [output_csv]].
    # With no arguments the script behaves as before and uses the defaults.
    cli_args = sys.argv[1:]
    file_path = cli_args[0] if cli_args else 'path_to_your_data.csv'  # Replace with your actual file path
    output_path = cli_args[1] if len(cli_args) > 1 else 'processed_data.csv'
    processed_data = preprocess_data(file_path)
    # index=False keeps the row index out of the written file.
    processed_data.to_csv(output_path, index=False)