import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


def _dummy_flag(df, column):
    """Return a 0/1 Series for one-hot ``column``, or 0 if the column is absent."""
    if column in df.columns:
        return (df[column] == 1).astype(int)
    return 0


def get_cleaned_data(csv_path='German Credit Data.csv', risk_threshold=3):
    """Load the credit CSV, encode it, and attach a rule-based ``credit_risk`` label.

    Steps performed:

    1. Read ``csv_path`` and replace missing values in ``'Saving accounts'`` /
       ``'Checking account'`` with ``'No Savings'`` / ``'No Checking'``.
    2. Drop the ``'Unnamed: 0'`` column.
    3. Label-encode the two account columns and one-hot encode ``'Purpose'``,
       ``'Sex'``, ``'Housing'`` and ``'Job'``; the whole frame is cast to int.
    4. Add one risk point for each of: credit amount > 5000, duration > 24,
       no savings account, no checking account, purpose radio/TV, rented
       housing, job category 0. One-hot columns that do not exist in the data
       contribute 0 points.
    5. Set ``credit_risk`` to 1 when the score is >= ``risk_threshold``, else 0.

    Args:
        csv_path: Path of the CSV file to read.
        risk_threshold: Minimum score that is labelled high risk (1).

    Returns:
        The encoded ``pandas.DataFrame`` with an added ``credit_risk`` column.

    Raises:
        KeyError: If an expected column is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Fill missing values with explicit "no account" categories.
    df['Saving accounts'] = df['Saving accounts'].fillna('No Savings')
    df['Checking account'] = df['Checking account'].fillna('No Checking')
    df = df.drop(columns='Unnamed: 0')

    # Capture the "no account" flags BEFORE label encoding. Comparing the
    # encoded value with 0 only works when the placeholder happens to sort
    # first among the labels and is actually present in the column.
    no_savings = (df['Saving accounts'] == 'No Savings').astype(int)
    no_checking = (df['Checking account'] == 'No Checking').astype(int)

    # Encode the ordinal-like account columns as integer codes.
    df['Saving accounts'] = LabelEncoder().fit_transform(df['Saving accounts'])
    df['Checking account'] = LabelEncoder().fit_transform(df['Checking account'])

    # One-hot encode the remaining categorical columns.
    df = pd.get_dummies(
        df, columns=['Purpose', 'Sex', 'Housing', 'Job']
    ).astype(int)

    # NOTE(review): the scoring reads 'Duration'; an earlier unused column
    # list in this function named it 'Duration in month' - confirm the CSV
    # header.
    # Every optional term goes through _dummy_flag so that each one is an
    # independent addend (inline `x if cond else 0 + ...` chains swallow the
    # terms that follow them because of conditional-expression precedence).
    risk_score = (
        (df['Credit amount'] > 5000).astype(int)
        + (df['Duration'] > 24).astype(int)
        + no_savings
        + no_checking
        + _dummy_flag(df, 'Purpose_radio/TV')
        + _dummy_flag(df, 'Housing_rent')
        + _dummy_flag(df, 'Job_0')
    )

    # High risk (1) when the score reaches the threshold, else low risk (0).
    df['credit_risk'] = (risk_score >= risk_threshold).astype(int)
    return df