Home Credit Default Risk Project
Table of Contents¶
- 1. Definition
  - 1.1 Project Overview
  - 1.2 Problem Statement
  - 1.3 Metrics
- 2. Analysis
  - 2.1 Data Exploration
  - 2.2 Exploratory Visualization
- 3. Algorithms and Techniques
  - 3.1 Benchmark: Logistic Regression
  - 3.2 Random Forest
  - 3.3 Boosting
- 4. Data Preprocessing
  - 4.1 Find Anomaly
  - 4.2 Missing Values
  - 4.3 Replace XNA & XAP
  - 4.4 Replace Outliers
  - 4.5 Scaling and Encoding
- 5. Models
  - 5.1 Data Preparation
  - 5.2 Logistic Regression
  - 5.3 Random Forest
  - 5.4 XGBoost
  - 5.5 LightGBM
1. Definition¶
1.1 Project Overview ¶
We will take an initial look at the Home Credit Default Risk machine learning competition hosted on Kaggle. The objective of this competition is to use historical loan application data to predict whether or not an applicant will be able to repay a loan. This is a standard supervised classification task:
Supervised: The labels are included in the training data and the goal is to train a model to learn to predict the labels from the features.
Classification: The label is a binary variable: 0 (will repay the loan on time), 1 (will have difficulty repaying the loan).
1.2 Problem Statement ¶
In this study, we attempt to answer the following problem statement: can we predict how capable each applicant is of repaying a loan?
The objective of this competition is to use historical loan application data to predict whether or not an applicant will be able to repay a loan. This is a standard supervised classification problem where the label is a binary variable: 0 (will repay the loan on time), 1 (will have difficulty repaying the loan). Although the label itself is binary, the quantity we submit for each applicant is the predicted probability of repayment difficulty, so our models must output a continuous score between 0 and 1 rather than a hard class prediction.
Supervised learning is where you have input variables (x) and an output variable (Y) and you use an algorithm to learn the mapping function from the input to the output, in other words Y = f(X). The goal is to approximate the mapping function so well that when you have new input data (x) that you can predict the output variables (Y) for that data.
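In code terms, this mapping is exactly what fit and predict implement. A minimal scikit-learn sketch with placeholder data (not the competition data):
from sklearn.linear_model import LogisticRegression
# Placeholder inputs: X is a matrix of input variables (x), y the known output variable (Y)
X = [[0.2, 1.5], [1.1, 0.3], [0.9, 2.2], [1.8, 0.1]]
y = [0, 1, 0, 1]
model = LogisticRegression()
model.fit(X, y)                      # learn the mapping f: X -> Y
print(model.predict([[1.0, 1.0]]))   # predict Y for new input data (x)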
1.3 Metrics ¶
The metric chosen to measure the performance of the models in this project is the same as the one used in the Kaggle competition: results are evaluated on the area under the ROC curve between the predicted probability and the observed target.
The ROC curve (Receiver Operating Characteristic curve) is a graph showing the performance of a classification model at all classification thresholds. An ROC curve plots True Positive Rate vs. False Positive Rate at different classification thresholds.
The Area Under the ROC Curve, also known as AUC, measures the entire two-dimensional area underneath the entire ROC curve from (0,0) to (1,1). AUC ranges in value from 0 to 1. A model whose predictions are 100% wrong has an AUC of 0.0; one whose predictions are 100% correct has an AUC of 1.0.
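As a small sketch (toy arrays, purely for illustration), both the ROC curve and the AUC can be computed with scikit-learn:
from sklearn.metrics import roc_curve, roc_auc_score
# Toy labels and predicted probabilities, for illustration only
y_true = [0, 0, 1, 1, 0, 1]
y_prob = [0.1, 0.4, 0.35, 0.8, 0.2, 0.9]
fpr, tpr, thresholds = roc_curve(y_true, y_prob)   # points of the ROC curve
print(roc_auc_score(y_true, y_prob))               # area under that curve, between 0 and 1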
2. Analysis ¶
We are using a typical data science stack: numpy, pandas, sklearn, matplotlib
Let's get into exploring the data.
2.1 Data Exploration¶
The data is provided by Home Credit, a service dedicated to providing lines of credit (loans) to the unbanked population. Predicting whether or not a client will repay a loan or have difficulty is a critical business need, and Home Credit hosted this competition on Kaggle to see what sort of models the machine learning community can develop to help them in this task.
There are 7 different sources of data:
- application_train/application_test: the main training and testing data with information about each loan application at Home Credit. Every loan has its own row and is identified by the feature SK_ID_CURR. The training application data comes with the TARGET indicating 0: the loan was repaid or 1: the loan was not repaid.
- bureau: data concerning client's previous credits from other financial institutions. Each previous credit has its own row in bureau, but one loan in the application data can have multiple previous credits.
- bureau_balance: monthly data about the previous credits in bureau. Each row is one month of a previous credit, and a single previous credit can have multiple rows, one for each month of the credit length.
- previous_application: previous applications for loans at Home Credit of clients who have loans in the application data. Each current loan in the application data can have multiple previous loans. Each previous application has one row and is identified by the feature SK_ID_PREV.
- POS_CASH_BALANCE: monthly data about previous point of sale or cash loans clients have had with Home Credit. Each row is one month of a previous point of sale or cash loan, and a single previous loan can have many rows.
- credit_card_balance: monthly data about previous credit cards clients have had with Home Credit. Each row is one month of a credit card balance, and a single credit card can have many rows.
- installments_payments: payment history for previous loans at Home Credit. There is one row for every payment that was made and one row for every missed payment.
The diagram provided with the competition data shows how all of these tables are related.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
# suppress warnings
import warnings
warnings.filterwarnings('ignore')
Import Data
# Training and Test data
app_train = pd.read_csv("./home-credit-default-risk/application_train.csv")
app_test = pd.read_csv('./home-credit-default-risk/application_test.csv')
app_train.head()
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
print("Training data shape: ", app_train.shape)
Training data shape: (307511, 122)
The training data has 307511 observations (each one a separate loan) and 122 features (variables) including the TARGET (the label we want to predict).
app_test.head()
SK_ID_CURR | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | Cash loans | F | N | Y | 0 | 135000.0 | 568800.0 | 20560.5 | 450000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 100005 | Cash loans | M | N | Y | 0 | 99000.0 | 222768.0 | 17370.0 | 180000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
2 | 100013 | Cash loans | M | Y | Y | 0 | 202500.0 | 663264.0 | 69777.0 | 630000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 |
3 | 100028 | Cash loans | F | N | Y | 2 | 315000.0 | 1575000.0 | 49018.5 | 1575000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
4 | 100038 | Cash loans | M | Y | N | 1 | 180000.0 | 625500.0 | 32067.0 | 625500.0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 121 columns
print('Testing data shape: ', app_test.shape)
Testing data shape: (48744, 121)
The test data has 48744 observations (each one a separate loan) and 121 features (variables) excluding the TARGET (the label we want to predict).
2.2 Exploratory Visualization ¶
The target is what we are asked to predict:
- 0: the loan was repaid on time
- 1: the client had payment difficulties
We can first examine the number of loans falling into each category.
df = pd.read_csv('./home-credit-default-risk/application_train.csv')
def plot_stats(df, feature, title, label_rotation=False, horizontal_layout=True):
    temp = df[feature].value_counts()
    df1 = pd.DataFrame({feature: temp.index, 'Number of contracts': temp.values})
    # Calculate the percentage of target=1 per category value
    cat_perc = df[[feature, 'TARGET']].groupby([feature], as_index=False).mean()
    cat_perc.sort_values(by='TARGET', ascending=False, inplace=True)
    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))
    sns.color_palette("flare")
    s = sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=df1)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    s = sns.barplot(ax=ax2, x=feature, y='TARGET', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    plt.ylabel('Percent of target with value 1 [%]', fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.suptitle(title)
    plt.show()
count = app_train['TARGET'].value_counts()
count_df = pd.DataFrame({'labels': count.index, 'values':count.values})
plt.figure(figsize=(6,6))
plt.title("Application loans dataset")
sns.barplot(x = 'labels', y="values", data=count_df)
plt.show()
From the above we can see that there is an imbalance in the target values: far more loans were repaid on time than not repaid.
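A quick way to quantify this imbalance on the already-loaded training data:
# Share of each TARGET class (roughly 92% repaid vs. 8% with difficulties)
print(app_train['TARGET'].value_counts(normalize=True))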
Contract Type
plot_stats(df, 'NAME_CONTRACT_TYPE', "Loan Type")
Gender
plot_stats(df, "CODE_GENDER", "Gender", True, True)
Client Accompanied By:
plot_stats(df, 'NAME_TYPE_SUITE', "Accompanied By", True, True, )
Most clients are unaccompanied while applying for the loan. In terms of the percentage of non-repayment, clients accompanied by Other_B and Other_A are less likely to repay.
Family Status of Client
plot_stats(df, 'NAME_FAMILY_STATUS', "Family Status", True, True, )
plot_stats(df, 'CNT_CHILDREN', "Number of Children")
- The majority of loans are taken by clients with no children.
- The percentage of unpaid loans for clients with 9 or more children is 100%.
- It is over 25% for clients with 6 children and decreases with fewer children.
Income Type
plot_stats(df, 'NAME_INCOME_TYPE', "Income Type", True, True )
Most loan applicants have the income type Working, followed by Commercial associate, Pensioner and State servant.
- Applicants with the income type Maternity leave have a non-repayment rate of almost 40%.
- Non-repayment among the Unemployed is 37%.
- The rest of the income types are below the average of 10% non-repayment.
Occupation
plot_stats(df, "OCCUPATION_TYPE", "Occupation Type", True, True)
- Most of the loans are taken by Laborers, followed by Sales staff.
- IT staff take the lowest number of loans.
- The highest percentage of not-repaid loans is for Low-skill Laborers (above 15%), followed by Drivers, Waiters/barmen staff, Security staff, Laborers and Cooking staff.
Client Housing
plot_stats(df, "NAME_HOUSING_TYPE", "Client Housing", True, True)
Over 250,000 applicants registered their housing as House/apartment; the following categories have a much smaller number of clients (With parents, Municipal apartment).
Among these categories, Rented apartment and With parents have more than 10% non-repayment.
Client's Education
plot_stats(df, 'NAME_EDUCATION_TYPE', "Clients Education", True, True)
The majority of the clients have Secondary / secondary special education, followed by clients with Higher education. Only a very small number have an academic degree.
The Lower secondary category, although rare, has the highest rate of not returning the loan (11%). People with an Academic degree have a non-repayment rate of less than 2%.
Organization Type
plot_stats(df, 'ORGANIZATION_TYPE', 'Organization Type', True, False)
Organizations with highest percent of loans not repaid are Transport: type 3 (16%), Industry: type 13 (13.5%), Industry: type 8 (12.5%) and Restaurant (less than 12%).
Days from birth distribution
def plot_distribution(feature, color):
    plt.figure(figsize=(10, 6))
    plt.title("Distribution of %s" % feature)
    sns.distplot(df[feature].dropna(), color=color, kde=True, bins=100)
    plt.show()
plot_distribution('DAYS_BIRTH', 'red')
The values are negative because the number of days since birth is recorded relative to the current application date (days in the past are negative).
app_train['DAYS_BIRTH'] = abs(app_train["DAYS_BIRTH"])
# plot distribution
plt.hist(app_train['DAYS_BIRTH'] / 365, edgecolor='k', bins=30, color='red')
plt.title('Age of Client')
plt.xlabel('Age(years)')
plt.ylabel('Count')
plt.show()
There are no outliers in the age distribution. Let's see how age affects the target variable.
plt.figure(figsize=(10, 8))
sns.kdeplot(app_train.loc[df['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = 'target == 0')
sns.kdeplot(app_train.loc[df['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = 'target == 1')
plt.legend()
The target == 1 distribution is skewed towards the younger ages (roughly 20-35 years), so this variable is likely to be useful in the model.
Registered city not living city and not working city
plot_stats(app_train,'REG_CITY_NOT_LIVE_CITY', "Not Live in City", False, True)
plot_stats(app_train, 'REG_CITY_NOT_WORK_CITY', "Not Work in City")
Those who register in a city other than their workplace or residence exhibit a higher likelihood of not repaying loans compared to those who register in the same city (with repayment rates of 11% for work-related registrations and 12% for residence-related registrations).
numeric_cols = app_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = app_train.select_dtypes('object').columns.tolist()
3. Algorithms and Techniques ¶
3.1 Benchmark: Logistic Regression ¶
In this section, we define a benchmark result that serves as a threshold for comparing the performance obtained by our solutions.
Logistic Regression is a machine learning classification algorithm that is used to predict the probability of a categorical dependent variable. It is not as sophisticated as the ensemble and boosted-tree methods discussed in the following sections, and hence it provides us with a good benchmark.
Binary logistic regression requires the dependent variable to be binary, and factor level 1 of the dependent variable should represent the desired outcome. The features should also be independent of each other; the model should have little or no multicollinearity.
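As a sketch of how this benchmark could be validated locally before submitting to Kaggle (reusing the train_X and train_Y prepared in Section 5.1), the cross-validated ROC AUC can be estimated with scikit-learn:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Illustrative only: 3-fold cross-validated ROC AUC for a plain logistic regression
logreg = LogisticRegression(max_iter=1000)
cv_auc = cross_val_score(logreg, train_X, train_Y, cv=3, scoring='roc_auc')
print(cv_auc.mean())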
3.2 Random Forest ¶
Random Forest is the first model we use to try to improve on the logistic regression benchmark defined above.
Random Forest is a popular and versatile machine learning method capable of solving both regression and classification problems. It is a brand of ensemble learning, as it relies on an ensemble of decision trees: it aggregates classification (or regression) trees, where a decision tree is a series of decisions that can be used to classify an observation in a dataset.
Random Forest fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve predictive accuracy and control over-fitting. It can handle a large number of features, and is helpful for estimating which of the variables are important in the underlying data being modeled.
Random Forest can be implemented using Python's sklearn library (i.e. sklearn.ensemble.RandomForestClassifier). A number of parameters are provided with defaults by sklearn, such as the following (see the sketch after this list):
- n_estimators: the number of trees in the forest
- max_features: the number of features to consider when looking for the best split
- min_samples_leaf: the minimum number of samples required to be at a leaf node
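A sketch of how these parameters map onto the sklearn API (the values below are illustrative, not tuned):
from sklearn.ensemble import RandomForestClassifier
# Illustrative parameter choices only
rf = RandomForestClassifier(
    n_estimators=100,     # number of trees in the forest
    max_features='sqrt',  # number of features considered when looking for the best split
    min_samples_leaf=5,   # minimum number of samples required to be at a leaf node
    n_jobs=-1,
    random_state=50)
# rf.fit(train_X, train_Y) would then train the forest on the prepared data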
3.3 Boosting ¶
XGBoost
XGBoost (eXtreme Gradient Boosting) is an optimized gradient boosting library that builds an ensemble of decision trees sequentially, with each new tree correcting the errors of the previous ones; we use its XGBClassifier for this binary classification task.
Light GBM
Light GBM is a fast, distributed, high-performance gradient boosting framework based on decision tree algorithms, used for ranking, classification and many other machine learning tasks.
Light GBM splits the tree leaf-wise, choosing the leaf with the best fit, whereas other boosting algorithms split the tree depth-wise or level-wise. When growing the same number of leaves, the leaf-wise algorithm can reduce more loss than the level-wise algorithm, which often results in better accuracy.
Leaf-wise splitting enables Light GBM to converge much faster, but it can also lead to overfitting. Key parameters in Light GBM are:
- num_iterations: number of boosting iterations to be performed ; default=100; type=int
- num_leaves : number of leaves in one tree ; default = 31 ; type =int
- min_data_in_leaf : Min number of data in one leaf.
- max_depth: Specify the max depth to which tree will grow. This parameter is used to deal with overfitting.
LightGBM can be implemented using the latest release on Microsoft's GitHub portal: https://github.com/Microsoft/LightGBM
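A minimal sketch of how these parameters appear in the scikit-learn-style LGBMClassifier interface (the values shown are the library defaults plus the binary objective, for illustration):
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(
    n_estimators=100,      # num_iterations: number of boosting iterations
    num_leaves=31,         # number of leaves in one tree
    min_child_samples=20,  # min_data_in_leaf: minimum number of data points in one leaf
    max_depth=-1,          # -1 means no depth limit; lower values help control overfitting
    objective='binary')
# lgbm.fit(train_X, train_Y) would then train the model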
4. Data Preprocessing ¶
4.1 Find Anomaly ¶
(app_train['DAYS_BIRTH'] / 365).describe()
count    307511.000000
mean         43.936973
std          11.956133
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: DAYS_BIRTH, dtype: float64
(app_train['DAYS_EMPLOYED']).describe()
count    307511.000000
mean      63815.045904
std      141275.766519
min      -17912.000000
25%       -2760.000000
50%       -1213.000000
75%        -289.000000
max      365243.000000
Name: DAYS_EMPLOYED, dtype: float64
The maximum value (365243) is positive, which is not correct: days of employment are recorded relative to the current application date and should therefore be negative, like the majority of the other values in this column.
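A quick sketch to confirm how many rows carry this sentinel value (run on the already-loaded app_train):
# Count rows where DAYS_EMPLOYED holds the anomalous positive sentinel value
anom = app_train['DAYS_EMPLOYED'] == 365243
print(anom.sum(), 'of', len(app_train), 'rows have DAYS_EMPLOYED == 365243')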
app_train['DAYS_EMPLOYED'].plot.box();
We can see that there is an issue in the data that needs to be corrected.
# app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram', color="aquamarine");
# plt.xlabel('Days Employment');
4.2 Missing Values ¶
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    # Return the dataframe with missing information
    return mis_val_table_ren_columns
4.3 Replace XNA & XAP ¶
def replace_XNA_XAP(table):
    cols = table.columns.to_list()
    for col in cols:
        # 'XNA'/'XAP' entries are confined to these two columns
        if col in ['CODE_GENDER', 'ORGANIZATION_TYPE']:
            # Check if the column contains string values before applying the .str accessor
            if table[col].dtype == 'O':
                # Replace all values of 'XNA', 'XAP' with np.nan
                table[col] = table[col].str.strip().replace({'XNA': np.nan, 'XAP': np.nan})
    # table.replace(to_replace={'XNA': np.nan, 'XAP': np.nan}, value=np.nan, regex=True, inplace=True)
    return table
# https://www.kaggle.com/code/jamesdellinger/home-credit-putting-all-the-steps-together?scriptVersionId=5486249&cellId=11
def preprocess_main(df, flag):
    # For the training data (flag == 1), separate the target from the features;
    # the test data has no TARGET column.
    if flag == 1:
        y_train = df['TARGET']
        X = df.drop('TARGET', axis=1)
    else:
        X = df
    # Replace all entries of 365243 in 'DAYS_EMPLOYED' with nan
    X['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    # Replace all entries of 0 in 'DAYS_LAST_PHONE_CHANGE' with nan
    X['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
    # Replace all entries of 'XNA' or 'XAP' in the main data table with np.nan
    # (such entries should be confined to 'CODE_GENDER' and 'ORGANIZATION_TYPE')
    X = replace_XNA_XAP(X)
    # Two rows in the training table have a value of 'Unknown' for
    # 'NAME_FAMILY_STATUS', but no rows in the test table do.
    X['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
    # Five rows in the training table have a value of 'Maternity leave' for
    # 'NAME_INCOME_TYPE', but no rows in the test table do.
    X['NAME_INCOME_TYPE'].replace('Maternity leave', np.nan, inplace=True)
    # No rows in the training table have -1 for 'REGION_RATING_CLIENT_W_CITY',
    # but at least one row in the test table does.
    X['REGION_RATING_CLIENT_W_CITY'].replace(-1, np.nan, inplace=True)
    if flag == 1:
        return X, y_train
    return X
X_train, y_train = preprocess_main(app_train, 1)
X_test = preprocess_main(app_test, 0)
X_train.shape, y_train.shape, X_test.shape
((307511, 121), (307511,), (48744, 121))
4.4 Replace Outliers ¶
def replace_day_outliers(df):
    """Replace 365243 with np.nan in any columns with DAYS"""
    for col in df.columns:
        if "DAYS" in col:
            df[col] = df[col].replace({365243: np.nan})
    return df
app_train = replace_day_outliers(X_train)
app_test = replace_day_outliers(X_test)
table = missing_values_table(app_train)
table = missing_values_table(app_test)
# table[table['% of Total Values'] > 60]
Your selected dataframe has 121 columns. There are 72 columns that have missing values.
Your selected dataframe has 121 columns. There are 68 columns that have missing values.
Remove columns with more than 60% missing values
def remove_missing_col(df):
    miss_data = pd.DataFrame((df.isnull().sum()) * 100 / df.shape[0])
    miss_data_col = miss_data[miss_data[0] > 60].index
    data_new = df[[i for i in df.columns if i not in miss_data_col]]
    return data_new
app_train = remove_missing_col(app_train)
app_test = remove_missing_col(app_test)
table = missing_values_table(app_train)
table = missing_values_table(app_test)
Your selected dataframe has 104 columns. There are 55 columns that have missing values.
Your selected dataframe has 104 columns. There are 51 columns that have missing values.
# Create imputer function
# (https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn)
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values.

    Columns of dtype object are imputed with the most frequent value
    in column.
    Columns of other types are imputed with mean of column.
    """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
                               if X[c].dtype == np.dtype('O') else X[c].mean()
                               for c in X],
                              index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.fill)
df_train = pd.get_dummies(app_train)
df_test = pd.get_dummies(app_test)
df_train.shape, df_test.shape
((307511, 221), (48744, 221))
4.5 Scaling and Encoding ¶
# Drop the SK_ID_CURR from training data
temp = df_train['SK_ID_CURR']
train = df_train.drop(columns=['SK_ID_CURR'])
# Features
features = df_train.columns.to_list()
features = features[1:]
# Scale each features to 0-1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
# Impute missing values (numeric columns with the mean, object columns with the most frequent value)
train = DataFrameImputer().fit_transform(train)
## Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
base_train = pd.DataFrame(data=train, columns=features)
base_train['SK_ID_CURR'] = temp
print('Data shape: ', base_train.shape)
Data shape: (307511, 221)
# Drop the SK_ID_CURR from test data
temp = df_test['SK_ID_CURR']
test = df_test.drop(columns=['SK_ID_CURR'])
# Features
features = df_test.columns.to_list()
features = features[1:]
# Scale each features to 0-1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
# Impute missing values (numeric columns with the mean, object columns with the most frequent value)
test = DataFrameImputer().fit_transform(test)
## Repeat with the scaler
scaler.fit(test)
test = scaler.transform(test)
test = pd.DataFrame(data=test, columns=features)
Since we encoded the categorical data, we should check that all the columns in the train and test data match.
list_train = base_train.columns.tolist()
list_test = test.columns.tolist()
# Find values in train that are not in test
not_in_test = set(list_train) - set(list_test)
print("Values in train but not in test:", not_in_test)
Values in train but not in test: {'SK_ID_CURR'}
Align Data
# Align the Training and Testing data, keep only columns present in both dataframes
train, test = base_train.align(test, join = 'inner', axis = 1)
print('Training Features size: ', train.shape)
print('Testing Features size: ', test.shape)
Training Features size: (307511, 220)
Testing Features size: (48744, 220)
Check for missing values
missing_values_table(train)
Your selected dataframe has 220 columns. There are 0 columns that have missing values.
Missing Values | % of Total Values |
---|
missing_values_table(test)
Your selected dataframe has 220 columns. There are 0 columns that have missing values.
Missing Values | % of Total Values |
---|
5. Models ¶
#Import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,classification_report,roc_curve,auc, f1_score
# Model function: fit the given algorithm on the training data and return the
# predicted probability of the positive class (TARGET == 1) for the test data
def model_base(algorithm, dtrain_X, dtrain_Y, dtest_X, cols=None):
    algorithm.fit(dtrain_X[cols], dtrain_Y)
    prediction_probabilities = algorithm.predict_proba(dtest_X[cols])[:, 1]
    return prediction_probabilities
5.1 Data Preparation ¶
#separating dependent and independent variables
train_X = train[[i for i in train.columns if i not in ['SK_ID_CURR', 'TARGET']]]
train_Y = y_train
test_X = test[[i for i in test.columns if i not in ['SK_ID_CURR']]]
train_X.shape, train_Y.shape, test_X.shape
((307511, 220), (307511,), (48744, 220))
5.2 Logistic Regression¶
logit = LogisticRegression()
prediction_probabilities = model_base(logit,train_X,train_Y,test_X,train_X.columns)
The area under the Receiver Operating Characteristic curve (ROC AUC) is a metric that is well suited to imbalanced datasets, since it is computed from the predicted probabilities between 0 and 1 rather than from hard 0/1 predictions.
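Since the Kaggle test set carries no labels, a local estimate of ROC AUC needs a hold-out split of the training data. A hedged sketch (reusing train_X and train_Y from above):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Hold out 20% of the training data purely to estimate ROC AUC offline
X_tr, X_val, y_tr, y_val = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=42, stratify=train_Y)
logit_val = LogisticRegression()
logit_val.fit(X_tr, y_tr)
val_prob = logit_val.predict_proba(X_val)[:, 1]
print('Validation ROC AUC:', roc_auc_score(y_val, val_prob))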
prediction_probabilities
array([0.05003637, 0.20782622, 0.04599527, ..., 0.05561639, 0.04693359, 0.12761884])
# Creating Submission Dataframe
submit = pd.read_csv('./home-credit-default-risk/application_test.csv')
df = pd.DataFrame({'SK_ID_CURR': submit['SK_ID_CURR'], 'TARGET': prediction_probabilities})
df.to_csv('./logreg_baseline.csv', index = False)
Submitting the output to Kaggle resulted in an AUC score of 0.70 for the base logistic regression.
5.3 Improved Model: Random Forest¶
from sklearn.ensemble import RandomForestClassifier
# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
prediction_probabilities = model_base(random_forest,train_X,train_Y,test_X,train_X.columns)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 20.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 1.0min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed: 0.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed: 0.4s finished
prediction_probabilities
array([0.1 , 0.17, 0.09, ..., 0.14, 0.1 , 0.24])
# Creating Submission Dataframe
submit = pd.read_csv('./home-credit-default-risk/application_test.csv')
df = pd.DataFrame({'SK_ID_CURR': submit['SK_ID_CURR'], 'TARGET': prediction_probabilities})
df.to_csv('./rf.csv', index = False)
# pred = pd.read_csv('./rf.csv')
5.4 XGBoost¶
from xgboost import XGBClassifier
clf = XGBClassifier(learning_rate =0.01,
n_estimators=1000,
max_depth=4,
min_child_weight=4,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread=4,
scale_pos_weight=2,
seed=27)
clf
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.01, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=4, missing=nan, monotone_constraints=None, n_estimators=1000, n_jobs=None, nthread=4, num_parallel_tree=None, predictor=None, ...)
clf.fit(train_X, train_Y)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.01, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=4, missing=nan, monotone_constraints=None, n_estimators=1000, n_jobs=None, nthread=4, num_parallel_tree=None, predictor=None, ...)
XGB_clf_pred = clf.predict_proba(test_X)[:, 1]
XGB_clf_pred
array([0.07996661, 0.20139933, 0.0383848 , ..., 0.08075171, 0.08277728, 0.2158363 ], dtype=float32)
# Creating Submission Dataframe
submit = pd.read_csv('./home-credit-default-risk/application_test.csv')
df = pd.DataFrame({'SK_ID_CURR': submit['SK_ID_CURR'], 'TARGET': XGB_clf_pred})
df.to_csv('./xgb.csv', index = False)
5.5 Light GBM¶
from lightgbm import LGBMClassifier
LGB_clf = LGBMClassifier(n_estimators=100,
boosting_type='gbdt',
objective='binary',
metric='binary_logloss',
force_col_wise=True)
LGB_clf.fit(train_X, train_Y)
---------------------------------------------------------------------------
LightGBMError                             Traceback (most recent call last)
Cell In[93], line 9
----> 9 LGB_clf.fit(train_X, train_Y)
...
LightGBMError: Do not support special JSON characters in feature name.
Running LGB_clf.fit(train_X, train_Y) gave LightGBMError: Do not support special JSON characters in feature name. I assume this is because of special characters in the column names, so below I remove the unwanted characters from the column names and then rerun the fit on the training data.
# Remove unwanted characters from column names
train_X.columns = [col.replace(',', '').replace(']', '').replace('[', '').replace('{', '').replace('}', '')
.replace('"', '').replace(':', '').replace('/', '').replace(':', '').replace(' ', '').replace('_', '') for col in train_X.columns]
test_X.columns = [col.replace(',', '').replace(']', '').replace('[', '').replace('{', '').replace('}', '')
.replace('"', '').replace(':', '').replace('/', '').replace(':', '').replace(' ', '').replace('_', '') for col in test_X.columns]
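An equivalent, more compact sketch using a regular expression (removing every character that is not a letter or digit, which approximates the chained replaces above):
import re
# Keep only alphanumeric characters in the column names
train_X.columns = [re.sub(r'[^A-Za-z0-9]', '', col) for col in train_X.columns]
test_X.columns = [re.sub(r'[^A-Za-z0-9]', '', col) for col in test_X.columns]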
LGB_clf.fit(train_X, train_Y)
[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Total Bins 8872
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 215
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
LGBMClassifier(force_col_wise=True, metric='binary_logloss', objective='binary')
LGB_clf_pred = LGB_clf.predict_proba(test_X)[:, 1]
LGB_clf_pred
array([0.034586 , 0.10051357, 0.01817225, ..., 0.03762378, 0.03859904, 0.10736544])
# Creating Submission Dataframe
submit = pd.read_csv('./home-credit-default-risk/application_test.csv')
df = pd.DataFrame({'SK_ID_CURR': submit['SK_ID_CURR'], 'TARGET': LGB_clf_pred})
df.to_csv('./lgbm.csv', index = False)
## In progress