# Part of the eXplainable Machine Learning course for Machine Learning (MSc) studies at the University of Warsaw. @pbiecek @hbaniecki
# v0.1.0: 2022-10-13
# https://github.com/mim-uw/eXplainableMachineLearning-2023/tree/main/Homeworks/HW2
import dalex as dx
import xgboost
import shap
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import warnings
# Suppress every warning so the output stays readable.
# NOTE(review): this blanket filter also hides deprecation / future warnings
# emitted by the imported libraries — consider narrowing it to specific
# categories.
warnings.filterwarnings("ignore")
import platform
# Report the interpreter version, then build a {package name: version} mapping
# for the main libraries (evaluated as a bare expression, i.e. only displayed
# when run interactively).
print(f'Python {platform.python_version()}')
dict((pkg.__name__, pkg.__version__) for pkg in (dx, xgboost, shap, sklearn, pd, np))
# Load the Titanic dataset bundled with dalex and inspect it.
df = dx.datasets.load_titanic()
df
df.info()
# Convert every object-dtype column to the pandas 'category' dtype.
# The frame is rebuilt with `astype` instead of assigning through
# `df.loc[:, mask] = ...`: since pandas 2.0 the `.loc` form writes the new
# values into the existing object columns in place, so the dtype would
# silently stay 'object' and the conversion would be a no-op.
object_columns = df.select_dtypes(include=['object']).columns
df = df.astype({column: 'category' for column in object_columns})
# Second summary to confirm the dtype change.
df.info()
# Feature matrix: every column except the target.
X = df.drop(columns='survived')
# convert gender to binary only because the `max_cat_to_onehot` parameter in XGBoost is yet to be working properly..
X = pd.get_dummies(X, columns=["gender"], drop_first=True)
# Target vector.
y = df.survived
# Columns that are one-hot encoded inside the scikit-learn pipeline.
categorical_variables = ['class', 'embarked']
# One-hot encode the categorical columns and pass all remaining columns
# through unchanged. ColumnTransformer's default is remainder='drop', which
# would discard every column not listed here and leave the downstream model
# with only the encoded 'class' and 'embarked' features.
preprocessor = ColumnTransformer(
    [('categorical', OneHotEncoder(), categorical_variables)],
    remainder='passthrough',
)
# XGBoost classifier that is fed by the column preprocessor.
model = xgboost.XGBClassifier(
    n_estimators=200, max_depth=4, use_label_encoder=False, eval_metric="logloss"
)
# Chain preprocessing and the classifier into a single estimator.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
model_pipeline = Pipeline(pipeline_steps)
model_pipeline.fit(X, y)
# Sanity check: class probabilities for the first two rows.
model_pipeline.predict_proba(X.iloc[:2])
# Alternative model: XGBoost with its built-in categorical support
# (enable_categorical together with the "hist" tree method), fitted on X
# directly without the one-hot preprocessing pipeline.
model_categorical = xgboost.XGBClassifier(
    n_estimators=200, max_depth=4,
    use_label_encoder=False, eval_metric="logloss",
    enable_categorical=True, tree_method="hist",
)
model_categorical.fit(X, y)
# Sanity check: class probabilities for the first two rows.
model_categorical.predict_proba(X.iloc[:2])
# Select the scikit-learn pipeline as the model handed to the explainer
# created below.
model = model_pipeline
def pf_xgboost_classifier_default(m, d):
    """Return column 1 of ``m.predict_proba(d)`` for every row of ``d``.

    Parameters:
        m: fitted model exposing ``predict_proba``.
        d: data passed to ``predict_proba`` unchanged.

    Returns:
        The second column of the array returned by ``predict_proba``.
    """
    probabilities = m.predict_proba(d)
    return probabilities[:, 1]
# Wrap the currently selected model in a dalex explainer using the default
# predict function.
explainer = dx.Explainer(
    model,
    X,
    y,
    label="GBM",
    predict_function=pf_xgboost_classifier_default,
)
# Switch the selected model to the native-categorical XGBoost classifier.
model = model_categorical
def pf_xgboost_classifier_categorical(model, df):
    """Predict column 1 of ``predict_proba`` after casting object columns to category.

    Object-dtype columns of ``df`` are converted to the pandas 'category'
    dtype before the data is handed to ``model.predict_proba``.

    The conversion is done with ``DataFrame.astype`` on a new frame rather
    than ``df.loc[:, mask] = ...``: the ``.loc`` assignment mutates the
    caller's frame and, since pandas 2.0, writes in place so the columns keep
    their 'object' dtype.

    Parameters:
        model: fitted model exposing ``predict_proba``.
        df: pandas DataFrame of observations; it is not modified.

    Returns:
        The second column of the array returned by ``predict_proba``.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    if len(object_columns) > 0:
        # astype returns a new frame, leaving the caller's data untouched.
        # NOTE(review): categories are inferred from the rows passed in —
        # confirm this matches what the model saw during training.
        df = df.astype({column: 'category' for column in object_columns})
    return model.predict_proba(df)[:, 1]
# Build the dalex explainer around the currently selected model, using the
# predict function that casts object columns to category.
explainer = dx.Explainer(
    model,
    X,
    y,
    label="GBM",
    predict_function=pf_xgboost_classifier_categorical,
)
# Performance summary of the explained model.
explainer.model_performance()
# Predictions for the first ten rows.
explainer.predict(X.iloc[:10])
def _explain_first_passengers(method):
    """Return predict_parts results of the given type for rows 0-4 of X."""
    return [
        explainer.predict_parts(X.iloc[[row]], type=method, label=f'passenger {row}')
        for row in range(5)
    ]


# SHAP attributions for the first five passengers, drawn on one figure.
shap_attributions = _explain_first_passengers("shap")
shap_attributions[0].plot(shap_attributions[1:])
# Break-down attributions for the same passengers, drawn on one figure.
bd_attributions = _explain_first_passengers("break_down")
bd_attributions[0].plot(bd_attributions[1:])
# Construct a shap.Explainer for the pipeline model and then for the
# native-categorical model; the results are discarded into `_`.
# NOTE(review): presumably this only checks whether shap accepts these models
# directly — confirm.
for candidate_model in (model_pipeline, model_categorical):
    _ = shap.Explainer(candidate_model)
# One-hot encode the categorical columns directly in pandas (first level of
# each variable dropped).
X_ohe = pd.get_dummies(X, drop_first=True, columns=categorical_variables)
X_ohe
# Plain XGBoost classifier trained on the fully one-hot-encoded features.
model_ohe = xgboost.XGBClassifier(
    n_estimators=200, max_depth=4, use_label_encoder=False, eval_metric="logloss"
)
model_ohe.fit(X_ohe, y)
# Tree explainer with X_ohe as background data, explaining the probability
# output.
shap_explainer = shap.explainers.Tree(
    model_ohe,
    data=X_ohe,
    model_output="probability",
)
# SHAP values for every row of X_ohe.
shap_values = shap_explainer(X_ohe)
shap_values
# Waterfall plot of the SHAP values for each of the first five rows.
for row in range(5):
    shap.plots.waterfall(shap_values[row])
# Beeswarm summary over all rows, limited to ten features.
shap.plots.beeswarm(shap_values, plot_size=(9, 6), max_display=10)
import matplotlib.pyplot as plt
def _resize_and_show(width, height):
    """Resize the current matplotlib figure (in inches) and display it."""
    plt.gcf().set_size_inches(width, height)
    plt.show()


# plots.bar() has no plot_size parameter, so the figure is resized by hand
shap.plots.bar(shap_values, max_display=10, show=False)
_resize_and_show(9, 6)
# plots.scatter() has no plot_size parameter either
shap.plots.scatter(shap_values[:, "age"], show=False)
_resize_and_show(9, 6)
# Same "age" scatter plot, coloured by the SHAP values of another feature.
for colour_feature in ("gender_male", "class_3rd"):
    shap.plots.scatter(
        shap_values[:, "age"],
        color=shap_values[:, colour_feature],
        show=False,
    )
    _resize_and_show(11, 6)
from sklearn.svm import SVC
# Support vector classifier with probability estimates, trained on the same
# one-hot-encoded features.
svm_ohe = SVC(probability=True)
svm_ohe.fit(X_ohe, y)
# sample data for KernelSHAP
X_subset = X_ohe.sample(n=111, random_state=0)
# dalex explainers for both models, built on the sampled subset.
exp_svm = dx.Explainer(svm_ohe, X_subset, verbose=False, label="SVM")
exp_xgboost = dx.Explainer(model_ohe, X_subset, verbose=False, label="GBM")
# shap_wrapper attributions for the first five rows: SVM first, then XGBoost.
sv_svm = [
    exp_svm.predict_parts(X_ohe.iloc[[row]], type="shap_wrapper")
    for row in range(5)
]
sv_xgboost = [
    exp_xgboost.predict_parts(X_ohe.iloc[[row]], type="shap_wrapper")
    for row in range(5)
]
# Show the XGBoost and SVM attributions for each of the five passengers.
for i, (xgboost_parts, svm_parts) in enumerate(zip(sv_xgboost, sv_svm)):
    print(f'==== Passenger {i} ====')
    print('---- XGBoost:')
    xgboost_parts.plot()
    print('---- SVM:')
    svm_parts.plot()