Source code for dabl.explain

import numpy as np
from warnings import warn

import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.inspection import plot_partial_dependence
from sklearn.feature_selection import f_classif
from sklearn.impute import SimpleImputer

from .models import SimpleClassifier, SimpleRegressor, AnyClassifier
from .utils import _validate_Xyt
from .plot.utils import (plot_coefficients, plot_multiclass_roc_curve,
                         find_pretty_grid, _make_subplots)


def classification_metrics(estimator, X_val, y_val):
    """Print hold-out classification metrics and plot a ROC curve.

    Prints the classification report followed by the confusion matrix for
    the predictions of ``estimator`` on ``X_val``. Then plots a ROC curve:
    the sklearn binary plot for exactly two classes, the multiclass helper
    for more than two. If ``plot_roc_curve`` cannot be imported, a warning
    is emitted instead of a plot.

    Parameters
    ----------
    estimator : fitted classifier
        Must provide ``predict`` and ``classes_``.
    X_val : validation features passed to ``estimator.predict``.
    y_val : validation labels the predictions are compared against.
    """
    predictions = estimator.predict(X_val)
    for summarize in (classification_report, confusion_matrix):
        print(summarize(y_val, predictions))

    try:
        # imported lazily: only available in recent scikit-learn versions
        from sklearn.metrics import plot_roc_curve
        class_count = len(estimator.classes_)
        if class_count == 2:
            plot_roc_curve(estimator, X_val, y_val)
        elif class_count > 2:
            plot_multiclass_roc_curve(estimator, X_val, y_val)
    except ImportError:
        warn("Can't plot roc curve, install sklearn 0.22-dev")


def explain(estimator, X_val=None, y_val=None, target_col=None,
            feature_names=None):
    """Explain estimator.

    Provide basic properties and evaluation plots for the estimator.

    Parameters
    ----------
    estimator : dabl or sklearn estimator
        Model to evaluate.
    X_val : DataFrame, optional
        Validation set. Used for computing hold-out evaluations
        like roc-curves or partial dependence plots.
    y_val : Series or numpy array, optional.
        Validation set labels. You need to specify either y_val or target_col.
    target_col : string or int, optional
        Column name of target if included in X.
    feature_names : list, optional
        Names of the input features. If omitted, they are read from
        ``estimator.feature_names_``.

    Raises
    ------
    ValueError
        If ``feature_names`` is not given and the estimator has no
        ``feature_names_`` attribute, or if a non-multiclass model exposes
        a ``coef_`` that is still more than one-dimensional after squeezing.
    """
    if feature_names is None:
        try:
            feature_names = estimator.feature_names_.to_list()
        except AttributeError as err:
            raise ValueError("Can't determine input feature names, "
                             "please pass them.") from err

    # Anything exposing at least two classes is treated as a classifier.
    classifier = False
    if hasattr(estimator, 'classes_') and len(estimator.classes_) >= 2:
        n_classes = len(estimator.classes_)
        classifier = True
    else:
        n_classes = 1

    inner_estimator, inner_feature_names = _extract_inner_estimator(
        estimator, feature_names)

    if X_val is not None:
        X_val, y_val = _validate_Xyt(X_val, y_val, target_col,
                                     do_clean=False)
        if classifier:
            # classification metrics:
            classification_metrics(estimator, X_val, y_val)
        else:
            # FIXME add regression metrics
            pass

    if isinstance(inner_estimator, DecisionTreeClassifier):
        try:
            print("Depth: {}".format(inner_estimator.get_depth()))
            print("Number of leaves: {}".format(
                inner_estimator.get_n_leaves()))
        except AttributeError:
            warn("Can't show tree depth, install scikit-learn 0.21"
                 " to show the full information.")

        # FIXME !!! bug in plot_tree with integer class names
        class_names = [str(c) for c in estimator.classes_]
        plt.figure(figsize=(18, 10))
        plot_tree(inner_estimator, feature_names=inner_feature_names,
                  class_names=class_names, filled=True, max_depth=5,
                  precision=2, proportion=True)
        # FIXME This is a bad thing to show!
        plot_coefficients(
            inner_estimator.feature_importances_, inner_feature_names)
        plt.ylabel("Impurity Decrease")

    elif hasattr(inner_estimator, 'coef_'):
        # probably a linear model, can definitely show the coefficients
        if n_classes > 2:
            _, axes = _make_subplots(n_classes)
            coef = np.atleast_2d(inner_estimator.coef_)
            # one coefficient plot per class
            for ax, k, c in zip(axes.ravel(), inner_estimator.classes_,
                                coef):
                plot_coefficients(c, inner_feature_names, ax=ax,
                                  classname="class: {}".format(k))
        else:
            coef = np.squeeze(inner_estimator.coef_)
            if coef.ndim > 1:
                raise ValueError("Don't know how to handle "
                                 "multi-target regressor")
            plot_coefficients(coef, inner_feature_names)

    elif isinstance(inner_estimator, RandomForestClassifier):
        # FIXME This is a bad thing to show!
        plot_coefficients(
            inner_estimator.feature_importances_, inner_feature_names)
        plt.ylabel("Impurity Decrease")

    # Partial dependence needs validation data and is only drawn for
    # models without coefficients, so skip the feature ranking otherwise.
    if X_val is None or hasattr(inner_estimator, 'coef_'):
        return

    # feature names might change during preprocessing
    # but we don't want partial dependence plots for one-hot features
    kept = [(i, f) for i, f in enumerate(inner_feature_names)
            if f in feature_names]
    if not kept:
        warn("Couldn't run partial dependence plot: none of the input "
             "features is passed through preprocessing unchanged.")
        return
    idx, org_features = zip(*kept)

    if hasattr(inner_estimator, 'feature_importances_'):
        importances = inner_estimator.feature_importances_[np.array(idx)]
    else:
        X_cont_imputed = SimpleImputer().fit_transform(
            X_val.loc[:, org_features])
        importances, _ = f_classif(X_cont_imputed, y_val)

    # ``importances`` is aligned with ``org_features`` (the kept subset),
    # so the ranking must index that subset, not the full name array.
    top_ranked = np.argsort(importances)[::-1][:10]
    features = np.array(org_features)[top_ranked]

    print("Computing partial dependence plots...")
    _, n_cols = find_pretty_grid(len(features))
    try:
        if n_classes <= 2:
            plot = plot_partial_dependence(
                estimator, X_val, features=features,
                feature_names=np.array(feature_names), n_cols=n_cols)
            plot.figure_.suptitle("Partial Dependence")
            for ax in plot.axes_.ravel():
                ax.set_ylabel('')
        else:
            # one figure per class for multiclass models
            for c in estimator.classes_:
                plot = plot_partial_dependence(
                    estimator, X_val, features=features,
                    feature_names=np.array(feature_names),
                    target=c, n_cols=n_cols)
                plot.figure_.suptitle(
                    "Partial Dependence for class {}".format(c))
                for ax in plot.axes_.ravel():
                    ax.set_ylabel('')
    except ValueError as e:
        warn("Couldn't run partial dependence plot: " + str(e))


def _extract_inner_estimator(estimator, feature_names):
    """Unwrap ``estimator`` to its final step and matching feature names.

    dabl wrappers are replaced by their ``est_`` attribute. If the result
    is a two-step pipeline, the final estimator is returned together with
    the feature names reported by the first step; otherwise the names are
    passed through unchanged.

    Parameters
    ----------
    estimator : dabl or sklearn estimator
        Possibly wrapped model.
    feature_names : sequence
        Names of the features fed into ``estimator``.

    Returns
    -------
    inner_estimator : estimator
        The innermost estimator found.
    feature_names : ndarray
        Names of the features that reach ``inner_estimator``.
    """
    unwrapped = estimator
    # all dabl wrappers expose the underlying model as ``est_``
    dabl_wrappers = (SimpleClassifier, SimpleRegressor, AnyClassifier)
    if isinstance(unwrapped, dabl_wrappers):
        unwrapped = unwrapped.est_

    if not isinstance(unwrapped, Pipeline):
        # nothing left to unwrap
        return unwrapped, np.array(feature_names)

    assert len(unwrapped.steps) == 2
    # pipelines don't have feature names yet in sklearn
    # *cries in scikit-learn roadmap*
    last_step = unwrapped._final_estimator
    try:
        names_out = unwrapped.steps[0][1].get_feature_names(feature_names)
    except TypeError:
        # this transformer's get_feature_names takes no input names
        names_out = unwrapped.steps[0][1].get_feature_names()
    # now we have input feature names for the final step
    return last_step, np.array(names_out)