Source code for dabl.models

import warnings
import numpy as np
import pandas as pd

import sklearn

from sklearn.metrics import make_scorer, average_precision_score
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import type_of_target
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.model_selection import StratifiedKFold, KFold
try:
    from sklearn.metrics._scorer import _check_multimetric_scoring
except ImportError:
    from sklearn.metrics.scorer import _check_multimetric_scoring
from sklearn.model_selection._validation import _fit_and_score
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.metaestimators import if_delegate_has_method
try:
    from sklearn.utils._testing import set_random_state
except ImportError:
    from sklearn.utils.testing import set_random_state

from sklearn.dummy import DummyClassifier

from .preprocessing import EasyPreprocessor, detect_types
from .pipelines import (get_fast_classifiers, get_fast_regressors,
                        get_any_classifiers)
from .utils import nice_repr, _validate_Xyt
from .search import GridSuccessiveHalving


def _format_scores(scores):
    return " ".join(('{}: {:.3f}'.format(name, score)
                     for name, score in scores.items()))
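# For reference, a quick sketch of what this helper produces (the metric
# names and values here are made up):
#
#   >>> _format_scores({'accuracy': 0.9, 'recall_macro': 0.85})
#   'accuracy: 0.900 recall_macro: 0.850'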


class _DablBaseEstimator(BaseEstimator):

    @if_delegate_has_method(delegate='est_')
    def predict_proba(self, X):
        return self.est_.predict_proba(X)

    @if_delegate_has_method(delegate='est_')
    def decision_function(self, X):
        return self.est_.decision_function(X)

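# The if_delegate_has_method decorator above makes predict_proba and
# decision_function visible only when the wrapped est_ provides them.
# A minimal sketch of the pattern (_Wrapper is illustrative, not part of
# dabl, and assumes a scikit-learn version that still ships
# if_delegate_has_method; newer releases replaced it with available_if):
#
#   >>> from sklearn.utils.metaestimators import if_delegate_has_method
#   >>> from sklearn.linear_model import LogisticRegression
#   >>> from sklearn.svm import LinearSVC
#   >>> class _Wrapper:
#   ...     def __init__(self, est):
#   ...         self.est_ = est
#   ...     @if_delegate_has_method(delegate='est_')
#   ...     def predict_proba(self, X):
#   ...         return self.est_.predict_proba(X)
#   >>> hasattr(_Wrapper(LogisticRegression()), 'predict_proba')
#   True
#   >>> hasattr(_Wrapper(LinearSVC()), 'predict_proba')
#   False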

class _BaseSimpleEstimator(_DablBaseEstimator):
    def predict(self, X):
        if not self.refit:
            raise ValueError("Must specify refit=True to predict.")
        with warnings.catch_warnings():
            # fix when requiring sklearn 0.22
            # check_is_fitted will not have arguments any more
            warnings.filterwarnings('ignore', category=FutureWarning)
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            check_is_fitted(self, 'est_')

        return self.est_.predict(X)

    def _evaluate_one(self, estimator, data_preproc, scorers):
        res = []
        for X_train, X_test, y_train, y_test in data_preproc:
            X = np.vstack([X_train, X_test])
            if y_train.ndim < 2 and y_test.ndim < 2:
                y = np.hstack([y_train, y_test])
            else:
                y = np.vstack([y_train, y_test])
            train = np.arange(len(X_train))
            test = np.arange(len(X_train), len(X_test) + len(X_train))
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore',
                                        category=UndefinedMetricWarning)
                test_scores = _fit_and_score(estimator, X, y, scorer=scorers,
                                             train=train, test=test,
                                             parameters={}, fit_params={},
                                             verbose=self.verbose)[0]
            res.append(test_scores)

        res_mean = pd.DataFrame(res).mean(axis=0)
        try:
            # show only last step of pipeline for simplicity
            name = nice_repr(estimator.steps[-1][1])
        except AttributeError:
            name = nice_repr(estimator)

        if self.verbose:
            print("Running {}".format(name))
            print(_format_scores(res_mean))
        res_mean.name = name
        self.log_.append(res_mean)
        return res_mean
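    # The index bookkeeping above re-stacks each preprocessed fold and hands
    # _fit_and_score positional indices into the stacked array. A small
    # self-contained illustration of that arithmetic (shapes are made up):
    #
    #   >>> import numpy as np
    #   >>> X_train, X_test = np.ones((80, 3)), np.zeros((20, 3))
    #   >>> X = np.vstack([X_train, X_test])
    #   >>> train = np.arange(len(X_train))
    #   >>> test = np.arange(len(X_train), len(X_test) + len(X_train))
    #   >>> bool((X[train] == X_train).all() and (X[test] == X_test).all())
    #   True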

    def _fit(self, X, y=None, target_col=None):
        """Fit estimator.

        Requires either specifying the target as a separate 1d array or
        Series y (in scikit-learn fashion) or as a column of the DataFrame X
        specified by target_col.
        If y is specified, X is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional
            Target. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of target if included in X.
        """
        X, y = _validate_Xyt(X, y, target_col, do_clean=False)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        types = detect_types(X, type_hints=self.type_hints)
        self.feature_names_ = X.columns
        self.types_ = types

        y, self.scoring_ = self._preprocess_target(y)
        self.log_ = []

        # reimplement cross-validation so we only do preprocessing once
        # This could/should be solved with dask?
        if isinstance(self, RegressorMixin):
            # the Regressor/ClassifierMixin base class determines the splitter
            cv = KFold(n_splits=5)
        elif isinstance(self, ClassifierMixin):
            cv = StratifiedKFold(n_splits=5)
        data_preproc = []
        for i, (train, test) in enumerate(cv.split(X, y)):
            # maybe do two levels of preprocessing
            # to search over treatment of categorical variables etc
            # Also filter?
            verbose = self.verbose if i == 0 else 0
            sp = EasyPreprocessor(verbose=verbose, types=types)
            X_train = sp.fit_transform(X.iloc[train], y.iloc[train])
            X_test = sp.transform(X.iloc[test])
            data_preproc.append((X_train, X_test, y.iloc[train], y.iloc[test]))

        estimators = self._get_estimators()
        rank_scoring = self._rank_scoring
        self.current_best_ = {rank_scoring: -np.inf}
        for est in estimators:
            set_random_state(est, self.random_state)
            scorers, _ = _check_multimetric_scoring(est, self.scoring_)
            scores = self._evaluate_one(est, data_preproc, scorers)
            # make scoring configurable
            if scores[rank_scoring] > self.current_best_[rank_scoring]:
                if self.verbose:
                    print("=== new best {} (using {}):".format(
                        scores.name,
                        rank_scoring))
                    print(_format_scores(scores))
                    print()

                self.current_best_ = scores
                best_est = est
        if self.verbose:
            print("\nBest model:\n{}\nBest Scores:\n{}".format(
                  nice_repr(best_est), _format_scores(self.current_best_)))
        if self.refit:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', UserWarning)
                self.est_ = make_pipeline(EasyPreprocessor(types=types),
                                          best_est)
                self.est_.fit(X, y)
        return self


class SimpleClassifier(_BaseSimpleEstimator, ClassifierMixin):
    """Automagic anytime classifier.

    Parameters
    ----------
    refit : boolean, True
        Whether to refit the model on the full dataset.

    random_state : random state, int or None (default=None)
        Random state or seed.

    verbose : integer, default=1
        Verbosity (higher is more output).

    type_hints : dict or None
        If dict, provide type information for columns.
        Keys are column names, values are types as provided by detect_types.

    Attributes
    ----------
    est_ : sklearn estimator
        Best estimator found.
    """
    def __init__(self, refit=True, random_state=None, verbose=1,
                 type_hints=None):
        self.verbose = verbose
        self.random_state = random_state
        self.refit = refit
        self.type_hints = type_hints
    def _get_estimators(self):
        return get_fast_classifiers(n_classes=len(self.classes_))

    def _preprocess_target(self, y):
        target_type = type_of_target(y)
        le = LabelEncoder().fit(y)
        y = pd.Series(y)
        self.classes_ = le.classes_

        if target_type == "binary":
            minority_class = y.value_counts().index[1]
            my_average_precision_scorer = make_scorer(
                average_precision_score, pos_label=minority_class,
                needs_threshold=True)
            scoring = {'accuracy': 'accuracy',
                       'average_precision': my_average_precision_scorer,
                       'roc_auc': 'roc_auc',
                       'recall_macro': 'recall_macro',
                       'f1_macro': 'f1_macro'}
        elif target_type == "multiclass":
            scoring = ['accuracy', 'recall_macro', 'precision_macro',
                       'f1_macro']
        else:
            raise ValueError("Unknown target type: {}".format(target_type))
        return y, scoring
    def fit(self, X, y=None, *, target_col=None):
        """Fit classifier.

        Requires either specifying the target as a separate 1d array or
        Series y (in scikit-learn fashion) or as a column of the DataFrame X
        specified by target_col.
        If y is specified, X is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional
            Target class labels. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of the target if it is included in X.
        """
        self._rank_scoring = "recall_macro"
        return self._fit(X=X, y=y, target_col=target_col)
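# A minimal usage sketch for SimpleClassifier (the data and the 'target'
# column name are illustrative):
#
#   >>> import pandas as pd
#   >>> from sklearn.datasets import make_classification
#   >>> from dabl import SimpleClassifier
#   >>> X, y = make_classification(n_samples=200, random_state=0)
#   >>> df = pd.DataFrame(X)
#   >>> df['target'] = y
#   >>> sc = SimpleClassifier(random_state=0)
#   >>> sc = sc.fit(df, target_col='target')   # or sc.fit(X, y)
#   >>> preds = sc.predict(df.drop(columns='target'))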
class SimpleRegressor(_BaseSimpleEstimator, RegressorMixin):
    """Automagic anytime regressor.

    Parameters
    ----------
    refit : boolean, True
        Whether to refit the model on the full dataset.

    random_state : random state, int or None (default=None)
        Random state or seed.

    verbose : integer, default=1
        Verbosity (higher is more output).

    type_hints : dict or None
        If dict, provide type information for columns.
        Keys are column names, values are types as provided by detect_types.

    Attributes
    ----------
    est_ : sklearn estimator
        Best estimator found.
    """
    def __init__(self, refit=True, random_state=None, verbose=1,
                 type_hints=None):
        self.verbose = verbose
        self.refit = refit
        self.random_state = random_state
        self.type_hints = type_hints
    def _get_estimators(self):
        return get_fast_regressors()

    def _preprocess_target(self, y):
        target_type = type_of_target(y)
        # if all labels are integers, type_of_target reports "multiclass";
        # we trust that the user actually means regression.
        if target_type not in ["continuous", "multiclass"]:
            raise ValueError("Unknown target type: {}".format(target_type))
        scoring = ('r2', 'neg_mean_squared_error')
        return y, scoring
    def fit(self, X, y=None, *, target_col=None):
        """Fit regressor.

        Requires either specifying the target as a separate 1d array or
        Series y (in scikit-learn fashion) or as a column of the DataFrame X
        specified by target_col.
        If y is specified, X is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional
            Target values. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of the target if it is included in X.
        """
        self._rank_scoring = "r2"
        return self._fit(X=X, y=y, target_col=target_col)
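# A minimal usage sketch for SimpleRegressor (synthetic data, illustrative
# column names):
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> from dabl import SimpleRegressor
#   >>> rng = np.random.RandomState(0)
#   >>> df = pd.DataFrame({'x1': rng.randn(200), 'x2': rng.randn(200)})
#   >>> df['y'] = 2 * df['x1'] - df['x2'] + 0.1 * rng.randn(200)
#   >>> sr = SimpleRegressor(random_state=0).fit(df, target_col='y')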
class AnyClassifier(_DablBaseEstimator, ClassifierMixin):
    """Classifier with automatic model selection.

    This model uses successive halving on a portfolio of complex models
    (HistGradientBoosting, RandomForest, SVC, LogisticRegression) to pick
    the best model family and hyper-parameters.

    AnyClassifier internally applies EasyPreprocessor, so no preprocessing
    is necessary.

    Parameters
    ----------
    n_jobs : int, default=None
        Number of processes to spawn for parallelizing the search.

    force_exhaust_budget : bool, default=True
        Whether to ensure at least one model is trained on the full dataset
        in successive halving. See the documentation of successive halving
        for details.

    verbose : integer, default=0
        Verbosity. Higher means more output.

    type_hints : dict or None
        If dict, provide type information for columns.
        Keys are column names, values are types as provided by detect_types.

    Attributes
    ----------
    search_ : GridSuccessiveHalving instance
        Fitted GridSuccessiveHalving instance for inspection.

    est_ : sklearn estimator
        Best estimator (pipeline) found during the search.
    """
    def __init__(self, n_jobs=None, force_exhaust_budget=True, verbose=0,
                 type_hints=None):
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.force_exhaust_budget = force_exhaust_budget
        self.type_hints = type_hints
    def _get_estimators(self):
        return get_any_classifiers()

    def _preprocess_target(self, y):
        # copy and paste from above, should be a mixin
        target_type = type_of_target(y)
        le = LabelEncoder().fit(y)
        y = pd.Series(y)
        self.classes_ = le.classes_

        if target_type in ("binary", "multiclass"):
            scoring = 'recall_macro'
        else:
            raise ValueError("Unknown target type: {}".format(target_type))
        return y, scoring

    def predict(self, X):
        with warnings.catch_warnings():
            # fix when requiring sklearn 0.22
            # check_is_fitted will not have arguments any more
            warnings.filterwarnings('ignore', category=FutureWarning)
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            check_is_fitted(self, 'est_')
        return self.est_.predict(X)
    def fit(self, X, y=None, *, target_col=None):
        """Fit estimator.

        Requires either specifying the target as a separate 1d array or
        Series y (in scikit-learn fashion) or as a column of the DataFrame X
        specified by target_col.
        If y is specified, X is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional
            Target. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of the target if it is included in X.
        """
        # copy and paste from above?!
        if ((y is None and target_col is None)
                or (y is not None and target_col is not None)):
            raise ValueError(
                "Need to specify exactly one of y and target_col.")
        X, y = _validate_Xyt(X, y, target_col, do_clean=False)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        types = detect_types(X, type_hints=self.type_hints)
        self.feature_names_ = X.columns
        self.types_ = types
        cv = 5
        ratio = 3
        y, self.scoring_ = self._preprocess_target(y)
        self.log_ = []

        # reimplement cross-validation so we only do preprocessing once
        pipe = Pipeline(
            [('preprocessing',
              EasyPreprocessor(verbose=self.verbose, types=types)),
             ('classifier', DummyClassifier())])
        estimators = self._get_estimators()
        param_grid = [{'classifier': [est]} for est in estimators]
        gs = GridSuccessiveHalving(
            ratio=ratio, estimator=pipe, param_grid=param_grid,
            force_exhaust_budget=self.force_exhaust_budget,
            verbose=self.verbose, cv=cv, error_score='raise',
            scoring=self.scoring_, refit='recall_macro', n_jobs=self.n_jobs)
        self.search_ = gs
        with sklearn.config_context(print_changed_only=True):
            gs.fit(X, y)
        self.est_ = gs.best_estimator_
        print("best classifier: ", gs.best_params_['classifier'])
        print("best score: {:.3f}".format(gs.best_score_))
        return self
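# A minimal usage sketch for AnyClassifier. Note that this runs a full
# successive-halving search over the model portfolio, so it takes
# noticeably longer than SimpleClassifier (data here is synthetic and
# illustrative):
#
#   >>> import pandas as pd
#   >>> from sklearn.datasets import make_classification
#   >>> from dabl import AnyClassifier
#   >>> X, y = make_classification(n_samples=500, random_state=0)
#   >>> ac = AnyClassifier(n_jobs=1).fit(pd.DataFrame(X), pd.Series(y))
#   >>> search = ac.search_   # fitted GridSuccessiveHalving, for inspection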