Custom Estimators

https://scikit-learn.org/dev/developers/develop.html

Function Transformer

from sklearn.preprocessing import FunctionTransformer
# Wrap a pair of plain callables as a transformer: the forward function adds 2
# to its input and the inverse function subtracts 2 again.
ft = FunctionTransformer(func=lambda X: X+2, inverse_func=lambda X: X-2)
import numpy as np
# Toy data: the integers 0..99 arranged as 25 rows of 4 columns.
X = np.arange(100).reshape(25, 4)
X
ft.fit(X)
ft.transform(X)
# Round trip: the inverse is applied to the forward result, which should
# reproduce the original X (+2 followed by -2).
ft.inverse_transform(ft.transform(X))

Basic Transformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array

class MyTransformer(BaseEstimator, TransformerMixin):
    """Bare-bones template showing the shape of a scikit-learn transformer.

    The constructor only stores its arguments, ``fit`` runs the input through
    ``check_array`` and announces that fitting happened, and ``transform``
    returns the checked input with 1 added to every entry.
    """

    def __init__(self, first_parameter=1, second_parameter=2):
        # Every parameter has to be an explicit __init__ argument, and
        # __init__ is not allowed to do anything beyond storing them as-is.
        self.first_parameter = first_parameter
        self.second_parameter = second_parameter

    def fit(self, X, y=None):
        """Check ``X``, report that fitting ran, and return the estimator.

        ``fit`` should take only ``X`` and ``y``. The ``y`` argument must be
        accepted even by an unsupervised model; it is not used here.
        """
        # The checked array is not needed afterwards; the call is kept for
        # the errors it raises on unusable input.
        check_array(X)

        # Real model-fitting code would go here.
        print("fitting the model right here")

        # fit always hands back the estimator itself.
        return self

    def transform(self, X):
        """Return the checked ``X`` with 1 added to every entry.

        ``transform`` takes ``X`` as its only argument.
        """
        checked = check_array(X)
        return checked + 1
from sklearn.utils.estimator_checks import check_estimator
# check_estimator expects an estimator *instance*; passing the bare class was
# deprecated in scikit-learn 0.23 and is rejected by later releases.
check_estimator(MyTransformer())
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted

class MyTransformer(BaseEstimator, TransformerMixin):
    """Transformer that records the training feature count and subtracts 2.

    ``fit`` stores the number of columns of the training data in
    ``n_features_``; ``transform`` refuses input with a different number of
    columns and otherwise returns ``X - 2``.
    """

    def __init__(self, my_parameter="stuff"):
        # __init__ only stores its arguments unchanged.
        self.my_parameter = my_parameter

    def fit(self, X, y=None):
        """Validate the training data and remember its number of features.

        Parameters
        ----------
        X : array-like
            Training data; must pass ``check_array`` / ``check_X_y``.
        y : array-like, optional
            Targets. Optional so that ``fit(X)``, ``fit_transform(X)`` and
            unsupervised pipelines work; when given it is validated together
            with ``X``.

        Returns
        -------
        self
        """
        if y is None:
            # No targets supplied: validate X on its own.
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)
        self.n_features_ = X.shape[1]
        return self

    def transform(self, X):
        """Return ``X - 2`` after checking it matches the fitted feature count.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet (raised by ``check_is_fitted``).
        ValueError
            If ``X`` has a different number of columns than the data seen in
            ``fit``.
        """
        # Fail with the standard "not fitted" error instead of a bare
        # AttributeError on n_features_.
        check_is_fitted(self, "n_features_")
        X = check_array(X)
        if X.shape[1] != self.n_features_:
            raise ValueError("Wrong number of features {} != {}".format(
                X.shape[1], self.n_features_))
        return X - 2
# check_estimator expects an estimator *instance*; passing the bare class was
# deprecated in scikit-learn 0.23 and is rejected by later releases.
check_estimator(MyTransformer())

Exercise

  • Reimplement a simple version of the standard scaler (one that removes the mean and scales to unit variance) with the scikit-learn interface. Can you make it pass the tests? Does it give the same result as sklearn.preprocessing.StandardScaler?

  • Reimplement a one-nearest-neighbor classifier with the scikit-learn interface (one that memorizes the training set and assigns a new test point to the class of the closest training point). Again, try making it pass the tests.

hint: use sklearn.utils.validation.check_is_fitted and sklearn.utils.multiclass.unique_labels (though you don’t have to).

# %load solutions/custom_estimators.py