Function Transformer
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Wrap a pair of mutually inverse element-wise functions as a transformer:
# transform() adds 2 and inverse_transform() subtracts it again.
ft = FunctionTransformer(
    func=lambda data: data + 2,
    inverse_func=lambda data: data - 2,
)

# Toy input: the integers 0..99 laid out as 25 rows of 4 columns.
X = np.arange(100).reshape((25, 4))
X

ft.fit(X)
ft.transform(X)

# Feeding the transformed data to inverse_transform undoes the shift.
ft.inverse_transform(ft.transform(X))
Basic Transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array
class MyTransformer(BaseEstimator, TransformerMixin):
    """Bare-bones transformer skeleton that adds 1 to every entry of X.

    Parameters
    ----------
    first_parameter : default 1
        Stored unchanged on the instance; not used by fit or transform.
    second_parameter : default 2
        Stored unchanged on the instance; not used by fit or transform.
    """

    def __init__(self, first_parameter=1, second_parameter=2):
        # Every parameter has to be listed in the __init__ signature, and
        # __init__ is not allowed to do anything besides storing them.
        self.second_parameter = second_parameter
        self.first_parameter = first_parameter

    def fit(self, X, y=None):
        """Validate X, announce the fit on stdout and return self.

        fit takes only X and y; y must be accepted even by an unsupervised
        model, which is why it defaults to None here.
        """
        # Validation only: nothing is learned, so the checked array is dropped.
        check_array(X)
        # The actual model-fitting code would go here.
        print("fitting the model right here")
        # By convention fit hands back the estimator itself.
        return self

    def transform(self, X):
        """Return the validated X with 1 added to every entry.

        transform receives X as its only argument.
        """
        validated = check_array(X)
        return validated + 1
from sklearn.utils.estimator_checks import check_estimator
# Run scikit-learn's estimator conformance checks. check_estimator expects an
# estimator *instance*; passing the class itself was deprecated in
# scikit-learn 0.23 and raises a TypeError from 0.24 on.
check_estimator(MyTransformer())
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_X_y
class MyTransformer(BaseEstimator, TransformerMixin):
    """Transformer that records the training feature count and subtracts 2.

    Parameters
    ----------
    my_parameter : default "stuff"
        Stored unchanged on the instance; not used by fit or transform.

    Attributes
    ----------
    n_features_ : int
        Number of columns of the X passed to fit. Set by fit.
    """

    def __init__(self, my_parameter="stuff"):
        self.my_parameter = my_parameter

    def fit(self, X, y=None):
        """Validate the input and remember how many features X has.

        Parameters
        ----------
        X : array-like
            Training data; validated with check_array / check_X_y.
        y : array-like, optional
            Targets. When given, X and y are validated together with
            check_X_y. When omitted, only X is validated, so the
            transformer also works through fit_transform(X) and in
            pipelines that have no target.

        Returns
        -------
        self
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)
        self.n_features_ = X.shape[1]
        return self

    def transform(self, X):
        """Return the validated X with 2 subtracted from every entry.

        Must be called after fit, because it reads n_features_.

        Raises
        ------
        ValueError
            If X does not have the number of features seen during fit.
        """
        X = check_array(X)
        if X.shape[1] != self.n_features_:
            raise ValueError("Wrong number of features {} != {}".format(
                X.shape[1], self.n_features_))
        return X - 2
# Run scikit-learn's estimator conformance checks. check_estimator expects an
# estimator *instance*; passing the class itself was deprecated in
# scikit-learn 0.23 and raises a TypeError from 0.24 on.
check_estimator(MyTransformer())
Exercise
Reimplement a simple version of the standard scaler (that removes the mean and scales to unit variance) with the scikit-learn interface. Can you make it pass the tests? Does it give the same result as sklearn.preprocessing.StandardScaler?
Reimplement a one-nearest-neighbor classifier with the scikit-learn interface (one that memorizes the training set and assigns a new test point to the class of the closest training point). Again, try making it pass the tests.
Hint: use sklearn.utils.validation.check_is_fitted and sklearn.utils.multiclass.unique_labels (though you don’t have to).
# %load solutions/custom_estimators.py