Feature Engineering

Interesting random states

18   0.487  0.987
42   0.553  0.987
44   0.527  1.000
54   0.560  1.000
67   0.507  1.000
70   0.587  1.000
79   0.673  1.000
96   0.527  1.000
161  0.487  1.000
174  0.567  1.000
175  0.620  1.000
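These look like the output of a small search over random_state values, where the second and third numbers are plausibly the accuracy of a plain logistic regression and of one with the x0 * x1 interaction feature added, as built below. A hedged reconstruction of such a loop (the exact code and scoring split behind these numbers are not in the notebook):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegressionCV

for random_state in range(200):
    X, y = make_blobs(n_samples=200, centers=4, random_state=random_state)
    X = scale(X)
    y = y % 2
    # add the product of the two features as a third column
    X_i = np.hstack([X, X[:, 0:1] * X[:, 1:]])
    linear_score = LogisticRegressionCV().fit(X, y).score(X, y)
    interaction_score = LogisticRegressionCV().fit(X_i, y).score(X_i, y)
    if linear_score < 0.7 and interaction_score > 0.98:
        print(random_state, linear_score, interaction_score)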

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.preprocessing import scale

X, y = make_blobs(n_samples=200, centers=4, random_state=42)
X = scale(X)
# fold the four blobs into two classes (blobs 0 and 2 vs. 1 and 3)
y = y % 2
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='Accent')
../_images/05-feature-engineering_2_1.png
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegressionCV().fit(X_train, y_train)
logreg.score(X_test, y_test)
0.5
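The linear model is at chance level: folding the four blobs into two classes interleaves them so that no single straight line can separate the resulting classes, as the decision boundary plotted below makes clear.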
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='Accent')
line = np.linspace(-3, 3, 100)
coef = logreg.coef_.ravel()

plt.plot(line, -(coef[0] * line + logreg.intercept_) / coef[1])

plt.xlim(-1.8, 2)
plt.ylim(-2, 1.8)
../_images/05-feature-engineering_4_1.png
# Same as PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interaction = np.hstack([X, X[:, 0:1] * X[:, 1:]])
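As a quick check (not in the original notebook), the same three columns can be produced with PolynomialFeatures directly:

from sklearn.preprocessing import PolynomialFeatures

X_poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False).fit_transform(X)
print(np.allclose(X_poly, X_interaction))  # should print True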
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 5))
# two 3-d views of the data with the product x0 * x1 as an additional axis
ax = fig.add_subplot(121, projection='3d')
ax.scatter(X_interaction[:, 2], X_interaction[:, 0], X_interaction[:, 1], c=y, cmap="Accent")
ax.view_init(elev=0., azim=0)

ax = fig.add_subplot(122, projection='3d')
ax.scatter(X_interaction[:, 2], X_interaction[:, 0], X_interaction[:, 1], c=y, cmap="Accent")
ax.view_init(elev=60., azim=0)
../_images/05-feature-engineering_6_0.png
X_i_train, X_i_test, y_train, y_test = train_test_split(X_interaction, y, random_state=0)
logreg3 = LogisticRegressionCV().fit(X_i_train, y_train)
logreg3.score(X_i_test, y_test)
0.96
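With the interaction feature, a linear decision boundary in the augmented three-dimensional space separates the two classes almost perfectly, and the test accuracy jumps from 0.5 to 0.96.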
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='Accent')
line = np.linspace(-3, 3, 100)
coef = logreg.coef_.ravel()
coef3 = logreg3.coef_.ravel()
plt.plot(line, -(coef[0] * line + logreg.intercept_) / coef[1])
# boundary of the interaction model: w0*x0 + w1*x1 + w2*x0*x1 + b = 0, solved for x1
curve = -(coef3[0] * line + logreg3.intercept_) / (coef3[1] + line * coef3[2])
# the boundary is a hyperbola in the original features; plot its two branches separately
mask = coef3[1] + line * coef3[2] > 0
plt.plot(line[mask], curve[mask], c='k')
plt.plot(line[~mask], curve[~mask], c='k')
plt.xlim(-1.8, 2)
plt.ylim(-2, 1.8)
../_images/05-feature-engineering_8_1.png

Discrete interactions

import pandas as pd

df = pd.DataFrame({'gender': ['M', 'F', 'M', 'F', 'F'],
                   'age': [14, 16, 12, 25, 22],
                   'spend$': [70, 12, 42, 64, 93],
                   'articles_bought': [5, 10, 2, 1, 1],
                   'time_online': [269, 1522, 235, 63, 21]
                  })
df
  gender  age  spend$  articles_bought  time_online
0      M   14      70                5          269
1      F   16      12               10         1522
2      M   12      42                2          235
3      F   25      64                1           63
4      F   22      93                1           21
dummies = pd.get_dummies(df)
dummies
   age  spend$  articles_bought  time_online  gender_F  gender_M
0   14      70                5          269         0         1
1   16      12               10         1522         1         0
2   12      42                2          235         0         1
3   25      64                1           63         1         0
4   22      93                1           21         1         0
[x + "_F" for x in dummies.columns]
['age_F',
 'spend$_F',
 'articles_bought_F',
 'time_online_F',
 'gender_F_F',
 'gender_M_F']
# per-gender copies of every feature: multiply each column by the gender indicator
df_f = dummies.multiply(dummies.gender_F, axis='rows')
df_f = df_f.rename(columns=lambda x: x + "_F")

df_m = dummies.multiply(dummies.gender_M, axis='rows')
df_m = df_m.rename(columns=lambda x: x + "_M")
# drop the redundant cross-indicator columns
res = pd.concat([df_m, df_f], axis=1).drop(["gender_F_M", "gender_M_F"], axis=1)
res
   age_M  spend$_M  articles_bought_M  time_online_M  gender_M_M  age_F  spend$_F  articles_bought_F  time_online_F  gender_F_F
0     14        70                  5            269           1      0         0                  0              0           0
1      0         0                  0              0           0     16        12                 10           1522           1
2     12        42                  2            235           1      0         0                  0              0           0
3      0         0                  0              0           0     25        64                  1             63           1
4      0         0                  0              0           0     22        93                  1             21           1
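This manual construction generalizes to any categorical column. A hedged helper sketch (not from the notebook; the function name and the automatic drop of the redundant cross-indicator columns are my own choices):

import pandas as pd

def categorical_interactions(df, cat_col):
    # multiply every dummy-encoded column by each level's indicator,
    # mimicking the manual df_m / df_f construction above
    dummies = pd.get_dummies(df)
    levels = df[cat_col].unique()
    parts = []
    for level in levels:
        indicator = dummies["{}_{}".format(cat_col, level)]
        part = dummies.multiply(indicator, axis='rows')
        parts.append(part.rename(columns=lambda c, suffix=level: "{}_{}".format(c, suffix)))
    res = pd.concat(parts, axis=1)
    # drop redundant cross-indicator columns such as gender_F_M and gender_M_F
    redundant = ["{}_{}_{}".format(cat_col, a, b) for a in levels for b in levels if a != b]
    return res.drop(columns=[c for c in redundant if c in res.columns])

categorical_interactions(df, "gender")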

Polynomial Features

rng = np.random.RandomState(2)
x = rng.uniform(-1, 1, size=(100,))
X = x.reshape(-1, 1)
x_noisy = x + rng.normal(scale=0.1, size=x.shape)
coef = rng.normal(size=3)
# quadratic target with noise in both x and y
y = coef[0] * x_noisy ** 2 + coef[1] * x_noisy + coef[2] + rng.normal(scale=0.1, size=x.shape)
plt.plot(x, y, 'o')
../_images/05-feature-engineering_16_1.png
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)
line = np.linspace(-1, 1, 100).reshape(-1, 1)
plt.plot(x, y, 'o')
plt.plot(line, lr.predict(line))
lr.score(X_test, y_test)
0.7633239152617027
../_images/05-feature-engineering_17_1.png
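A straight line misses the curvature in the data; adding polynomial features below improves the fit.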
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

poly_lr = make_pipeline(PolynomialFeatures(include_bias=False), LinearRegression())

poly_lr.fit(X_train, y_train)

plt.plot(x, y, 'o')
plt.plot(line, lr.predict(line))
plt.plot(line, poly_lr.predict(line))
poly_lr.score(X_test, y_test)
0.8336786269754218
../_images/05-feature-engineering_18_1.png
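To see which features the pipeline created, the fitted PolynomialFeatures step can be inspected (a sketch; get_feature_names_out assumes scikit-learn >= 1.0, older versions use get_feature_names):

poly = poly_lr.named_steps['polynomialfeatures']
print(poly.get_feature_names_out(['x']))   # ['x', 'x^2'] with the default degree=2
print(poly_lr.named_steps['linearregression'].coef_)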

Feature Distributions
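The cells below use a house-price feature matrix X (a DataFrame) with target y, a train/test split, a scaled X_train_scaled, and a million_formatter for the price axis, none of which are defined in this section. The house_sales OpenML data fetched at the end of the notebook is the likely source; a minimal, hedged setup (the exact feature selection and the formatter are assumptions):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = fetch_openml("house_sales", as_frame=True)
# keep the numeric columns as features; drop the target
X = data.frame.drop(columns=["price"]).select_dtypes(np.number)
y = data.frame["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train_scaled = StandardScaler().fit_transform(X_train)

# assumed helper: format the price axis in millions
million_formatter = FuncFormatter(lambda x, pos: "{:.1f}M".format(x / 1e6))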

plt.boxplot(X_train_scaled)
plt.xticks(np.arange(1, X.shape[1] + 1), X.columns, rotation=30, ha="right");
plt.savefig("images/house_price_scaled_box.png")
fig, axes = plt.subplots(3, 6, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    if i > 16:
        ax.set_visible(False)
        continue
    ax.hist(X.iloc[:, i], bins=30)
    ax.set_title("{}: {}".format(i, X.columns[i]))
plt.savefig("images/house_price_hist.png")
def bc(x, l):
    # Box-Cox transform: log(x) for lambda = 0, (x**lambda - 1) / lambda otherwise
    if l == 0:
        return np.log(x)
    else:
        return (x ** l - 1) / l

line = np.linspace(.01, 10, 100)
colors =  [plt.cm.viridis(i) for i in np.linspace(0, 1, 6)]
for l, c in zip([-1, -.5, 0, .5, 1, 2], colors):
    plt.plot(line, bc(line, l), label="lambda={}".format(l), color=c)
plt.ylim(-4, 6)
plt.gca().set_aspect("equal")
plt.legend(loc=(1, 0))
plt.xlim(0, 10)
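Each curve shows the Box-Cox transform for one value of lambda: lambda=1 leaves the data unchanged apart from a shift, lambda=0 is the logarithm, and smaller or negative lambdas compress large values more and more strongly.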
from sklearn.preprocessing import MinMaxScaler
# this is very hacky and you probably shouldn't do this in real life.
X_train_mm = MinMaxScaler().fit_transform(X_train) + 1e-5
from sklearn.preprocessing import PowerTransformer
fig, axes = plt.subplots(3, 6, figsize=(20, 10))
# Box-Cox requires strictly positive inputs, hence the MinMax + 1e-5 shift above
pt = PowerTransformer(method='box-cox')
X_bc = pt.fit_transform(X_train_mm)
print(pt.lambdas_)

for i, ax in enumerate(axes.ravel()):
    if i > 16:
        ax.set_visible(False)
        continue
    ax.hist(X_bc[:, i], bins=30)
    ax.set_title("{}: {} {:.2f}".format(i, X.columns[i], pt.lambdas_[i]))
plt.savefig("images/house_price_hist_boxcox.png")
X_bc_scaled = StandardScaler().fit_transform(X_bc)
fig, axes = plt.subplots(3, 6, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    if i > 16:
        ax.set_visible(False)
        continue
    ax.yaxis.set_major_formatter(million_formatter)
    ax.set_ylim(0, 4000000)
    ax.scatter(X_bc_scaled[:, i], y_train, s=.1, alpha=.1)
    ax.set_title("{}: {}".format(i, X.columns[i]))
    ax.set_ylabel("Price")
plt.tight_layout()
plt.savefig("images/house_price_bc_scaled_scatter.png")
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score

scores = cross_val_score(RidgeCV(), X_train, y_train, cv=10)
np.mean(scores), np.std(scores)
scores = cross_val_score(RidgeCV(), X_train_scaled, y_train, cv=10)
print(np.mean(scores), np.std(scores))
scores = cross_val_score(RidgeCV(), X_bc_scaled, y_train, cv=10)
np.mean(scores), np.std(scores)
ridge = RidgeCV().fit(X_train_scaled, y_train)
ridge_bc = RidgeCV().fit(X_bc_scaled, y_train)
plt.plot(ridge.coef_, 'o', label="scaled")
plt.plot(ridge_bc.coef_, 'o', label="box-cox")
plt.xlabel("coefficient index")
plt.ylabel("coefficient value")
plt.legend()
from sklearn.datasets import fetch_openml
data = fetch_openml("house_sales", as_frame=True)
data.frame.columns
data.frame.date
import dabl
dabl.plot(data.frame, target_col='price')
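The final cells fetch the raw frame and inspect the date column; a natural next feature-engineering step (a hedged sketch, not part of the notebook, assuming the column parses with pd.to_datetime) is to turn the sale date into numeric features:

# hypothetical continuation: derive numeric features from the sale date
frame = data.frame.copy()
frame['date'] = pd.to_datetime(frame['date'])
frame['sale_year'] = frame['date'].dt.year
frame['sale_month'] = frame['date'].dt.month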