import warnings
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
from sklearn.feature_selection import (f_regression,
mutual_info_regression,
mutual_info_classif,
f_classif)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import recall_score
from ..preprocessing import detect_types, clean, guess_ordinal
from .utils import (_check_X_target_col, _get_n_top, _make_subplots,
_short_tick_names, _shortname, _prune_category_make_X,
find_pretty_grid, _find_scatter_plots_classification,
class_hists, discrete_scatter, mosaic_plot,
_find_inliers, pairplot, _get_scatter_alpha,
_get_scatter_size)
def plot_regression_continuous(X, target_col, types=None,
                               scatter_alpha='auto', scatter_size='auto',
                               drop_outliers=True, **kwargs):
    """Plots for continuous features in regression.

    Creates scatter plots of each relevant continuous feature against the
    target. Features are ranked by univariate F statistics and only the top
    ones are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    scatter_alpha : float, default='auto'.
        Alpha values for scatter plots. 'auto' is dirty hacks.
    scatter_size : float, default='auto'.
        Marker size for scatter plots. 'auto' is dirty hacks.
    drop_outliers : bool, default=True
        Whether to drop outliers when plotting.
    """
    types = _check_X_target_col(X, target_col, types, task="regression")
    features = X.loc[:, types.continuous]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)
    if features.shape[1] == 0:
        # nothing continuous to plot
        return
    show_top = _get_n_top(features, "continuous")
    target = X[target_col]
    # HACK we should drop missing values per column before feeding them
    # into f_regression instead of imputing
    # FIXME
    imputed = SimpleImputer().fit_transform(features)
    f_scores, _ = f_regression(imputed, target)
    # indices of the highest-scoring features, best first
    ranked = np.argsort(f_scores)[-show_top:][::-1]
    fig, axes = _make_subplots(n_plots=show_top)
    # FIXME this could be a function or maybe using seaborn
    plt.suptitle("Continuous Feature vs Target")
    for plot_idx, (feat_idx, ax) in enumerate(zip(ranked, axes.ravel())):
        # label the y axis only once per row
        if plot_idx % axes.shape[1] == 0:
            ax.set_ylabel(target_col)
        col = features.columns[feat_idx]
        values = features.loc[:, col]
        if drop_outliers:
            keep = _find_inliers(values)
            ax.scatter(values[keep], target[keep],
                       alpha=scatter_alpha, s=scatter_size)
        else:
            ax.scatter(values, target,
                       alpha=scatter_alpha, s=scatter_size)
        ax.set_xlabel(_shortname(col))
        ax.set_title("F={:.2E}".format(f_scores[feat_idx]))
    for leftover in range(plot_idx + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[leftover].set_axis_off()
def plot_regression_categorical(X, target_col, types=None, **kwargs):
    """Plots for categorical features in regression.

    Creates box plots of target distribution for important categorical
    features. Relevant features are identified using mutual information.

    For high cardinality categorical variables (variables with many categories)
    only the most frequent categories are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    """
    types = _check_X_target_col(X, target_col, types, task="regression")
    if types is None:
        types = detect_types(X)
    features = X.loc[:, types.categorical]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)
    if features.shape[1] == 0:
        # nothing categorical to plot
        return
    features = features.astype('category')
    show_top = _get_n_top(features, "categorical")
    # can't use OrdinalEncoder because we might have mix of int and string
    ordinal_encoded = features.apply(lambda x: x.cat.codes)
    target = X[target_col]
    # use the encoded feature matrix width, not X.shape[1]: X also contains
    # the target (and possibly continuous columns), so its width differs
    f = mutual_info_regression(
        ordinal_encoded, target,
        discrete_features=np.ones(ordinal_encoded.shape[1], dtype=bool))
    top_k = np.argsort(f)[-show_top:][::-1]
    # large number of categories -> taller plot
    row_height = 3 if X.nunique().max() <= 5 else 5
    fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
    plt.suptitle("Categorical Feature vs Target")
    for i, (col_ind, ax) in enumerate(zip(top_k, axes.ravel())):
        # BUG FIX: index columns with the ranked index col_ind, not the plot
        # counter i; otherwise the boxplot shows an unranked column while the
        # title reports the F value of the ranked one.
        col = features.columns[col_ind]
        X_new = _prune_category_make_X(X, col, target_col)
        # order categories by their median target value for readability
        medians = X_new.groupby(col)[target_col].median()
        order = medians.sort_values().index
        sns.boxplot(x=target_col, y=col, data=X_new, order=order, ax=ax)
        ax.set_title("F={:.2E}".format(f[col_ind]))
        # shorten long ticks and labels
        _short_tick_names(ax)
    for j in range(i + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[j].set_axis_off()
def plot_classification_continuous(X, target_col, types=None, hue_order=None,
                                   scatter_alpha='auto', scatter_size="auto",
                                   univariate_plot='histogram',
                                   drop_outliers=True, plot_pairwise=True,
                                   top_k_interactions=10, random_state=None,
                                   **kwargs):
    """Plots for continuous features in classification.

    Selects important continuous features according to F statistics.
    Creates univariate distribution plots for these, as well as scatterplots
    for selected pairs of features, and scatterplots for selected pairs of
    PCA directions.
    If there are more than 2 classes, scatter plots from Linear Discriminant
    Analysis are also shown.
    Scatter plots are determined "interesting" if a decision tree on the
    two-dimensional projection performs well. The cross-validated macro-average
    recall of a decision tree is shown in the title for each scatterplot.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    scatter_alpha : float, default='auto'.
        Alpha values for scatter plots. 'auto' is dirty hacks.
    scatter_size : float, default='auto'.
        Marker size for scatter plots. 'auto' is dirty hacks.
    univariate_plot : string, default="histogram"
        Supported: 'histogram' and 'kde'.
    drop_outliers : bool, default=True
        Whether to drop outliers when plotting.
    plot_pairwise : bool, default=True
        Whether to create pairwise plots. Can be a bit slow.
    top_k_interactions : int, default=10
        How many pairwise interactions to consider
        (ranked by univariate f scores).
        Runtime is quadratic in this, but higher numbers might find more
        interesting interactions.
    random_state : int, None or numpy RandomState
        Random state used for subsampling for determining pairwise features
        to show.

    Notes
    -----
    important kwargs parameters are: scatter_size and scatter_alpha.
    """
    types = _check_X_target_col(X, target_col, types, task='classification')
    features = X.loc[:, types.continuous]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)
    if features.shape[1] == 0:
        # nothing continuous to plot
        return
    # impute so the sklearn scorers / projections below don't choke on NaN
    features_imp = SimpleImputer().fit_transform(features)
    target = X[target_col]
    figures = []
    if features.shape[1] <= 5:
        # few features: a full pairplot is affordable and most informative
        pairplot(X, target_col=target_col, columns=features.columns,
                 scatter_alpha=scatter_alpha,
                 scatter_size=scatter_size)
        title = "Continuous features"
        if features.shape[1] > 1:
            title = title + " pairplot"
        plt.suptitle(title, y=1.02)
        fig = plt.gcf()
    else:
        # univariate plots
        f = _plot_univariate_classification(features, features_imp, target,
                                            drop_outliers, target_col,
                                            univariate_plot, hue_order)
        figures.append(plt.gcf())
        # FIXME remove "variable = " from title, add f score
        # pairwise plots
        if not plot_pairwise:
            return figures
        # restrict pairwise search to the univariately best features
        top_k = np.argsort(f)[-top_k_interactions:][::-1]
        fig, axes = _plot_top_pairs(features_imp[:, top_k], target,
                                    scatter_alpha, scatter_size,
                                    feature_names=features.columns[top_k],
                                    how_many=4, random_state=random_state)
        fig.suptitle("Top feature interactions")
    # fig is either the pairplot figure or the top-interactions figure
    figures.append(fig)
    if not plot_pairwise:
        return figures
    # get some PCA directions
    # we're using all features here, not only most informative
    # should we use only those?
    n_components = min(top_k_interactions, features.shape[0],
                       features.shape[1])
    if n_components < 2:
        # a 2d scatter of projections needs at least two components
        return figures
    # returns the scaled features so LDA below can reuse them
    features_scaled = _plot_pca_classification(
        n_components, features_imp, target, scatter_alpha, scatter_size,
        random_state=random_state)
    figures.append(plt.gcf())
    # LDA
    _plot_lda_classification(features_scaled, target, top_k_interactions,
                             scatter_alpha, scatter_size,
                             random_state=random_state)
    figures.append(plt.gcf())
    return figures
def _plot_pca_classification(n_components, features_imp, target,
                             scatter_alpha='auto', scatter_size='auto',
                             random_state=None):
    # Project scaled features onto PCA directions, show the most
    # discriminating pairs plus a scree plot; returns the scaled feature
    # matrix so callers (LDA) can reuse it.
    pca = PCA(n_components=n_components)
    scaled = scale(features_imp)
    projected = pca.fit_transform(scaled)
    names = [f'PCA {i}' for i in range(n_components)]
    # one extra axis is reserved for the scree plot
    fig, axes = _plot_top_pairs(projected, target, scatter_alpha,
                                scatter_size,
                                feature_names=names,
                                how_many=3, additional_axes=1,
                                random_state=random_state)
    scree_ax = axes.ravel()[-1]
    explained = pca.explained_variance_ratio_
    scree_ax.plot(explained, label='variance')
    scree_ax.plot(np.cumsum(explained), label='cumulative variance')
    scree_ax.set_title("Scree plot (PCA explained variance)")
    scree_ax.legend()
    fig.suptitle("Discriminating PCA directions")
    return scaled
def _plot_lda_classification(features, target, top_k_interactions,
                             scatter_alpha='auto', scatter_size='auto',
                             random_state=None):
    # Show scatter plots of the most discriminating LDA directions.
    # assume features are scaled
    n_components = min(top_k_interactions, features.shape[0],
                       features.shape[1], target.nunique() - 1)
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    projected = lda.fit_transform(features, target)
    # we should probably do macro-average recall here as everywhere else?
    train_score = recall_score(target, lda.predict(features), average='macro')
    print("Linear Discriminant Analysis training set score: {:.3f}".format(
        train_score))
    if projected.shape[1] < 2:
        # Only one discriminant direction: show class histograms and exit.
        plt.figure()
        one_dim = pd.DataFrame({'feature': projected.ravel(),
                                'target': target})
        class_hists(one_dim, 'feature', 'target', legend=True)
        plt.title("Linear Discriminant")
        return
    names = [f'LDA {i}' for i in range(n_components)]
    fig, _ = _plot_top_pairs(projected, target, scatter_alpha, scatter_size,
                             feature_names=names,
                             random_state=random_state)
    fig.suptitle("Discriminating LDA directions")
def _plot_top_pairs(features, target, scatter_alpha='auto',
                    scatter_size='auto',
                    feature_names=None, how_many=4, additional_axes=0,
                    random_state=None):
    # Scatter the feature pairs that best separate the classes; the score in
    # each title comes from _find_scatter_plots_classification.
    pairs = _find_scatter_plots_classification(
        features, target, how_many=how_many, random_state=random_state)
    if feature_names is None:
        feature_names = [f"feature {idx}"
                         for idx in range(features.shape[1])]
    fig, axes = _make_subplots(len(pairs) + additional_axes, row_height=4)
    # zip stops at len(pairs); any additional_axes are left for the caller
    for ax, pair in zip(axes.ravel(), pairs.itertuples(index=False)):
        discrete_scatter(features[:, pair.feature0],
                         features[:, pair.feature1],
                         c=target, ax=ax, alpha=scatter_alpha,
                         s=scatter_size)
        ax.set_xlabel(feature_names[pair.feature0])
        ax.set_ylabel(feature_names[pair.feature1])
        ax.set_title(f"{pair.score:.3f}")
    return fig, axes
def _plot_univariate_classification(features, features_imp, target,
                                    drop_outliers,
                                    target_col, univariate_plot, hue_order):
    """Univariate plots (kde or histogram) of the top continuous features.

    Features are ranked by F statistics from f_classif; returns the full
    F-score array so the caller can reuse the ranking for pairwise plots.
    """
    # univariate plots
    show_top = _get_n_top(features, "continuous")
    f, p = f_classif(features_imp, target)
    top_k = np.argsort(f)[-show_top:][::-1]
    # FIXME this will fail if a feature is always
    # NaN for a particular class
    best_features = features.iloc[:, top_k].copy()
    if drop_outliers:
        for col in best_features.columns:
            inliers = _find_inliers(best_features.loc[:, col])
            # use np.nan: the np.NaN alias was removed in NumPy 2.0
            best_features[~inliers] = np.nan
    best_features[target_col] = target
    if univariate_plot == 'kde':
        df = best_features.melt(target_col)
        rows, cols = find_pretty_grid(show_top)
        g = sns.FacetGrid(df, col='variable', hue=target_col,
                          col_wrap=cols,
                          sharey=False, sharex=False, hue_order=hue_order)
        # NOTE(review): shade= is deprecated in newer seaborn (use fill=);
        # kept for compatibility with older versions — confirm pinned version
        g = g.map(sns.kdeplot, "value", shade=True)
        g.axes[0].legend()
        plt.suptitle("Continuous features by target", y=1.02)
    elif univariate_plot == 'histogram':
        # row_height = 3 if target.nunique() < 5 else 5
        n_classes = target.nunique()
        # more classes -> more stacked hists per axis -> taller rows
        row_height = n_classes * 1 if n_classes < 10 else n_classes * .5
        fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
        for i, (ind, ax) in enumerate(zip(top_k, axes.ravel())):
            # best_features is already reordered by rank, so position i
            # corresponds to the feature with score f[ind]
            class_hists(best_features, best_features.columns[i],
                        target_col, ax=ax, legend=i == 0)
            ax.set_title("F={:.2E}".format(f[ind]))
        for j in range(i + 1, axes.size):
            # turn off axis if we didn't fill last row
            axes.ravel()[j].set_axis_off()
    else:
        # format the message instead of passing a tuple to ValueError
        raise ValueError("Unknown value for univariate_plot: {}".format(
            univariate_plot))
    return f
def plot_classification_categorical(X, target_col, types=None, kind='auto',
                                    hue_order=None, **kwargs):
    """Plots for categorical features in classification.

    Creates plots of categorical variable distributions for each target class.
    Relevant features are identified via mutual information.

    For high cardinality categorical variables (variables with many categories)
    only the most frequent categories are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    kind : string, default 'auto'
        Kind of plot to show. Options are 'count', 'proportion',
        'mosaic' and 'auto'.
        Count shows raw class counts within categories
        (can be hard to read with imbalanced classes)
        Proportion shows class proportions within categories
        (can be misleading with imbalanced categories)
        Mosaic shows both aspects, but can be a bit busy.
        Auto uses mosaic plots for binary classification and counts otherwise.
    """
    types = _check_X_target_col(X, target_col, types, task="classification")
    if kind == "auto":
        # mosaic plots get unreadable with many classes
        if X[target_col].nunique() > 5:
            kind = 'count'
        else:
            kind = 'mosaic'
    features = X.loc[:, types.categorical]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)
    if features.shape[1] == 0:
        # nothing categorical to plot
        return
    features = features.astype('category')
    show_top = _get_n_top(features, "categorical")
    # can't use OrdinalEncoder because we might have mix of int and string
    ordinal_encoded = features.apply(lambda x: x.cat.codes)
    target = X[target_col]
    # NOTE(review): mask length is X.shape[1], but ordinal_encoded has only
    # the categorical columns — works today because the mask is all-True and
    # gets truncated, but ordinal_encoded.shape[1] would be correct; verify
    f = mutual_info_classif(
        ordinal_encoded, target,
        discrete_features=np.ones(X.shape[1], dtype=bool))
    top_k = np.argsort(f)[-show_top:][::-1]
    # large number of categories -> taller plot
    row_height = 3 if features.nunique().max() <= 5 else 5
    fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
    plt.suptitle("Categorical Features vs Target", y=1.02)
    for i, (col_ind, ax) in enumerate(zip(top_k, axes.ravel())):
        col = features.columns[col_ind]
        if kind == 'proportion':
            X_new = _prune_category_make_X(X, col, target_col)
            df = (X_new.groupby(col)[target_col]
                  .value_counts(normalize=True)
                  .unstack()
                  .sort_values(by=target[0]))  # hacky way to get a class name
            # NOTE(review): stacked='True' is a (truthy) string, not a bool —
            # works, but looks like a typo for stacked=True
            df.plot(kind='barh', stacked='True', ax=ax, legend=i == 0)
            ax.set_title(col)
            ax.set_ylabel(None)
        elif kind == 'mosaic':
            # how many categories make up at least 1% of data:
            n_cats = (X[col].value_counts() / len(X) > 0.01).sum()
            n_cats = np.minimum(n_cats, 20)
            X_new = _prune_category_make_X(X, col, target_col,
                                           max_categories=n_cats)
            mosaic_plot(X_new, col, target_col, ax=ax)
            ax.set_title(col)
        elif kind == 'count':
            X_new = _prune_category_make_X(X, col, target_col)
            # absolute counts
            # FIXME show f value
            # FIXME shorten titles?
            props = {}
            # shrink fonts when the legend would have many entries
            if X[target_col].nunique() > 15:
                props['font.size'] = 6
            with mpl.rc_context(props):
                sns.countplot(y=col, data=X_new, ax=ax, hue=target_col,
                              hue_order=hue_order)
            # only the first subplot keeps a legend
            if i > 0:
                ax.legend(())
        else:
            raise ValueError("Unknown plot kind {}".format(kind))
        _short_tick_names(ax)
    for j in range(i + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[j].set_axis_off()
def plot(X, y=None, target_col=None, type_hints=None, scatter_alpha='auto',
         scatter_size='auto', verbose=10, plot_pairwise=True,
         **kwargs):
    """Automatic plots for classification and regression.

    Determines whether the target is categorical or continuous and plots the
    target distribution. Then calls the relevant plotting functions
    accordingly.

    Parameters
    ----------
    X : DataFrame
        Input features. If target_col is specified, X also includes the
        target.
    y : Series or numpy array, optional.
        Target. You need to specify either y or target_col.
    target_col : string or int, optional
        Column name of target if included in X.
    type_hints : dict or None
        If dict, provide type information for columns.
        Keys are column names, values are types as provided by detect_types.
    scatter_alpha : float, default='auto'
        Alpha values for scatter plots. 'auto' is dirty hacks.
    scatter_size : float, default='auto'.
        Marker size for scatter plots. 'auto' is dirty hacks.
    plot_pairwise : bool, default=True
        Whether to include pairwise scatterplots for classification.
        These can be somewhat expensive to compute.
    verbose : int, default=10
        Controls the verbosity (output).

    See also
    --------
    plot_regression_continuous
    plot_regression_categorical
    plot_classification_continuous
    plot_classification_categorical
    """
    if ((y is None and target_col is None)
            or (y is not None) and (target_col is not None)):
        raise ValueError(
            "Need to specify exactly one of y and target_col.")
    if isinstance(y, str):
        # legacy positional use: treat the string as a column name
        warnings.warn("The second positional argument of plot is a Series 'y'."
                      " If passing a column name, use a keyword.",
                      FutureWarning)
        target_col = y
        y = None
    if target_col is None:
        # y was given separately: append it to X under a usable name
        if not isinstance(y, pd.Series):
            y = pd.Series(y)
        if y.name is None:
            y = y.rename('target')
        target_col = y.name
        X = pd.concat([X, y], axis=1)
    X, types = clean(X, type_hints=type_hints, return_types=True,
                     target_col=target_col)
    types = _check_X_target_col(X, target_col, types=types)
    # low_cardinality integers plot better as categorical
    # FIXME the logic should be down in the plotting functions maybe
    # or at least passed on so we can do better.
    if types.low_card_int.any():
        for col in types.index[types.low_card_int]:
            # kinda hacky for now
            if guess_ordinal(X[col]):
                types.loc[col, 'low_card_int'] = False
                types.loc[col, 'continuous'] = True
            else:
                types.loc[col, 'low_card_int'] = False
                types.loc[col, 'categorical'] = True

    if types.continuous[target_col]:
        print("Target looks like regression")
        # FIXME we might be overwriting the original dataframe here?
        # BUG FIX: np.float was removed in NumPy 1.24; builtin float is
        # the documented equivalent
        X[target_col] = X[target_col].astype(float)
        # regression
        # make sure we include the target column in X
        # even though it's not categorical
        plt.hist(X[target_col], bins='auto')
        plt.xlabel(target_col)
        plt.ylabel("frequency")
        plt.title("Target distribution")
        scatter_alpha = _get_scatter_alpha(scatter_alpha, X[target_col])
        scatter_size = _get_scatter_size(scatter_size, X[target_col])
        plot_regression_continuous(X, target_col, types=types,
                                   scatter_alpha=scatter_alpha,
                                   scatter_size=scatter_size, **kwargs)
        plot_regression_categorical(X, target_col, types=types, **kwargs)
    else:
        print("Target looks like classification")
        # regression
        # make sure we include the target column in X
        # even though it's not categorical
        plt.figure()
        counts = pd.DataFrame(X[target_col].value_counts())
        melted = counts.T.melt().rename(
            columns={'variable': 'class', 'value': 'count'})
        # class could be a string that's a float
        # seaborn is trying to be smart unless we declare it categorical
        # we actually fixed counts to have categorical index
        # but melt destroys it:
        # https://github.com/pandas-dev/pandas/issues/15853
        melted['class'] = melted['class'].astype('category')
        sns.barplot(y='class', x='count', data=melted)
        plt.title("Target distribution")
        if len(counts) >= 50:
            # BUG FIX: the message promised not to plot, but the code fell
            # through and plotted anyway — return here (also added the
            # missing space after "more.")
            print("Not plotting anything for 50 classes or more. "
                  "Current visualizations are quite useless for"
                  " this many classes. Try slicing the data.")
            return
        plot_classification_continuous(
            X, target_col, types=types, hue_order=counts.index,
            scatter_alpha=scatter_alpha, scatter_size=scatter_size,
            plot_pairwise=plot_pairwise, **kwargs)
        plot_classification_categorical(X, target_col, types=types,
                                        hue_order=counts.index, **kwargs)