Source code for dabl.plot.utils

from warnings import warn
from functools import reduce
import itertools


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from seaborn.utils import despine


# from sklearn.dummy import DummyClassifier
# from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve

from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit


from ..preprocessing import detect_types


def find_pretty_grid(n_plots, max_cols=5):
    """Determine a good grid shape for subplots.

    Tries to find a way to arrange n_plots many subplots on a grid in a way
    that fills as many grid-cells as possible, while keeping the number
    of rows low and the number of columns below max_cols.

    Parameters
    ----------
    n_plots : int
        Number of plots to arrange.
    max_cols : int, default=5
        Maximum number of columns.

    Returns
    -------
    n_rows : int
        Number of rows in grid.
    n_cols : int
        Number of columns in grid.

    Examples
    --------
    >>> find_pretty_grid(16, 5)
    (4, 4)
    >>> find_pretty_grid(11, 5)
    (3, 4)
    >>> find_pretty_grid(10, 5)
    (2, 5)
    """
    # we could probably do something with prime numbers here
    # but looks like that becomes a combinatorial problem again?
    if n_plots % max_cols == 0:
        # perfect fit!
        # if max_cols is 6 do we prefer 6x1 over 3x2?
        return int(n_plots / max_cols), max_cols
    # min number of rows needed
    min_rows = int(np.ceil(n_plots / max_cols))
    best_empty = max_cols
    best_cols = max_cols
    for cols in range(max_cols, min_rows - 1, -1):
        # we only allow getting narrower if we have more cols than rows
        remainder = (n_plots % cols)
        empty = cols - remainder if remainder != 0 else 0
        if empty == 0:
            return int(n_plots / cols), cols
        if empty < best_empty:
            best_empty = empty
            best_cols = cols
    return int(np.ceil(n_plots / best_cols)), best_cols
def plot_coefficients(coefficients, feature_names, n_top_features=10,
                      classname=None, ax=None):
    """Visualize coefficients of a linear model.

    Parameters
    ----------
    coefficients : nd-array, shape (n_features,)
        Model coefficients.
    feature_names : list or nd-array of strings, shape (n_features,)
        Feature names for labeling the coefficients.
    n_top_features : int, default=10
        How many features to show: the n_top_features coefficients with the
        largest absolute values are selected.
    classname : string, default=None
        Used as the plot title.
    ax : matplotlib axes, default=None
        Axes to plot into. If None, a new figure is created.
    """
    coefficients = coefficients.squeeze()
    feature_names = np.asarray(feature_names)
    if coefficients.ndim > 1:
        # this is not a row or column vector
        raise ValueError("coefficients must be 1d array or column vector, got"
                         " shape {}".format(coefficients.shape))
    coefficients = coefficients.ravel()

    if len(coefficients) != len(feature_names):
        raise ValueError("Number of coefficients {} doesn't match number of"
                         " feature names {}.".format(len(coefficients),
                                                     len(feature_names)))
    # get coefficients with large absolute values
    coef = coefficients.ravel()
    mask = coef != 0
    coef = coef[mask]
    feature_names = feature_names[mask]
    # FIXME this could be easier with pandas by sorting by a column
    interesting_coefficients = np.argsort(np.abs(coef))[-n_top_features:]
    new_inds = np.argsort(coef[interesting_coefficients])
    interesting_coefficients = interesting_coefficients[new_inds]

    # plot them
    if ax is None:
        plt.figure(figsize=(len(interesting_coefficients), 5))
        ax = plt.gca()
    colors = ['red' if c < 0 else 'blue'
              for c in coef[interesting_coefficients]]
    ax.bar(np.arange(len(interesting_coefficients)),
           coef[interesting_coefficients],
           color=colors)
    feature_names = np.array(feature_names)
    ax.set_xticks(np.arange(0, len(interesting_coefficients)))
    ax.set_xticklabels(feature_names[interesting_coefficients],
                       rotation=60, ha="right")
    ax.set_ylabel("Coefficient magnitude")
    ax.set_xlabel("Feature")
    ax.set_title(classname)
    return feature_names[interesting_coefficients]
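# Usage sketch (added for illustration, not part of the original module):
# plot_coefficients takes a coefficient array and matching feature names,
# e.g. from a fitted scikit-learn linear model. The dataset and estimator
# below are illustrative assumptions.
def _example_plot_coefficients():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X, y = make_classification(n_features=30, random_state=0)
    names = ["feature_{}".format(i) for i in range(X.shape[1])]
    lr = LogisticRegression().fit(X, y)
    # lr.coef_ has shape (1, n_features); plot_coefficients squeezes it
    plot_coefficients(lr.coef_, names, n_top_features=10)
    plt.show()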
def heatmap(values, xlabel, ylabel, xticklabels, yticklabels, cmap=None,
            vmin=None, vmax=None, ax=None, fmt="%0.2f", origin='lower'):
    """Plot a 2d array as a heatmap, annotating each cell with its value."""
    if ax is None:
        ax = plt.gca()
    img = ax.pcolor(values, cmap=cmap, vmin=vmin, vmax=vmax)
    img.update_scalarmappable()
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xticks(np.arange(len(xticklabels)) + .5)
    ax.set_yticks(np.arange(len(yticklabels)) + .5)
    ax.set_xticklabels(xticklabels)
    ax.set_yticklabels(yticklabels)
    ax.set_aspect(1)
    if origin == 'upper':
        ylim = ax.get_ylim()
        ax.set_ylim(ylim[::-1])

    for p, color, value in zip(img.get_paths(), img.get_facecolors(),
                               img.get_array()):
        x, y = p.vertices[:-2, :].mean(0)
        # pick black or white text depending on cell brightness
        if np.mean(color[:3]) > 0.5:
            c = 'k'
        else:
            c = 'w'
        ax.text(x, y, fmt % value, color=c, ha="center", va="center")
    return img


def _shortname(some_string, maxlen=20):
    """Shorten a string given a maximum length.

    Longer strings will be shortened and the rest replaced by ...

    Parameters
    ----------
    some_string : string
        Input string to shorten
    maxlen : int, default=20

    Returns
    -------
    return_string : string
        Output string of size ``min(len(some_string), maxlen)``.
    """
    some_string = str(some_string)
    if len(some_string) > maxlen:
        return some_string[:maxlen - 3] + "..."
    else:
        return some_string
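# Usage sketch (added for illustration, not part of the original module):
# heatmap annotates every cell of a 2d array, e.g. a small confusion
# matrix; the values below are made up.
def _example_heatmap():
    confusion = np.array([[12, 3], [1, 14]])
    heatmap(confusion, xlabel="predicted", ylabel="true",
            xticklabels=["no", "yes"], yticklabels=["no", "yes"],
            fmt="%d", origin='upper')
    plt.show()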
def mosaic_plot(data, rows, cols, vary_lightness=False, ax=None):
    """Create a mosaic plot from a dataframe.

    Right now only horizontal mosaic plots are supported,
    i.e. rows are prioritized over columns.

    Parameters
    ----------
    data : pandas data frame
        Data to tabulate.
    rows : column specifier
        Column in data to tabulate across rows.
    cols : column specifier
        Column in data to use to subpartition rows.
    vary_lightness : bool, default=False
        Whether to vary lightness across categories.
    ax : matplotlib axes or None
        Axes to plot into.
    """
    cont = pd.crosstab(data[cols], data[rows])
    sort = np.argsort((cont / cont.sum()).iloc[0])
    cont = cont.iloc[:, sort]
    if ax is None:
        ax = plt.gca()
    pos_y = 0
    positions_y = []
    n_cols = cont.shape[1]
    for i, col in enumerate(cont.columns):
        height = cont[col].sum()
        positions_y.append(pos_y + height / 2)

        pos_x = 0
        for j, row in enumerate(cont[col]):
            width = row / height
            color = plt.cm.tab10(j)
            if vary_lightness:
                color = _lighten_color(color, (i + 1) / (n_cols + 1))
            rect = Rectangle((pos_x, pos_y), width, height, edgecolor='k',
                             facecolor=color)
            pos_x += width
            ax.add_patch(rect)
        pos_y += height

    ax.set_ylim(0, pos_y)
    ax.set_yticks(positions_y)
    ax.set_yticklabels(cont.columns)
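# Usage sketch (added for illustration, not part of the original module):
# mosaic_plot tabulates two categorical columns; the toy dataframe is an
# illustrative assumption.
def _example_mosaic_plot():
    df = pd.DataFrame({'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'M'],
                       'survived': ['no', 'yes', 'no', 'yes',
                                    'yes', 'yes', 'no', 'no']})
    mosaic_plot(df, rows='survived', cols='gender', vary_lightness=True)
    plt.show()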
def _lighten_color(color, amount=0.5):
    """Lighten the given color by multiplying (1 - luminosity) by the given
    amount.

    Input can be a matplotlib color string, hex string, or RGB tuple.
    https://stackoverflow.com/questions/37765197/darken-or-lighten-a-color-in-matplotlib

    Examples:
    >> _lighten_color('g', 0.3)
    >> _lighten_color('#F034A3', 0.6)
    >> _lighten_color((.3, .55, .1), 0.5)
    """
    import matplotlib.colors as mc
    import colorsys
    c = color
    amount += 0.5
    c = colorsys.rgb_to_hls(*mc.to_rgb(c))
    return colorsys.hls_to_rgb(c[0], 1 - amount * (1 - c[1]), c[2])


def _get_n_top(features, name):
    if features.shape[1] > 20:
        print("Showing only top 10 of {} {} features".format(
            features.shape[1], name))
        # too many features, show just top 10
        show_top = 10
    else:
        show_top = features.shape[1]
    return show_top


def _prune_categories(series, max_categories=10):
    series = series.astype('category')
    small_categories = series.value_counts()[max_categories:].index
    res = series.cat.remove_categories(small_categories)
    res = res.cat.add_categories(['dabl_other']).fillna("dabl_other")
    return res


def _prune_category_make_X(X, col, target_col, max_categories=20):
    col_values = X[col]
    if col_values.nunique() > max_categories:
        # keep only top 10 categories if there are more than 20
        col_values = _prune_categories(col_values,
                                       max_categories=min(10, max_categories))
        X_new = X[[target_col]].copy()
        X_new[col] = col_values
    else:
        X_new = X.copy()
        X_new[col] = X_new[col].astype('category')
    return X_new


def _fill_missing_categorical(X):
    # fill in missing values in categorical variables with new category
    # ensure we use strings for object columns and number for integers
    X = X.copy()
    max_value = X.max(numeric_only=True).max()
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col].fillna("dabl_missing", inplace=True)
        else:
            X[col].fillna(max_value + 1, inplace=True)
    return X
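# Behavior sketch (added for illustration, not part of the original module):
# _prune_categories keeps the most frequent categories and lumps the rest
# into a 'dabl_other' category.
def _example_prune_categories():
    s = pd.Series(list("aaabbc"))
    # 'a' and 'b' survive, 'c' is replaced by 'dabl_other'
    print(_prune_categories(s, max_categories=2))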
def _make_subplots(n_plots, max_cols=5, row_height=3):
    """Create a harmonious subplot grid.
    """
    n_rows, n_cols = find_pretty_grid(n_plots, max_cols=max_cols)
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(4 * n_cols, row_height * n_rows),
                             constrained_layout=True)
    # we don't want ravel to fail, this is awkward!
    axes = np.atleast_2d(axes)
    return fig, axes


def _check_X_target_col(X, target_col, types=None, type_hints=None,
                        task=None):
    if types is None:
        types = detect_types(X, type_hints=type_hints)
    if (not isinstance(target_col, str) and hasattr(target_col, '__len__')
            and len(target_col) > 1):
        raise ValueError("target_col should be a column in X, "
                         "got {}".format(target_col))
    if target_col not in X.columns:
        raise ValueError("{} is not a valid column of X".format(target_col))

    if X[target_col].nunique() < 2:
        raise ValueError("Less than two classes present, {}, need at least"
                         " two for classification.".format(
                             X.loc[0, target_col]))
    # FIXME we get target types here with detect_types,
    # but in the estimator with type_of_target
    if task == "classification" and not types.loc[target_col, 'categorical']:
        raise ValueError("Type for target column {} detected as {},"
                         " need categorical for classification.".format(
                             target_col, types.T.idxmax()[target_col]))
    if task == "regression" and (not types.loc[target_col, 'continuous']):
        raise ValueError("Type for target column {} detected as {},"
                         " need continuous for regression.".format(
                             target_col, types.T.idxmax()[target_col]))
    return types


def _short_tick_names(ax):
    ax.set_yticklabels([_shortname(t.get_text(), maxlen=10)
                        for t in ax.get_yticklabels()])
    ax.set_xlabel(_shortname(ax.get_xlabel(), maxlen=20))
    ax.set_ylabel(_shortname(ax.get_ylabel(), maxlen=20))


def _find_scatter_plots_classification(X, target, how_many=3,
                                       random_state=None):
    # input is continuous
    # look at all pairs of features, find most promising ones
    # dummy = DummyClassifier(strategy='prior').fit(X, target)
    # baseline_score = recall_score(target, dummy.predict(X), average='macro')
    scores = []
    # converting to int here might save some time
    _, target = np.unique(target, return_inverse=True)
    # limit to 2000 training points for speed?
    train_size = min(2000, int(.9 * X.shape[0]))
    cv = StratifiedShuffleSplit(n_splits=3, train_size=train_size,
                                random_state=random_state)
    for i, j in itertools.combinations(np.arange(X.shape[1]), 2):
        this_X = X[:, [i, j]]
        # assume this tree is simple enough not to overfit in 2d,
        # so we don't bother with a train/test split
        tree = DecisionTreeClassifier(max_leaf_nodes=8)
        scores.append((i, j, np.mean(cross_val_score(
            tree, this_X, target, cv=cv, scoring='recall_macro'))))

    scores = pd.DataFrame(scores, columns=['feature0', 'feature1', 'score'])
    top = scores.sort_values(by='score').iloc[-how_many:][::-1]
    return top
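# Usage sketch (added for illustration, not part of the original module):
# ranking feature pairs on the iris data; X must be a numpy array of
# continuous features.
def _example_find_scatter_plots():
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    top = _find_scatter_plots_classification(X, y, random_state=0)
    # dataframe with columns feature0, feature1, score, best pairs first
    print(top)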
def discrete_scatter(x, y, c, unique_c=None, legend='first',
                     clip_outliers=True, alpha='auto', s='auto', ax=None,
                     **kwargs):
    """Scatter plot for categories.

    Creates a scatter plot for x and y grouped by c.

    Parameters
    ----------
    x : array-like
        x coordinates to scatter
    y : array-like
        y coordinates to scatter
    c : array-like
        Grouping of samples (similar to hue in seaborn)
    unique_c : array-like, default=None
        Unique values of c to plot; inferred from c if None.
    legend : bool, or "first", default="first"
        Whether to create a legend. "first" means only the first one in a
        given gridspec.
    clip_outliers : bool, default=True
        Whether to restrict axis limits to the inlier range of x and y.
    alpha : float, default='auto'
        Alpha values for scatter plots. 'auto' picks a heuristic value based
        on the number of samples.
    s : float, default='auto'
        Marker size for scatter plots. 'auto' picks a heuristic value based
        on the number of samples.
    ax : matplotlib axes, default=None
        Axes to plot into
    kwargs :
        Passed through to plt.scatter
    """
    alpha = _get_scatter_alpha(alpha, x)
    s = _get_scatter_size(s, x)
    if ax is None:
        ax = plt.gca()
    if legend == "first":
        legend = (ax.get_geometry()[2] == 1)
    if unique_c is None:
        unique_c = np.unique(c)
    for i in unique_c:
        mask = c == i
        ax.scatter(x[mask], y[mask], label=i, s=s, alpha=alpha, **kwargs)
    if clip_outliers:
        x_low, x_high = _inlier_range(x)
        y_low, y_high = _inlier_range(y)
        xlims = ax.get_xlim()
        ylims = ax.get_ylim()
        ax.set_xlim(max(x_low, xlims[0]), min(x_high, xlims[1]))
        ax.set_ylim(max(y_low, ylims[0]), min(y_high, ylims[1]))
    if legend:
        props = {}
        if len(unique_c) > 15:
            props['size'] = 6
        legend = ax.legend(prop=props)
        for handle in legend.legendHandles:
            handle.set_alpha(1)
            handle.set_sizes((100,))
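# Usage sketch (added for illustration, not part of the original module):
# scattering two numeric arrays grouped by a label array; the random data
# is an illustrative assumption.
def _example_discrete_scatter():
    rng = np.random.RandomState(0)
    x = rng.randn(200)
    y = x + rng.randn(200)
    c = np.where(x > 0, "right", "left")
    discrete_scatter(x, y, c, legend=True)
    plt.show()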
def class_hists(data, column, target, bins="auto", ax=None, legend=False,
                scale_separately=True):
    """Grouped univariate histograms.

    Parameters
    ----------
    data : pandas DataFrame
        Input data to plot
    column : column specifier
        Column in the data to compute histograms over (must be continuous).
    target : column specifier
        Target column in data, must be categorical.
    bins : string, int or array-like
        Number of bins, 'auto' or bin edges. Passed to
        np.histogram_bin_edges. We always show at least 5 bins for now.
    ax : matplotlib axes
        Axes to plot into
    legend : boolean, default=False
        Whether to create a legend.
    scale_separately : boolean, default=True
        Whether to scale each class separately.

    Examples
    --------
    >>> from dabl.datasets import load_adult
    >>> data = load_adult()
    >>> class_hists(data, "age", "gender", legend=True)
    <matplotlib...
    """
    col_data = data[column].dropna()
    if ax is None:
        ax = plt.gca()
    if col_data.nunique() > 10:
        ordinal = False
        # histograms
        bin_edges = np.histogram_bin_edges(col_data, bins=bins)
        if len(bin_edges) > 30:
            bin_edges = np.histogram_bin_edges(col_data, bins=30)

        counts = {}
        for name, group in data.groupby(target)[column]:
            this_counts, _ = np.histogram(group, bins=bin_edges)
            counts[name] = this_counts
        counts = pd.DataFrame(counts)
    else:
        ordinal = True
        # ordinal data, count distinct values
        counts = data.groupby(target)[column].value_counts().unstack(target)
    if scale_separately:
        # normalize by maximum
        counts = counts / counts.max()
    bottom = counts.max().max() * 1.1
    for i, name in enumerate(counts.columns):
        if ordinal:
            ax.bar(range(counts.shape[0]), counts[name], width=.9,
                   bottom=bottom * i, tick_label=counts.index, linewidth=2,
                   edgecolor='k')
            xmin, xmax = 0 - .5, counts.shape[0] - .5
        else:
            ax.bar(bin_edges[:-1], counts[name], bottom=bottom * i,
                   label=name, align='edge',
                   width=(bin_edges[1] - bin_edges[0]) * .9)
            xmin, xmax = bin_edges[0], bin_edges[-1]
        ax.hlines(bottom * i, xmin=xmin, xmax=xmax, linewidth=1)
    if legend:
        ax.legend()
    ax.set_yticks(())
    ax.set_xlabel(column)
    return ax
def pairplot(data, target_col, columns=None, scatter_alpha='auto',
             scatter_size='auto'):
    """Pairplot (scattermatrix).

    Because there are already too many implementations of this.
    This is meant for classification only.
    This is very bare-bones right now :-/

    Parameters
    ----------
    data : pandas dataframe
        Input data
    target_col : column specifier
        Target column in data.
    columns : column specifiers, default=None.
        Columns in data to include. None means all.
    scatter_alpha : float, default='auto'
        Alpha values for scatter plots. 'auto' picks a heuristic value based
        on the number of samples.
    scatter_size : float, default='auto'
        Marker size for scatter plots. 'auto' picks a heuristic value based
        on the number of samples.
    """
    if columns is None:
        columns = data.columns.drop(target_col)
    n_features = len(columns)
    fig, axes = plt.subplots(n_features, n_features,
                             figsize=(n_features * 3, n_features * 3))
    axes = np.atleast_2d(axes)
    for ax, (i, j) in zip(axes.ravel(),
                          itertools.product(range(n_features), repeat=2)):
        legend = i == 0 and j == n_features - 1
        if i == j:
            class_hists(data, columns[i], target_col, ax=ax.twinx())
        else:
            discrete_scatter(data[columns[j]], data[columns[i]],
                             c=data[target_col], legend=legend, ax=ax,
                             alpha=scatter_alpha, s=scatter_size)
        if j == 0:
            ax.set_ylabel(columns[i])
        else:
            ax.set_ylabel("")
            ax.set_yticklabels(())
        if i == n_features - 1:
            ax.set_xlabel(columns[j])
        else:
            ax.set_xlabel("")
            ax.set_xticklabels(())
    despine(fig)
    if n_features > 1:
        axes[0, 0].set_yticks(axes[0, 1].get_yticks())
        axes[0, 0].set_ylim(axes[0, 1].get_ylim())
    return axes


def _inlier_range(series):
    low = np.nanquantile(series, 0.01)
    high = np.nanquantile(series, 0.99)
    assert low <= high
    # the two is a complete hack
    inner_range = (high - low) / 2
    return low - inner_range, high + inner_range


def _find_inliers(series):
    low, high = _inlier_range(series)
    mask = series.between(low, high)
    mask = mask | series.isna()
    dropped = len(mask) - mask.sum()
    if dropped > 0:
        warn("Dropped {} outliers in column {}.".format(
            int(dropped), series.name), UserWarning)
    return mask


def _clean_outliers(data):
    def _find_outliers_series(series):
        series = series.dropna()
        low = series.quantile(0.01)
        high = series.quantile(0.99)
        # the two is a complete hack
        inner_range = (high - low) / 2
        return series.between(low - inner_range, high + inner_range)

    mask = data.apply(_find_outliers_series)
    mask = mask.apply(lambda x: reduce(np.logical_and, x),
                      axis=1).fillna(True)
    dropped = len(mask) - mask.sum()
    if dropped > 0:
        warn("Dropped {} outliers.".format(int(dropped)), UserWarning)
        return mask
    return None


def _get_scatter_alpha(scatter_alpha, x):
    if scatter_alpha != "auto":
        return scatter_alpha
    if x.shape[0] < 100:
        return .9
    elif x.shape[0] < 1000:
        return .5
    elif x.shape[0] < 10000:
        return .2
    else:
        return .1


def _get_scatter_size(scatter_size, x):
    if scatter_size != "auto":
        return scatter_size
    if x.shape[0] < 100:
        return 30
    elif x.shape[0] < 1000:
        return 30
    elif x.shape[0] < 2000:
        return 10
    elif x.shape[0] < 10000:
        return 2
    else:
        return 1


def plot_multiclass_roc_curve(estimator, X_val, y_val):
    if len(estimator.classes_) < 3:
        raise ValueError("Only for multi-class")
    try:
        y_score = estimator.predict_proba(X_val)
    except AttributeError:
        y_score = estimator.decision_function(X_val)
    fig, axes = _make_subplots(len(estimator.classes_))
    for i, (ax, c) in enumerate(zip(axes.ravel(), estimator.classes_)):
        # one-vs-rest ROC curve for each class
        fpr, tpr, _ = roc_curve(y_val == c, y_score[:, i])
        ax.plot(fpr, tpr)
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate (recall)")
        ax.set_title("ROC curve for class {}".format(c))
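# Usage sketch (added for illustration, not part of the original module):
# a pairplot of three wine features; assumes scikit-learn >= 0.23 for
# as_frame=True.
def _example_pairplot():
    from sklearn.datasets import load_wine
    data = load_wine(as_frame=True).frame
    cols = ["alcohol", "flavanoids", "color_intensity"]
    pairplot(data, target_col="target", columns=cols)
    plt.show()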