import random

import matplotlib.lines as mlines
import matplotlib.patches as patches
import numpy as np

from pandas.core.dtypes.missing import notna

from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.style import _get_standard_colors
from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots


def scatter_matrix(
    frame,
    alpha=0.5,
    figsize=None,
    ax=None,
    grid=False,
    diagonal="hist",
    marker=".",
    density_kwds=None,
    hist_kwds=None,
    range_padding=0.05,
    **kwds,
):
    """
    Draw a matrix of pairwise scatter plots for the columns of ``frame``.

    The columns used are those returned by ``frame._get_numeric_data()``.
    Cell ``(i, j)`` plots column ``j`` on the x-axis against column ``i`` on
    the y-axis; diagonal cells show the distribution of a single column.

    Parameters
    ----------
    frame : DataFrame
        Source data.
    alpha : float, default 0.5
        Passed as ``alpha`` to every off-diagonal ``ax.scatter`` call.
    figsize : optional
        Forwarded to ``_subplots``.
    ax : optional
        Forwarded to ``_subplots``.
    grid : bool, default False
        Accepted for interface compatibility; not read by this function.
    diagonal : str, default "hist"
        ``"hist"`` draws a histogram on the diagonal, ``"kde"`` or
        ``"density"`` draws a Gaussian kernel density estimate (requires
        scipy). Any other value leaves the diagonal cells without a plot.
    marker : str, default "."
        Scatter marker. Replaced by ``"o"`` when it is not a key of
        ``matplotlib.lines.lineMarkers``.
    density_kwds : dict, optional
        Extra keyword arguments for the density ``ax.plot`` call.
    hist_kwds : dict, optional
        Extra keyword arguments for the ``ax.hist`` call.
    range_padding : float, default 0.05
        Fraction of each column's value span that is added to the axis
        limits, half on each side.
    **kwds
        Extra keyword arguments for ``ax.scatter``. ``edgecolors`` defaults
        to ``"none"`` when not supplied.

    Returns
    -------
    The axes array produced by ``_subplots``, indexed as ``axes[i, j]``.
    """
    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault("edgecolors", "none")

    # Per-column axis limits: the non-missing value range widened by
    # ``range_padding / 2`` of the span on each side.
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == "hist":
                    ax.hist(values, **hist_kwds)

                elif diagonal in ("kde", "density"):
                    # Imported lazily so scipy is only needed for this mode.
                    from scipy.stats import gaussian_kde

                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                # Only rows where both columns are present can be plotted.
                common = (mask[a] & mask[b]).values

                ax.scatter(
                    df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds
                )

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            # Keep axis decorations only on the left column and bottom row.
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        # Re-tick the top-left cell's y-axis: take the major tick locations of
        # its right-hand neighbour that fall inside the first column's limits,
        # map them linearly into the top-left cell's own y-range, and label
        # them with the neighbour's values.
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes


def _get_marker_compat(marker):
    """Return ``marker`` if matplotlib knows it as a line marker, else ``"o"``."""
    if marker not in mlines.lineMarkers:
        return "o"
    return marker


def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):
    """
    Draw a RadViz plot of ``frame`` grouped by ``class_column``.

    Every column other than ``class_column`` is min-max normalized and given
    an anchor point evenly spaced on the unit circle. Each row is plotted at
    the sum of the anchors weighted by the row's normalized values, divided
    by the sum of those values.

    Parameters
    ----------
    frame : DataFrame
        Source data.
    class_column : label
        Column holding the class of each row.
    ax : matplotlib axes, optional
        Axes to draw on. When omitted, the current pyplot axes is used and
        its x and y limits are set to ``[-1, 1]``.
    color : optional
        Forwarded to ``_get_standard_colors``.
    colormap : optional
        Forwarded to ``_get_standard_colors``.
    **kwds
        Extra keyword arguments for each ``ax.scatter`` call.

    Returns
    -------
    The axes that was drawn on.
    """
    import matplotlib.pyplot as plt

    def normalize(series):
        # Min-max scale; a constant column makes the denominator zero.
        a = min(series)
        b = max(series)
        return (series - a) / (b - a)

    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]
    df = frame.drop(class_column, axis=1).apply(normalize)

    if ax is None:
        # Passing keyword arguments to ``plt.gca`` is deprecated since
        # matplotlib 3.4, so the limits are applied explicitly.
        ax = plt.gca()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)

    to_plot = {}
    colors = _get_standard_colors(
        num_colors=len(classes), colormap=colormap, color_type="random", color=color
    )

    for kls in classes:
        to_plot[kls] = [[], []]

    # One anchor per non-class column, evenly spaced around the unit circle.
    m = len(frame.columns) - 1
    s = np.array(
        [
            (np.cos(t), np.sin(t))
            for t in [2.0 * np.pi * (i / float(m)) for i in range(m)]
        ]
    )

    for i in range(n):
        row = df.iloc[i].values
        # Duplicate the row into two columns so it broadcasts against the
        # (x, y) anchor coordinates.
        row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
        y = (s * row_).sum(axis=0) / row.sum()
        kls = class_col.iat[i]
        to_plot[kls][0].append(y[0])
        to_plot[kls][1].append(y[1])

    for i, kls in enumerate(classes):
        ax.scatter(
            to_plot[kls][0],
            to_plot[kls][1],
            color=colors[i],
            label=pprint_thing(kls),
            **kwds,
        )
    ax.legend()

    ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none"))

    for xy, name in zip(s, df.columns):

        ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray"))

        # Offset and align each label away from the circle centre, depending
        # on the quadrant the anchor lies in.
        if xy[0] < 0.0 and xy[1] < 0.0:
            ax.text(
                xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small"
            )
        elif xy[0] < 0.0 and xy[1] >= 0.0:
            ax.text(
                xy[0] - 0.025,
                xy[1] + 0.025,
                name,
                ha="right",
                va="bottom",
                size="small",
            )
        elif xy[0] >= 0.0 and xy[1] < 0.0:
            ax.text(
                xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small"
            )
        elif xy[0] >= 0.0 and xy[1] >= 0.0:
            ax.text(
                xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small"
            )

    ax.axis("equal")
    return ax


def andrews_curves(
    frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds
):
    """
    Draw one Andrews curve per row of ``frame``, colored by class.

    For a row ``(x1, x2, x3, ...)`` the curve is
    ``x1 / sqrt(2) + x2 * sin(t) + x3 * cos(t) + x4 * sin(2t) + ...``
    evaluated at ``samples`` evenly spaced points of ``[-pi, pi]``.

    Parameters
    ----------
    frame : DataFrame
        Source data.
    class_column : label
        Column holding the class of each row.
    ax : matplotlib axes, optional
        Axes to draw on. When omitted, the current pyplot axes is used and
        its x limits are set to ``(-pi, pi)``.
    samples : int, default 200
        Number of points at which each curve is evaluated.
    color : optional
        Forwarded to ``_get_standard_colors``.
    colormap : optional
        Forwarded to ``_get_standard_colors``.
    **kwds
        Extra keyword arguments for each ``ax.plot`` call.

    Returns
    -------
    The axes that was drawn on.
    """
    import matplotlib.pyplot as plt

    def function(amplitudes):
        def f(t):
            x1 = amplitudes[0]
            result = x1 / np.sqrt(2.0)

            # Take the rest of the coefficients and resize them
            # appropriately. Take a copy of amplitudes as otherwise numpy
            # deletes the element from amplitudes itself.
            coeffs = np.delete(np.copy(amplitudes), 0)
            # Pair the coefficients as (sin, cos) rows; an odd count is
            # padded with a zero by the in-place resize.
            coeffs.resize(int((coeffs.size + 1) / 2), 2)

            # Generate the harmonics and arguments for the sin and cos
            # functions.
            harmonics = np.arange(0, coeffs.shape[0]) + 1
            trig_args = np.outer(harmonics, t)

            result += np.sum(
                coeffs[:, 0, np.newaxis] * np.sin(trig_args)
                + coeffs[:, 1, np.newaxis] * np.cos(trig_args),
                axis=0,
            )
            return result

        return f

    n = len(frame)
    class_col = frame[class_column]
    classes = frame[class_column].drop_duplicates()
    df = frame.drop(class_column, axis=1)
    t = np.linspace(-np.pi, np.pi, samples)
    used_legends = set()

    color_values = _get_standard_colors(
        num_colors=len(classes), colormap=colormap, color_type="random", color=color
    )
    colors = dict(zip(classes, color_values))
    if ax is None:
        # Passing keyword arguments to ``plt.gca`` is deprecated since
        # matplotlib 3.4, so the limits are applied explicitly.
        ax = plt.gca()
        ax.set_xlim(-np.pi, np.pi)
    for i in range(n):
        row = df.iloc[i].values
        f = function(row)
        y = f(t)
        kls = class_col.iat[i]
        label = pprint_thing(kls)
        # Label only the first curve of each class so the legend has one
        # entry per class.
        if label not in used_legends:
            used_legends.add(label)
            ax.plot(t, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(t, y, color=colors[kls], **kwds)

    ax.legend(loc="upper right")
    ax.grid()
    return ax


def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
    """
    Plot the mean, median and midrange of random subsamples of ``series``.

    ``samples`` subsamples of ``size`` elements each are drawn with
    ``random.sample`` (i.e. without replacement within a subsample). The top
    row of the 2x3 figure plots each statistic against the subsample index;
    the bottom row shows a histogram of each statistic.

    Parameters
    ----------
    series : Series
        Source data; ``series.values`` is sampled.
    fig : matplotlib figure, optional
        Figure to draw on. A new one is created when omitted.
    size : int, default 50
        Number of elements per subsample. ``random.sample`` raises
        ``ValueError`` if this exceeds the number of values.
    samples : int, default 500
        Number of subsamples to draw.
    **kwds
        Extra keyword arguments passed to every ``plot`` and ``hist`` call.

    Returns
    -------
    The figure that was drawn on.
    """
    import matplotlib.pyplot as plt

    # TODO: is the failure mentioned below still relevant?
    # random.sample(ndarray, int) fails on python 3.3, sigh
    data = list(series.values)
    samplings = [random.sample(data, size) for _ in range(samples)]

    means = np.array([np.mean(sampling) for sampling in samplings])
    medians = np.array([np.median(sampling) for sampling in samplings])
    midranges = np.array(
        [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings]
    )
    if fig is None:
        fig = plt.figure()
    x = list(range(samples))
    axes = []
    ax1 = fig.add_subplot(2, 3, 1)
    ax1.set_xlabel("Sample")
    axes.append(ax1)
    ax1.plot(x, means, **kwds)
    ax2 = fig.add_subplot(2, 3, 2)
    ax2.set_xlabel("Sample")
    axes.append(ax2)
    ax2.plot(x, medians, **kwds)
    ax3 = fig.add_subplot(2, 3, 3)
    ax3.set_xlabel("Sample")
    axes.append(ax3)
    ax3.plot(x, midranges, **kwds)
    ax4 = fig.add_subplot(2, 3, 4)
    ax4.set_xlabel("Mean")
    axes.append(ax4)
    ax4.hist(means, **kwds)
    ax5 = fig.add_subplot(2, 3, 5)
    ax5.set_xlabel("Median")
    axes.append(ax5)
    ax5.hist(medians, **kwds)
    ax6 = fig.add_subplot(2, 3, 6)
    ax6.set_xlabel("Midrange")
    axes.append(ax6)
    ax6.hist(midranges, **kwds)
    for axis in axes:
        plt.setp(axis.get_xticklabels(), fontsize=8)
        plt.setp(axis.get_yticklabels(), fontsize=8)
    # Lay out the figure we drew on; ``plt.tight_layout()`` would act on
    # pyplot's current figure, which need not be a caller-supplied ``fig``.
    fig.tight_layout()
    return fig


def parallel_coordinates(
    frame,
    class_column,
    cols=None,
    ax=None,
    color=None,
    use_columns=False,
    xticks=None,
    colormap=None,
    axvlines=True,
    axvlines_kwds=None,
    sort_labels=False,
    **kwds,
):
    """
    Draw a parallel-coordinates plot: one polyline per row of ``frame``.

    Parameters
    ----------
    frame : DataFrame
        Source data.
    class_column : label
        Column holding the class of each row.
    cols : list, optional
        Columns to plot. Defaults to every column except ``class_column``.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current pyplot axes.
    color : optional
        Forwarded to ``_get_standard_colors``.
    use_columns : bool, default False
        When exactly ``True``, the column labels themselves are used as x
        positions.
    xticks : sequence, optional
        Explicit x positions, one per plotted column. Only consulted when
        ``use_columns`` is not ``True``.
    colormap : optional
        Forwarded to ``_get_standard_colors``.
    axvlines : bool, default True
        Whether to draw a vertical line at every x position.
    axvlines_kwds : dict, optional
        Keyword arguments for ``ax.axvline``. Defaults to
        ``{"linewidth": 1, "color": "black"}``.
    sort_labels : bool, default False
        When true, the classes and the color values are each sorted before
        being paired up.
    **kwds
        Extra keyword arguments for each ``ax.plot`` call.

    Returns
    -------
    The axes that was drawn on.

    Raises
    ------
    ValueError
        If ``use_columns`` is ``True`` and the column labels are not all
        numeric, if ``xticks`` is not all numeric, or if ``xticks`` does not
        have one entry per plotted column.
    """
    import matplotlib.pyplot as plt

    if axvlines_kwds is None:
        axvlines_kwds = {"linewidth": 1, "color": "black"}

    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]

    if cols is None:
        df = frame.drop(class_column, axis=1)
    else:
        df = frame[cols]

    used_legends = set()

    ncols = len(df.columns)

    # determine values to use for xticks
    if use_columns is True:
        if not np.all(np.isreal(list(df.columns))):
            raise ValueError("Columns must be numeric to be used as xticks")
        x = df.columns
    elif xticks is not None:
        if not np.all(np.isreal(xticks)):
            raise ValueError("xticks specified must be numeric")
        elif len(xticks) != ncols:
            raise ValueError("Length of xticks must match number of columns")
        x = xticks
    else:
        x = list(range(ncols))

    if ax is None:
        ax = plt.gca()

    color_values = _get_standard_colors(
        num_colors=len(classes), colormap=colormap, color_type="random", color=color
    )

    if sort_labels:
        classes = sorted(classes)
        color_values = sorted(color_values)
    colors = dict(zip(classes, color_values))

    for i in range(n):
        y = df.iloc[i].values
        kls = class_col.iat[i]
        label = pprint_thing(kls)
        # Label only the first line of each class so the legend has one
        # entry per class.
        if label not in used_legends:
            used_legends.add(label)
            ax.plot(x, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(x, y, color=colors[kls], **kwds)

    if axvlines:
        for i in x:
            ax.axvline(i, **axvlines_kwds)

    ax.set_xticks(x)
    ax.set_xticklabels(df.columns)
    ax.set_xlim(x[0], x[-1])
    ax.legend(loc="upper right")
    ax.grid()
    return ax


def lag_plot(series, lag=1, ax=None, **kwds):
    """
    Scatter ``series`` against itself shifted by ``lag`` positions.

    Parameters
    ----------
    series : Series
        Source data; ``series.values`` is used.
    lag : int, default 1
        Offset between the x value ``y(t)`` and the y value ``y(t + lag)``.
        A lag of 0 plots the data against itself.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current pyplot axes.
    **kwds
        Extra keyword arguments for ``ax.scatter``. ``c`` defaults to the
        ``patch.facecolor`` rcParam when not supplied.

    Returns
    -------
    The axes that was drawn on.
    """
    # workaround because `c='b'` is hardcoded in matplotlib's scatter method
    import matplotlib.pyplot as plt

    kwds.setdefault("c", plt.rcParams["patch.facecolor"])

    data = series.values
    # ``data[:-0]`` is an empty slice, which would not match ``data[0:]``;
    # with zero lag both coordinates are simply the full data.
    y1 = data[:-lag] if lag else data
    y2 = data[lag:]
    if ax is None:
        ax = plt.gca()
    ax.set_xlabel("y(t)")
    ax.set_ylabel(f"y(t + {lag})")
    ax.scatter(y1, y2, **kwds)
    return ax


def autocorrelation_plot(series, ax=None, **kwds):
    """
    Plot the sample autocorrelation of ``series`` for lags ``1`` to ``n``.

    Horizontal reference lines are drawn at zero and at
    ``+/- z / sqrt(n)`` for the ``z95`` (solid) and ``z99`` (dashed)
    constants defined in the body.

    Parameters
    ----------
    series : Series or array-like
        Source data; converted with ``np.asarray``.
    ax : matplotlib axes, optional
        Axes to draw on. When omitted, the current pyplot axes is used with
        x limits ``(1, n)`` and y limits ``(-1.0, 1.0)``.
    **kwds
        Extra keyword arguments for ``ax.plot``. A legend is added only when
        ``label`` is among them.

    Returns
    -------
    The axes that was drawn on.
    """
    import matplotlib.pyplot as plt

    n = len(series)
    data = np.asarray(series)
    if ax is None:
        # Passing keyword arguments to ``plt.gca`` is deprecated since
        # matplotlib 3.4, so the limits are applied explicitly.
        ax = plt.gca()
        ax.set_xlim(1, n)
        ax.set_ylim(-1.0, 1.0)
    mean = np.mean(data)
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        # Autocovariance at lag ``h`` divided by the lag-0 value ``c0``.
        return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0

    x = np.arange(n) + 1
    y = [r(loc) for loc in x]
    z95 = 1.959963984540054
    z99 = 2.5758293035489004
    ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey")
    ax.axhline(y=z95 / np.sqrt(n), color="grey")
    ax.axhline(y=0.0, color="black")
    ax.axhline(y=-z95 / np.sqrt(n), color="grey")
    ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey")
    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.plot(x, y, **kwds)
    if "label" in kwds:
        ax.legend()
    ax.grid()
    return ax