Source code for probflow.utils.plotting

"""Plotting utilities.

TODO: more info...

----------

"""


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

COLORS = plt.rcParams["axes.prop_cycle"].by_key()["color"]


[docs]def approx_kde(data, bins=500, bw=0.075):
    """A fast approximation to kernel density estimation."""
    stds = 3  # use a gaussian kernel w/ this many std devs
    counts, be = np.histogram(data, bins=bins)
    db = be[1] - be[0]
    pad = 0.5 * bins * bw * stds * db
    pbe = np.arange(db, pad, db)
    x_out = np.concatenate(
        (be[0] - np.flip(pbe), be[0:-1] + np.diff(be), be[-1] + pbe)
    )
    z_pad = np.zeros(pbe.shape[0])
    raw = np.concatenate((z_pad, counts, z_pad))
    k_x = np.linspace(-stds, stds, int(bins * bw * stds))
    kernel = 1.0 / np.sqrt(2.0 * np.pi) * np.exp(-np.square(k_x) / 2.0)
    y_out = np.convolve(raw, kernel, mode="same")
    return x_out, y_out


[docs]def get_next_color(def_color, ix):
    """Get the next color in the color cycle"""
    if def_color is None:
        return COLORS[ix % len(COLORS)]
    elif isinstance(def_color, list):
        return def_color[ix % len(def_color)]
    else:
        return def_color


[docs]def get_ix_label(ix, shape):
    """Get a string representation of the current index"""
    dims = np.zeros(len(shape))
    for d in range(len(shape) - 1, 0, -1):
        prod = np.prod(shape[:d])
        dims[d] = np.floor(ix / prod)
        ix -= dims[d] * prod
    dims[0] = ix
    if len(shape) == 1:
        return str(dims[0].astype("int32"))
    else:
        return str(list(dims.astype("int32")))


[docs]def plot_dist(
    data,
    xlabel="",
    style="fill",
    bins=20,
    ci=0.0,
    bw=0.075,
    alpha=0.4,
    color=None,
    legend=True,
):
    """Plot the distribution of samples.

    Parameters
    ----------
    data : |ndarray|
        Samples to plot.  Should be of size (Nsamples,...)
    xlabel : str
        Label for the x axis
    style : str
        Which style of plot to create.  Available types are:

        * ``'fill'`` - filled density plot (the default)
        * ``'line'`` - line density plot
        * ``'hist'`` - histogram

    bins : int or list or |ndarray|
        Number of bins to use for the histogram (if
        ``kde=False``), or a list or vector of bin edges.
    ci : float between 0 and 1
        Confidence interval to plot.  Default = 0.0 (i.e., not plotted)
    bw : float
        Bandwidth of the kernel density estimate (if using ``style='line'``
        or ``style='fill'``).  Default is 0.075
    alpha : float between 0 and 1
        Transparency of the plot (if ``style``=``'fill'`` or ``'hist'``)
    color : matplotlib color code or list of them
        Color(s) to use to plot the distribution.
        See https://matplotlib.org/tutorials/colors/colors.html
        Default = use the default matplotlib color cycle
    legend : bool
        Whether to show legends for plots with >1 distribution
        Default = True
    """

    # Check inputs
    if ci < 0.0 or ci > 1.0:
        raise ValueError("ci must be between 0 and 1")

    # If 1d make 2d
    if data.ndim == 1:
        data = np.expand_dims(data, 1)

    # Number of datasets
    dims = data.shape[1:]
    Nd = np.prod(dims)

    # Flatten if >1D
    data = np.reshape(data, (data.shape[0], Nd), order="F")

    # Compute confidence intervals
    if ci:
        cis = np.empty((Nd, 2))
        ci0 = 100 * (0.5 - ci / 2.0)
        ci1 = 100 * (0.5 + ci / 2.0)
        for i in range(Nd):
            cis[i, :] = np.percentile(data[:, i], [ci0, ci1])

    # Plot the data
    for i in range(Nd):
        next_color = get_next_color(color, i)
        lab = get_ix_label(i, dims)
        if style == "line":
            px, py = approx_kde(data[:, i], bw=bw)
            plt.plot(px, py, color=next_color, label=lab)
            if ci:
                yci = np.interp(cis[i, :], px, py)
                plt.plot(
                    [cis[i, 0], cis[i, 0]], [0, yci[0]], ":", color=next_color
                )
                plt.plot(
                    [cis[i, 1], cis[i, 1]], [0, yci[1]], ":", color=next_color
                )
        elif style == "fill":
            px, py = approx_kde(data[:, i], bw=bw)
            plt.fill(px, py, facecolor=next_color, alpha=alpha, label=lab)
            if ci:
                k = (px > cis[i, 0]) & (px < cis[i, 1])
                kx = px[k]
                ky = py[k]
                plt.fill(
                    np.concatenate(([kx[0]], kx, [kx[-1]])),
                    np.concatenate(([0], ky, [0])),
                    facecolor=next_color,
                    alpha=alpha,
                )
        elif style == "hist":
            _, be, patches = plt.hist(
                data[:, i], alpha=alpha, bins=bins, color=next_color, label=lab
            )
            if ci:
                k = (data[:, i] > cis[i, 0]) & (data[:, i] < cis[i, 1])
                plt.hist(data[k, i], alpha=alpha, bins=be, color=next_color)
        else:
            raise ValueError("style must be 'fill', 'line', or 'hist'")

    # Only show the legend if there are >1 sample set
    if Nd > 1 and legend:
        plt.legend()

    # Set x axis label, and no y axis or bounding box needed
    plt.xlabel(xlabel)
    plt.gca().get_yaxis().set_visible(False)
    plt.gca().spines["left"].set_visible(False)
    plt.gca().spines["top"].set_visible(False)
    plt.gca().spines["right"].set_visible(False)


[docs]def plot_line(xdata, ydata, xlabel="", ylabel="", fmt="-", color=None):
    """Plot lines.

    Parameters
    ----------
    xdata : |ndarray|
        X values of points to plot.  Should be vector of length ``Nsamples``.
    ydata : |ndarray|
        Y vaules of points to plot.  Should be of size ``(Nsamples,...)``.
    xlabel : str
        Label for the x axis. Default is no x axis label.
    ylabel : str
        Label for the y axis.  Default is no y axis label.
    fmt : str or matplotlib linespec
        Line marker to use.  Default = ``'-'`` (a normal line).
    color : matplotlib color code or list of them
        Color(s) to use to plot the distribution.
        See https://matplotlib.org/tutorials/colors/colors.html
        Default = use the default matplotlib color cycle
    """

    # If 1d make 2d
    if ydata.ndim == 1:
        ydata = np.expand_dims(ydata, 1)

    # Check x and y are the same size
    if xdata.shape[0] != ydata.shape[0]:
        raise ValueError("x and y data do not have same length")

    # Number of datasets
    dims = ydata.shape[1:]
    Nd = np.prod(dims)

    # Flatten if >1D
    ydata = np.reshape(ydata, (ydata.shape[0], Nd), order="F")

    # Plot the data
    for i in range(Nd):
        next_color = get_next_color(color, i)
        lab = get_ix_label(i, dims)
        plt.plot(xdata, ydata[:, i], fmt, color=next_color, label=lab)

    # Only show the legend if there are >1 sample set
    if Nd > 1:
        plt.legend()

    # Set axis labels
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


[docs]def fill_between(xdata, lb, ub, xlabel="", ylabel="", alpha=0.3, color=None):
    """Fill between lines.

    Parameters
    ----------
    xdata : |ndarray|
        X values of points to plot.  Should be vector of length ``Nsamples``.
    lb : |ndarray|
        Lower bound of fill.  Should be of size ``(Nsamples,...)``.
    ub : |ndarray|
        Upper bound of fill.  Should be same size as lb.
    xlabel : str
        Label for the x axis. Default is no x axis label.
    ylabel : str
        Label for the y axis.  Default is no y axis label.
    fmt : str or matplotlib linespec
        Line marker to use.  Default = ``'-'`` (a normal line).
    color : matplotlib color code or list of them
        Color(s) to use to plot the distribution.
        See https://matplotlib.org/tutorials/colors/colors.html
        Default = use the default matplotlib color cycle
    """

    # Check shapes
    if not np.all(lb.shape == ub.shape):
        raise ValueError("lb and ub must have same shape")
    if len(xdata) != lb.shape[0]:
        raise ValueError("xdata does not match shape of lb and ub")

    # If 1d make 2d
    if lb.ndim == 1:
        lb = np.expand_dims(lb, 1)
        ub = np.expand_dims(ub, 1)

    # Number of fills and datasets
    dims = lb.shape[1:]
    Nd = int(np.prod(dims))

    # Flatten if >1D
    lb = np.reshape(lb, (lb.shape[0], Nd), order="F")
    ub = np.reshape(ub, (ub.shape[0], Nd), order="F")

    # Plot the data
    for iD in range(Nd):  # for each dataset,
        next_color = get_next_color(color, iD)
        lab = get_ix_label(iD, dims)
        plt.fill_between(
            xdata,
            lb[:, iD],
            ub[:, iD],
            alpha=alpha,
            facecolor=next_color,
            label=lab,
        )

    # Only show the legend if there are >1 datasets
    if Nd > 1:
        plt.legend()

    # Set x axis label, and no y axis or bounding box needed
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


[docs]def centered_text(text):
    """Display text centered in the figure"""
    plt.gca().text(
        0.5,
        0.5,
        text,
        horizontalalignment="center",
        verticalalignment="center",
        transform=plt.gca().transAxes,
    )


[docs]def plot_discrete_dist(x):
    """Plot histogram of discrete variable"""
    minx = np.min(x)
    maxx = np.max(x)
    be = np.linspace(minx - 0.5, maxx + 0.5, int(maxx - minx + 2))
    bc = np.linspace(minx, maxx, int(maxx - minx + 1))
    xc, _ = np.histogram(x, be)
    xc = xc / xc.sum()  # normalize
    plt.bar(bc, xc)


[docs]def plot_categorical_dist(x):
    """Plot histogram of categorical variable"""
    xc = pd.Series(x.ravel()).value_counts().sort_index()
    xc = xc / xc.sum()  # normalize
    plt.bar(xc.index, xc.values)
    if len(xc.index) < 15:
        plt.xticks(xc.index, [str(e) for e in xc.index])
    else:
        step = int(len(xc.index) / 7)
        plt.xticks(xc.index[::step], [str(e) for e in xc.index[::step]])


[docs]def plot_by(
    x, data, bins=30, func="mean", plot=True, bootstrap=100, ci=0.95, **kwargs
):
    """Compute and plot some function func of data as a function of x.

    Parameters
    ----------
    x : |ndarray|
        Coordinates of data to plot
    data : |ndarray|
        Data to plot by bins of x
    bins : int
        Number of bins to bin x into
    func : callable or str
        Function to apply on elements of data in each x bin.  Can be a
        callable or one of the following str:

        * ``'count'``
        * ``'sum'``
        * ``'mean'``
        * ``'median'``

        Default = ``'mean'``

    plot : bool
        Whether to plot ``data`` as a function of ``x``
        Default = False
    bootstrap : None or int > 0
        Number of bootstrap samples to use for estimating the uncertainty of
        the true coverage.
    ci : list of float between 0 and 1
        Bootstrapped confidence interval percentiles of coverage to show.
    **kwargs
        Additional arguments are passed to plt.plot or fill_between

    Returns
    -------
    x_o : |ndarray|
        ``x`` bin centers
    data_o : |ndarray|
        ``func`` applied to ``data`` values in each ``x`` bin
    """

    # Check types
    if not isinstance(bins, int):
        raise TypeError("bins must be an int")
    if bins < 1:
        raise ValueError("bins must be positive")
    if not isinstance(plot, bool):
        raise TypeError("plot must be True or False")
    if bootstrap is not None and not isinstance(bootstrap, int):
        raise TypeError("bootstrap must be None or an int")
    if isinstance(bootstrap, int) and bootstrap < 1:
        raise ValueError("bootstrap must be > 0")
    if ci < 0.0 or ci > 1.0:
        raise ValueError("ci must be between 0 and 1")

    # Determine what function to use
    if callable(func):
        pass
    elif isinstance(func, str):
        if func == "mean":
            func = np.mean
        elif func == "median":
            func = np.median
        elif func == "count":
            func = len
        else:
            raise ValueError("Unknown function name " + func)
    else:
        raise TypeError("func must be a callable or a function name str")

    # Default color
    if "color" in kwargs:
        color = kwargs["color"]
    else:
        color = COLORS[0]

    # Ensure x is at least 2d
    if x.ndim == 1:
        x = x.reshape(-1, 1)

    # 1 Dimensional
    if x.shape[1] == 1:

        # Create bins over x
        edges = np.linspace(min(x), max(x), int(bins)).flatten()
        edges[-1] += 1e-9
        bin_id = np.digitize(x, edges)
        x_o = (edges[:-1] + edges[1:]) / 2.0  # bin centers

        # Bootstrap estimate coverage uncertainty
        if bootstrap is not None:

            # Compute func for data in each bins
            boots = pd.DataFrame(index=range(1, bins))
            for iB in range(bootstrap):
                ix = np.random.choice(range(data.size), size=data.size)
                boots[str(iB)] = (
                    pd.Series(data[ix].flatten())
                    .groupby(bin_id[ix].flatten())
                    .agg(func)
                )

            # Plot coverage confidence intervals
            ci = np.array(ci)
            ci_lb = 100 * (0.5 - ci / 2.0)
            ci_ub = 100 * (0.5 + ci / 2.0)
            boots = boots.values
            prc_lb = np.nanpercentile(boots, ci_lb, axis=1)
            prc_ub = np.nanpercentile(boots, ci_ub, axis=1)
            plt.fill_between(x_o, prc_lb, prc_ub, alpha=0.3, facecolor=color)

        # Compute func for data in each bins
        data_o = pd.DataFrame(index=range(1, bins))
        data_o["data"] = (
            pd.Series(data.flatten()).groupby(bin_id.flatten()).agg(func)
        )
        data_o = data_o["data"]

        # Plot coverage
        plt.plot(x_o, data_o, **kwargs)

        # Return values
        return x_o, data_o.values

    # 2 Dimensional
    elif x.shape[1] == 2:

        pass
        # TODO

    else:
        raise ValueError("x.shape[1] cannot be >2")