"""Plotting utilities.
TODO: more info...
----------
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
COLORS = plt.rcParams["axes.prop_cycle"].by_key()["color"]
[docs]def approx_kde(data, bins=500, bw=0.075):
"""A fast approximation to kernel density estimation."""
stds = 3 # use a gaussian kernel w/ this many std devs
counts, be = np.histogram(data, bins=bins)
db = be[1] - be[0]
pad = 0.5 * bins * bw * stds * db
pbe = np.arange(db, pad, db)
x_out = np.concatenate(
(be[0] - np.flip(pbe), be[0:-1] + np.diff(be), be[-1] + pbe)
)
z_pad = np.zeros(pbe.shape[0])
raw = np.concatenate((z_pad, counts, z_pad))
k_x = np.linspace(-stds, stds, int(bins * bw * stds))
kernel = 1.0 / np.sqrt(2.0 * np.pi) * np.exp(-np.square(k_x) / 2.0)
y_out = np.convolve(raw, kernel, mode="same")
return x_out, y_out
[docs]def get_next_color(def_color, ix):
"""Get the next color in the color cycle"""
if def_color is None:
return COLORS[ix % len(COLORS)]
elif isinstance(def_color, list):
return def_color[ix % len(def_color)]
else:
return def_color
[docs]def get_ix_label(ix, shape):
"""Get a string representation of the current index"""
dims = np.zeros(len(shape))
for d in range(len(shape) - 1, 0, -1):
prod = np.prod(shape[:d])
dims[d] = np.floor(ix / prod)
ix -= dims[d] * prod
dims[0] = ix
if len(shape) == 1:
return str(dims[0].astype("int32"))
else:
return str(list(dims.astype("int32")))
[docs]def plot_dist(
data,
xlabel="",
style="fill",
bins=20,
ci=0.0,
bw=0.075,
alpha=0.4,
color=None,
legend=True,
):
"""Plot the distribution of samples.
Parameters
----------
data : |ndarray|
Samples to plot. Should be of size (Nsamples,...)
xlabel : str
Label for the x axis
style : str
Which style of plot to create. Available types are:
* ``'fill'`` - filled density plot (the default)
* ``'line'`` - line density plot
* ``'hist'`` - histogram
bins : int or list or |ndarray|
Number of bins to use for the histogram (if
``kde=False``), or a list or vector of bin edges.
ci : float between 0 and 1
Confidence interval to plot. Default = 0.0 (i.e., not plotted)
bw : float
Bandwidth of the kernel density estimate (if using ``style='line'``
or ``style='fill'``). Default is 0.075
alpha : float between 0 and 1
Transparency of the plot (if ``style``=``'fill'`` or ``'hist'``)
color : matplotlib color code or list of them
Color(s) to use to plot the distribution.
See https://matplotlib.org/tutorials/colors/colors.html
Default = use the default matplotlib color cycle
legend : bool
Whether to show legends for plots with >1 distribution
Default = True
"""
# Check inputs
if ci < 0.0 or ci > 1.0:
raise ValueError("ci must be between 0 and 1")
# If 1d make 2d
if data.ndim == 1:
data = np.expand_dims(data, 1)
# Number of datasets
dims = data.shape[1:]
Nd = np.prod(dims)
# Flatten if >1D
data = np.reshape(data, (data.shape[0], Nd), order="F")
# Compute confidence intervals
if ci:
cis = np.empty((Nd, 2))
ci0 = 100 * (0.5 - ci / 2.0)
ci1 = 100 * (0.5 + ci / 2.0)
for i in range(Nd):
cis[i, :] = np.percentile(data[:, i], [ci0, ci1])
# Plot the data
for i in range(Nd):
next_color = get_next_color(color, i)
lab = get_ix_label(i, dims)
if style == "line":
px, py = approx_kde(data[:, i], bw=bw)
plt.plot(px, py, color=next_color, label=lab)
if ci:
yci = np.interp(cis[i, :], px, py)
plt.plot(
[cis[i, 0], cis[i, 0]], [0, yci[0]], ":", color=next_color
)
plt.plot(
[cis[i, 1], cis[i, 1]], [0, yci[1]], ":", color=next_color
)
elif style == "fill":
px, py = approx_kde(data[:, i], bw=bw)
plt.fill(px, py, facecolor=next_color, alpha=alpha, label=lab)
if ci:
k = (px > cis[i, 0]) & (px < cis[i, 1])
kx = px[k]
ky = py[k]
plt.fill(
np.concatenate(([kx[0]], kx, [kx[-1]])),
np.concatenate(([0], ky, [0])),
facecolor=next_color,
alpha=alpha,
)
elif style == "hist":
_, be, patches = plt.hist(
data[:, i], alpha=alpha, bins=bins, color=next_color, label=lab
)
if ci:
k = (data[:, i] > cis[i, 0]) & (data[:, i] < cis[i, 1])
plt.hist(data[k, i], alpha=alpha, bins=be, color=next_color)
else:
raise ValueError("style must be 'fill', 'line', or 'hist'")
# Only show the legend if there are >1 sample set
if Nd > 1 and legend:
plt.legend()
# Set x axis label, and no y axis or bounding box needed
plt.xlabel(xlabel)
plt.gca().get_yaxis().set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
[docs]def plot_line(xdata, ydata, xlabel="", ylabel="", fmt="-", color=None):
"""Plot lines.
Parameters
----------
xdata : |ndarray|
X values of points to plot. Should be vector of length ``Nsamples``.
ydata : |ndarray|
Y vaules of points to plot. Should be of size ``(Nsamples,...)``.
xlabel : str
Label for the x axis. Default is no x axis label.
ylabel : str
Label for the y axis. Default is no y axis label.
fmt : str or matplotlib linespec
Line marker to use. Default = ``'-'`` (a normal line).
color : matplotlib color code or list of them
Color(s) to use to plot the distribution.
See https://matplotlib.org/tutorials/colors/colors.html
Default = use the default matplotlib color cycle
"""
# If 1d make 2d
if ydata.ndim == 1:
ydata = np.expand_dims(ydata, 1)
# Check x and y are the same size
if xdata.shape[0] != ydata.shape[0]:
raise ValueError("x and y data do not have same length")
# Number of datasets
dims = ydata.shape[1:]
Nd = np.prod(dims)
# Flatten if >1D
ydata = np.reshape(ydata, (ydata.shape[0], Nd), order="F")
# Plot the data
for i in range(Nd):
next_color = get_next_color(color, i)
lab = get_ix_label(i, dims)
plt.plot(xdata, ydata[:, i], fmt, color=next_color, label=lab)
# Only show the legend if there are >1 sample set
if Nd > 1:
plt.legend()
# Set axis labels
plt.xlabel(xlabel)
plt.ylabel(ylabel)
[docs]def fill_between(xdata, lb, ub, xlabel="", ylabel="", alpha=0.3, color=None):
"""Fill between lines.
Parameters
----------
xdata : |ndarray|
X values of points to plot. Should be vector of length ``Nsamples``.
lb : |ndarray|
Lower bound of fill. Should be of size ``(Nsamples,...)``.
ub : |ndarray|
Upper bound of fill. Should be same size as lb.
xlabel : str
Label for the x axis. Default is no x axis label.
ylabel : str
Label for the y axis. Default is no y axis label.
fmt : str or matplotlib linespec
Line marker to use. Default = ``'-'`` (a normal line).
color : matplotlib color code or list of them
Color(s) to use to plot the distribution.
See https://matplotlib.org/tutorials/colors/colors.html
Default = use the default matplotlib color cycle
"""
# Check shapes
if not np.all(lb.shape == ub.shape):
raise ValueError("lb and ub must have same shape")
if len(xdata) != lb.shape[0]:
raise ValueError("xdata does not match shape of lb and ub")
# If 1d make 2d
if lb.ndim == 1:
lb = np.expand_dims(lb, 1)
ub = np.expand_dims(ub, 1)
# Number of fills and datasets
dims = lb.shape[1:]
Nd = int(np.prod(dims))
# Flatten if >1D
lb = np.reshape(lb, (lb.shape[0], Nd), order="F")
ub = np.reshape(ub, (ub.shape[0], Nd), order="F")
# Plot the data
for iD in range(Nd): # for each dataset,
next_color = get_next_color(color, iD)
lab = get_ix_label(iD, dims)
plt.fill_between(
xdata,
lb[:, iD],
ub[:, iD],
alpha=alpha,
facecolor=next_color,
label=lab,
)
# Only show the legend if there are >1 datasets
if Nd > 1:
plt.legend()
# Set x axis label, and no y axis or bounding box needed
plt.xlabel(xlabel)
plt.ylabel(ylabel)
[docs]def centered_text(text):
"""Display text centered in the figure"""
plt.gca().text(
0.5,
0.5,
text,
horizontalalignment="center",
verticalalignment="center",
transform=plt.gca().transAxes,
)
[docs]def plot_discrete_dist(x):
"""Plot histogram of discrete variable"""
minx = np.min(x)
maxx = np.max(x)
be = np.linspace(minx - 0.5, maxx + 0.5, int(maxx - minx + 2))
bc = np.linspace(minx, maxx, int(maxx - minx + 1))
xc, _ = np.histogram(x, be)
xc = xc / xc.sum() # normalize
plt.bar(bc, xc)
[docs]def plot_categorical_dist(x):
"""Plot histogram of categorical variable"""
xc = pd.Series(x.ravel()).value_counts().sort_index()
xc = xc / xc.sum() # normalize
plt.bar(xc.index, xc.values)
if len(xc.index) < 15:
plt.xticks(xc.index, [str(e) for e in xc.index])
else:
step = int(len(xc.index) / 7)
plt.xticks(xc.index[::step], [str(e) for e in xc.index[::step]])
[docs]def plot_by(
x, data, bins=30, func="mean", plot=True, bootstrap=100, ci=0.95, **kwargs
):
"""Compute and plot some function func of data as a function of x.
Parameters
----------
x : |ndarray|
Coordinates of data to plot
data : |ndarray|
Data to plot by bins of x
bins : int
Number of bins to bin x into
func : callable or str
Function to apply on elements of data in each x bin. Can be a
callable or one of the following str:
* ``'count'``
* ``'sum'``
* ``'mean'``
* ``'median'``
Default = ``'mean'``
plot : bool
Whether to plot ``data`` as a function of ``x``
Default = False
bootstrap : None or int > 0
Number of bootstrap samples to use for estimating the uncertainty of
the true coverage.
ci : list of float between 0 and 1
Bootstrapped confidence interval percentiles of coverage to show.
**kwargs
Additional arguments are passed to plt.plot or fill_between
Returns
-------
x_o : |ndarray|
``x`` bin centers
data_o : |ndarray|
``func`` applied to ``data`` values in each ``x`` bin
"""
# Check types
if not isinstance(bins, int):
raise TypeError("bins must be an int")
if bins < 1:
raise ValueError("bins must be positive")
if not isinstance(plot, bool):
raise TypeError("plot must be True or False")
if bootstrap is not None and not isinstance(bootstrap, int):
raise TypeError("bootstrap must be None or an int")
if isinstance(bootstrap, int) and bootstrap < 1:
raise ValueError("bootstrap must be > 0")
if ci < 0.0 or ci > 1.0:
raise ValueError("ci must be between 0 and 1")
# Determine what function to use
if callable(func):
pass
elif isinstance(func, str):
if func == "mean":
func = np.mean
elif func == "median":
func = np.median
elif func == "count":
func = len
else:
raise ValueError("Unknown function name " + func)
else:
raise TypeError("func must be a callable or a function name str")
# Default color
if "color" in kwargs:
color = kwargs["color"]
else:
color = COLORS[0]
# Ensure x is at least 2d
if x.ndim == 1:
x = x.reshape(-1, 1)
# 1 Dimensional
if x.shape[1] == 1:
# Create bins over x
edges = np.linspace(min(x), max(x), int(bins)).flatten()
edges[-1] += 1e-9
bin_id = np.digitize(x, edges)
x_o = (edges[:-1] + edges[1:]) / 2.0 # bin centers
# Bootstrap estimate coverage uncertainty
if bootstrap is not None:
# Compute func for data in each bins
boots = pd.DataFrame(index=range(1, bins))
for iB in range(bootstrap):
ix = np.random.choice(range(data.size), size=data.size)
boots[str(iB)] = (
pd.Series(data[ix].flatten())
.groupby(bin_id[ix].flatten())
.agg(func)
)
# Plot coverage confidence intervals
ci = np.array(ci)
ci_lb = 100 * (0.5 - ci / 2.0)
ci_ub = 100 * (0.5 + ci / 2.0)
boots = boots.values
prc_lb = np.nanpercentile(boots, ci_lb, axis=1)
prc_ub = np.nanpercentile(boots, ci_ub, axis=1)
plt.fill_between(x_o, prc_lb, prc_ub, alpha=0.3, facecolor=color)
# Compute func for data in each bins
data_o = pd.DataFrame(index=range(1, bins))
data_o["data"] = (
pd.Series(data.flatten()).groupby(bin_id.flatten()).agg(func)
)
data_o = data_o["data"]
# Plot coverage
plt.plot(x_o, data_o, **kwargs)
# Return values
return x_o, data_o.values
# 2 Dimensional
elif x.shape[1] == 2:
pass
# TODO
else:
raise ValueError("x.shape[1] cannot be >2")