import matplotlib.pyplot as plt
import numpy as np
from probflow.data import DataGenerator, make_generator
from probflow.utils.casting import to_numpy
from probflow.utils.plotting import plot_by, plot_dist
from .model import Model
class ContinuousModel(Model):
"""Abstract base class for probflow models where the dependent variable
(the target) is continuous and 1-dimensional.
The only advantage to using this over the more general :class:`.Model` is
that :class:`ContinuousModel` also includes several methods specific to
continuous models, for tasks such as getting the predictive intervals,
coverage, R-squared value, or calibration metrics (see below for the full
list of methods).
.. admonition:: Only supports scalar dependent variables
Note that the methods of :class:`.ContinuousModel` only support scalar,
continuous dependent variables (not *any* continuous model, as the name
might suggest). For models which have a multidimensional output, just
use the more general :class:`.Model`; for models with categorical
output (i.e., classifiers), use :class:`.CategoricalModel`; and for
models which have a discrete output (e.g. a Poisson regression), use
:class:`.DiscreteModel`.
This class inherits several methods from :class:`.Module`:
* :attr:`~parameters`
* :attr:`~modules`
* :attr:`~trainable_variables`
* :meth:`~kl_loss`
* :meth:`~kl_loss_batch`
* :meth:`~reset_kl_loss`
* :meth:`~add_kl_loss`
as well as several methods from :class:`.Model`:
* :meth:`~log_likelihood`
* :meth:`~train_step`
* :meth:`~fit`
* :meth:`~stop_training`
* :meth:`~set_learning_rate`
* :meth:`~predictive_sample`
* :meth:`~aleatoric_sample`
* :meth:`~epistemic_sample`
* :meth:`~predict`
* :meth:`~metric`
* :meth:`~posterior_mean`
* :meth:`~posterior_sample`
* :meth:`~posterior_ci`
* :meth:`~prior_sample`
* :meth:`~posterior_plot`
* :meth:`~prior_plot`
* :meth:`~log_prob`
* :meth:`~prob`
* :meth:`~save`
* :meth:`~summary`
and adds the following continuous-model-specific methods:
* :meth:`~predictive_interval`
* :meth:`~aleatoric_interval`
* :meth:`~epistemic_interval`
* :meth:`~pred_dist_plot`
* :meth:`~predictive_prc`
* :meth:`~pred_dist_covered`
* :meth:`~pred_dist_coverage`
* :meth:`~coverage_by`
* :meth:`~r_squared`
* :meth:`~r_squared_plot`
* :meth:`~residuals`
* :meth:`~residuals_plot`
* :meth:`~calibration_curve`
* :meth:`~calibration_curve_plot`
* :meth:`~calibration_metric`
* :meth:`~sharpness`
    * :meth:`~dispersion_metric`
Example
-------
    A minimal sketch of a simple linear regression (assuming the standard
    ProbFlow ``Parameter``, ``ScaleParameter``, and ``Normal`` classes, and
    hypothetical data arrays ``x_train``, ``y_train``, and ``x_valid``):
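
    .. code-block:: python3

        import probflow as pf

        class LinearRegression(pf.ContinuousModel):

            def __init__(self):
                self.w = pf.Parameter(name="weight")
                self.b = pf.Parameter(name="bias")
                self.s = pf.ScaleParameter(name="std")

            def __call__(self, x):
                # Observation distribution: Normal(w*x + b, s)
                return pf.Normal(x * self.w() + self.b(), self.s())

        model = LinearRegression()
        model.fit(x_train, y_train)

        # Continuous-model-specific methods are then available, e.g.
        lb, ub = model.predictive_interval(x_valid, ci=0.9)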
"""
def _intervals(self, fn, x, side, ci=0.95, n=1000, batch_size=None):
"""Compute intervals on some type of sample"""
# Compute in batches?
if batch_size is not None:
intervals = [
self._intervals(fn, x_data, side, ci=ci, n=n)
for x_data, y_data in make_generator(
x, test=True, batch_size=batch_size
)
]
            # Concatenate the per-batch results: a (lower, upper) pair of
            # arrays when side == "both", otherwise a single array
            if side == "both":
                return tuple(
                    np.concatenate(e, axis=0) for e in zip(*intervals)
                )
            return np.concatenate(intervals, axis=0)
# No batching (or this is a batch)
samples = fn(x, n=n)
if side == "lower":
return np.percentile(samples, 100 * (1.0 - ci), axis=0)
elif side == "upper":
return np.percentile(samples, 100 * ci, axis=0)
else:
lb = 100 * (1.0 - ci) / 2.0
prcs = np.percentile(samples, [lb, 100.0 - lb], axis=0)
return prcs[0, ...], prcs[1, ...]
    def predictive_interval(
self, x, ci=0.95, side="both", n=1000, batch_size=None
):
r"""Compute confidence intervals on the model's estimate of the target
given ``x``, including all sources of uncertainty.
        These intervals are computed from samples of the full posterior
        predictive distribution, so they reflect both aleatoric and epistemic
        uncertainty. Use ``side`` to request either a two-sided central
        interval or a one-sided lower or upper bound (see below).
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features").
ci : float between 0 and 1
            Inner proportion of the predictive distribution to use as the
confidence interval.
Default = 0.95
side : str {'lower', 'upper', 'both'}
Whether to get the one- or two-sided interval, and which side to
get. If ``'both'`` (default), gets the upper and lower bounds of
the central ``ci`` interval. If ``'lower'``, gets the lower bound
on the one-sided ``ci`` interval. If ``'upper'``, gets the upper
bound on the one-sided ``ci`` interval.
n : int
Number of samples from the posterior predictive distribution to
take to compute the confidence intervals.
Default = 1000
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
lb : |ndarray|
Lower bounds of the ``ci`` confidence intervals on the predictions
for samples in ``x``. Doesn't return this if ``side='upper'``.
ub : |ndarray|
Upper bounds of the ``ci`` confidence intervals on the predictions
for samples in ``x``. Doesn't return this if ``side='lower'``.
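
        Example
        -------

        As a minimal sketch, assuming a fitted model ``model`` and an array
        of validation features ``x_val`` (hypothetical names), a two-sided
        95% interval could be computed with:

        .. code-block:: python3

            lb, ub = model.predictive_interval(x_val, ci=0.95)

        and a one-sided upper bound with:

        .. code-block:: python3

            ub = model.predictive_interval(x_val, ci=0.95, side="upper")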
"""
return self._intervals(
self.predictive_sample, x, side, ci=ci, n=n, batch_size=batch_size
)
    def aleatoric_interval(
self, x, ci=0.95, side="both", n=1000, batch_size=None
):
r"""Compute confidence intervals on the model's estimate of the target
given ``x``, including only aleatoric uncertainty (uncertainty due to
noise).
        These intervals are computed from samples which include only the
        noise component of the uncertainty, so they do not reflect
        uncertainty about the model's parameter values.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features").
ci : float between 0 and 1
            Inner proportion of the predictive distribution to use as the
confidence interval.
Default = 0.95
side : str {'lower', 'upper', 'both'}
Whether to get the one- or two-sided interval, and which side to
get. If ``'both'`` (default), gets the upper and lower bounds of
the central ``ci`` interval. If ``'lower'``, gets the lower bound
on the one-sided ``ci`` interval. If ``'upper'``, gets the upper
bound on the one-sided ``ci`` interval.
n : int
Number of samples from the aleatoric predictive distribution to
take to compute the confidence intervals.
Default = 1000
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
lb : |ndarray|
Lower bounds of the ``ci`` confidence intervals on the predictions
for samples in ``x``. Doesn't return this if ``side='upper'``.
ub : |ndarray|
Upper bounds of the ``ci`` confidence intervals on the predictions
for samples in ``x``. Doesn't return this if ``side='lower'``.
"""
return self._intervals(
self.aleatoric_sample, x, side, ci=ci, n=n, batch_size=batch_size
)
    def epistemic_interval(
self, x, ci=0.95, side="both", n=1000, batch_size=None
):
r"""Compute confidence intervals on the model's estimate of the target
        given ``x``, including only epistemic uncertainty (uncertainty
        arising from uncertainty about the model's parameter values).
        These intervals are computed from predictions made with different
        draws of the parameters from their posteriors, while ignoring the
        noise (aleatoric) component of the predictive distribution.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features").
ci : float between 0 and 1
            Inner proportion of the predictive distribution to use as the
confidence interval.
Default = 0.95
side : str {'lower', 'upper', 'both'}
Whether to get the one- or two-sided interval, and which side to
get. If ``'both'`` (default), gets the upper and lower bounds of
the central ``ci`` interval. If ``'lower'``, gets the lower bound
on the one-sided ``ci`` interval. If ``'upper'``, gets the upper
bound on the one-sided ``ci`` interval.
n : int
Number of samples from the epistemic predictive distribution to
take to compute the confidence intervals.
Default = 1000
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
lb : |ndarray|
Lower bounds of the ``ci`` confidence intervals on the predictions
for samples in ``x``. Doesn't return this if ``side='upper'``.
ub : |ndarray|
Upper bounds of the ``ci`` confidence intervals on the predictions
for samples in ``x``. Doesn't return this if ``side='lower'``.
"""
return self._intervals(
self.epistemic_sample, x, side, ci=ci, n=n, batch_size=batch_size
)
    def pred_dist_plot(
self, x, n=10000, cols=1, individually=False, batch_size=None, **kwargs
):
r"""Plot posterior predictive distribution from the model given ``x``.
        Draws ``n`` samples from the posterior predictive distribution for
        each datapoint in ``x``, and plots the resulting distribution(s).
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features").
n : int
Number of samples to draw from the model given ``x``.
Default = 10000
        cols : int
            Divide the subplots into a grid with this many columns (only used
            if ``individually=True``).
individually : bool
If ``True``, plot one subplot per datapoint in ``x``, otherwise
plot all the predictive distributions on the same plot.
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
**kwargs
Additional keyword arguments are passed to :func:`.plot_dist`
Example
-------
        For instance, assuming a fitted model ``model`` and an array of
        validation features ``x_val`` (hypothetical names), the predictive
        distributions for the first few datapoints could be plotted with:
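
        .. code-block:: python3

            import matplotlib.pyplot as plt

            # One subplot per datapoint, in a 2-column grid
            model.pred_dist_plot(x_val[:4], individually=True, cols=2)
            plt.show()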
"""
# Sample from the predictive distribution
samples = self.predictive_sample(x, n=n, batch_size=batch_size)
# Independent variable must be scalar
Ns = samples.shape[0]
N = samples.shape[1]
if samples.ndim > 2 and any(e > 1 for e in samples.shape[2:]):
raise NotImplementedError(
"only scalar dependent variables are supported"
)
else:
samples = samples.reshape([Ns, N])
# Plot the predictive distributions
if individually:
            rows = int(np.ceil(N / cols))  # subplot indices must be ints
for i in range(N):
plt.subplot(rows, cols, i + 1)
plot_dist(samples[:, i], **kwargs)
plt.xlabel("Predicted dependent variable value for " + str(i))
plt.tight_layout()
else:
plot_dist(samples, **kwargs)
plt.xlabel("Predicted dependent variable value")
def _get_y(self, x, y):
"""Get y, even when x is a DataGenerator and y is None"""
if y is not None:
return y
else:
y_true = [d for _, d in make_generator(x, y, test=True)]
return np.concatenate(to_numpy(y_true), axis=0)
    def predictive_prc(self, x, y=None, n=1000, batch_size=None):
r"""Compute the percentile of each observation along the posterior
predictive distribution.
        That is, for each datapoint, compute the proportion (between 0 and 1)
        of posterior predictive samples which fall below the true value of
        the dependent variable.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series| or Tensor
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of samples to draw from the model given ``x``.
Default = 1000
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
prcs : |ndarray| of float between 0 and 1
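
        Example
        -------

        A minimal sketch, assuming a fitted model ``model`` and validation
        data ``x_val`` and ``y_val`` (hypothetical names):

        .. code-block:: python3

            # Percentile of each observation along its predictive distribution
            prcs = model.predictive_prc(x_val, y_val)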
"""
# Need both x and y data
if y is None and not isinstance(x, DataGenerator):
raise TypeError("need both x and y to compute predictive prc")
# Compute in batches?
if batch_size is not None:
return np.concatenate(
[
self.predictive_prc(x_data, y_data, n=n)
for x_data, y_data in make_generator(
x, y, batch_size=batch_size
)
],
axis=0,
)
# Sample from the predictive distribution
samples = self.predictive_sample(x, n=n, batch_size=batch_size)
# Independent variable must be scalar
if samples.ndim > 2 and any(e > 1 for e in samples.shape[2:]):
raise NotImplementedError(
"only scalar dependent variables are supported"
)
# Reshape
Ns = samples.shape[0]
N = samples.shape[1]
samples = samples.reshape([Ns, N])
y = self._get_y(x, y).reshape([1, N])
# Percentiles of true y data along predictive distribution
prcs = np.argmax(np.sort(samples, axis=0) > y, axis=0) / Ns
# Argmax returns 0 when all samples are less than true value!
prcs[np.reshape(np.max(samples, axis=0) < y, [N])] = 1.0
# Return percentiles
return prcs.reshape([N, 1])
    def pred_dist_covered(
self, x, y=None, n: int = 1000, ci: float = 0.95, batch_size=None
):
r"""Compute whether each observation was covered by a given confidence
interval.
        That is, compute whether the true value of the dependent variable for
        each datapoint falls within the central ``ci`` interval of the
        posterior predictive distribution for that datapoint.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series| or Tensor
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of samples to draw from the model given ``x``.
Default = 1000
ci : float between 0 and 1
Confidence interval to use.
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
        covered : |ndarray| of bool
            Whether the dependent variable value of each datapoint in ``x``
            was covered by the central ``ci`` interval of its posterior
            predictive distribution.
"""
# Check values
if n < 1:
raise ValueError("n must be greater than 0")
if ci < 0.0 or ci > 1.0:
raise ValueError("ci must be between 0 and 1")
# Compute the predictive percentile of each observation
pred_prcs = self.predictive_prc(x, y=y, n=n, batch_size=batch_size)
# Determine what samples fall in the inner ci proportion
lb = (1.0 - ci) / 2.0
ub = 1.0 - lb
return (pred_prcs >= lb) & (pred_prcs < ub)
    def pred_dist_coverage(self, x, y=None, n=1000, ci=0.95, batch_size=None):
r"""Compute what percent of samples are covered by a given confidence
interval.
        That is, compute the proportion of datapoints whose true dependent
        variable values fall within the central ``ci`` interval of the
        posterior predictive distribution.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series| or Tensor
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of samples to draw from the model given ``x``.
Default = 1000
ci : float between 0 and 1
Confidence interval to use.
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
prc_covered : float between 0 and 1
Proportion of the samples which were covered by the predictive
distribution's confidence interval.
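
        Example
        -------

        A rough sketch, assuming a fitted model ``model`` and validation data
        ``x_val`` and ``y_val`` (hypothetical names):

        .. code-block:: python3

            # Fraction of y_val covered by the central 95% predictive interval
            coverage = model.pred_dist_coverage(x_val, y_val, ci=0.95)

        For a well-calibrated model, ``coverage`` should be close to ``ci``.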
"""
return self.pred_dist_covered(
x, y=y, n=n, ci=ci, batch_size=batch_size
).mean()
    def coverage_by(
self,
x_by,
x,
y=None,
n: int = 1000,
ci: float = 0.95,
bins: int = 30,
plot: bool = True,
ideal_line_kwargs: dict = {},
batch_size=None,
**kwargs,
):
r"""Compute and plot the coverage of a given confidence interval
of the posterior predictive distribution as a function of specified
independent variables.
        This can be used to check whether the model's predictive intervals
        are similarly well calibrated across the range of one or more of the
        independent variables.
Parameters
----------
x_by : int or str or list of int or list of str
            Which independent variable(s) to plot the coverage as a function
            of. That is, which columns in ``x`` to plot by.
x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series| or Tensor
Dependent variable values of the dataset to evaluate (aka the
"target").
        n : int
            Number of samples to draw from the model given ``x``.
            Default = 1000
        ci : float between 0 and 1
Inner percentile to find the coverage of. For example, if
``ci=0.95``, will compute the coverage of the inner 95% of the
posterior predictive distribution.
bins : int
Number of bins to use for x_by
ideal_line_kwargs : dict
Dict of args to pass to matplotlib.pyplot.plot for ideal coverage
line.
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
**kwargs
Additional keyword arguments are passed to plot_by
Returns
-------
xo : |ndarray|
Values of x_by corresponding to bin centers.
co : |ndarray|
Coverage of the ``ci`` confidence interval of the predictive
distribution in each bin.
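
        Example
        -------

        A rough sketch, assuming a fitted model ``model`` and validation data
        ``x_val`` and ``y_val`` (hypothetical names), plotting the coverage
        of the 95% predictive interval as a function of the first column of
        ``x_val``:

        .. code-block:: python3

            import matplotlib.pyplot as plt

            xo, co = model.coverage_by(0, x_val, y_val, ci=0.95)
            plt.show()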
"""
# Compute whether each sample was covered by the predictive interval
covered = self.pred_dist_covered(
x, y=y, n=n, ci=ci, batch_size=batch_size
)
# Plot coverage proportion as a fn of x_by cols of x
xo, co = plot_by(x_by, 100 * covered, label="Actual", **kwargs)
        # Ideal-line kwargs (copy to avoid mutating the default/caller's dict)
        ideal_line_kwargs = dict(ideal_line_kwargs)
        if "linestyle" not in ideal_line_kwargs:
            ideal_line_kwargs["linestyle"] = "--"
        if "color" not in ideal_line_kwargs:
            ideal_line_kwargs["color"] = "k"
# Also plot ideal line
plt.axhline(100 * ci, label="Ideal", **ideal_line_kwargs)
plt.legend()
plt.ylabel(str(100 * ci) + "% predictive interval coverage")
plt.xlabel("Independent variable")
return xo, co
    def r_squared(self, x, y=None, n=1000, batch_size=None):
r"""Compute the Bayesian R-squared distribution (Gelman et al., 2018).
        The Bayesian R-squared is the distribution, over draws from the
        parameter posteriors, of the ratio of the variance of the model's
        predictions to the sum of that variance and the residual variance
        (Gelman et al., 2018).
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of posterior draws to use for computing the r-squared
distribution. Default = `1000`.
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
|ndarray|
Samples from the r-squared distribution. Size: ``(num_samples,)``.
Examples
--------
        A minimal sketch, assuming a fitted model ``model`` and validation
        data ``x_val`` and ``y_val`` (hypothetical names):
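
        .. code-block:: python3

            import numpy as np

            # Samples from the Bayesian R-squared distribution
            r2 = model.r_squared(x_val, y_val)
            print("posterior mean R^2:", np.mean(r2))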
References
----------
- Andrew Gelman, Ben Goodrich, Jonah Gabry, & Aki Vehtari.
`R-squared for Bayesian regression models. <https://doi.org/10.1080/00031305.2018.1549100>`_
*The American Statistician*, 2018.
"""
# Get true y values
y_true = self._get_y(x, y)
# Predict y with samples from the posterior distribution
y_pred = self.epistemic_sample(x, n=n, batch_size=batch_size)
# Compute Bayesian R^2
v_fit = np.var(y_pred, axis=1)
v_res = np.var(y_pred - np.expand_dims(y_true, 0), axis=1)
return v_fit / (v_fit + v_res)
    def r_squared_plot(
self, x, y=None, n=1000, style="hist", batch_size=None, **kwargs
):
r"""Plot the Bayesian R-squared distribution.
See :meth:`~r_squared` for more info on the Bayesian R-squared metric.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of posterior draws to use for computing the r-squared
distribution. Default = `1000`.
        style : str
            Style of plot to create (passed to :func:`.plot_dist`).
            Default = ``'hist'``
        batch_size : None or int
            Compute using batches of this many datapoints. Default is `None`
            (i.e., do not use batching).
**kwargs
Additional keyword arguments are passed to :func:`.plot_dist`
Example
-------
        For instance, assuming a fitted model ``model`` and validation data
        ``x_val`` and ``y_val`` (hypothetical names):
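
        .. code-block:: python3

            import matplotlib.pyplot as plt

            model.r_squared_plot(x_val, y_val)
            plt.show()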
"""
r2 = self.r_squared(x, y, n=n, batch_size=batch_size)
plot_dist(r2, style=style, **kwargs)
plt.xlabel("Bayesian R squared")
    def residuals(self, x, y=None, batch_size=None):
r"""Compute the residuals of the model's predictions.
        The residual for each datapoint is the true value of the dependent
        variable minus the model's predicted value (as given by
        :meth:`~predict`).
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
|ndarray|
The residuals.
Example
-------
        A minimal sketch, assuming a fitted model ``model`` and validation
        data ``x_val`` and ``y_val`` (hypothetical names):
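
        .. code-block:: python3

            # Residuals (true - predicted) for the validation data
            res = model.residuals(x_val, y_val)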
"""
y_true = self._get_y(x, y)
y_pred = self.predict(x, batch_size=batch_size)
return y_true - y_pred
    def residuals_plot(self, x, y=None, batch_size=None, **kwargs):
r"""Plot the distribution of residuals of the model's predictions.
        Computes the residuals (true minus predicted values, see
        :meth:`~residuals`) and plots their distribution.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
**kwargs
Additional keyword arguments are passed to :func:`.plot_dist`
Example
-------
        For instance, assuming a fitted model ``model`` and validation data
        ``x_val`` and ``y_val`` (hypothetical names):
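
        .. code-block:: python3

            import matplotlib.pyplot as plt

            model.residuals_plot(x_val, y_val)
            plt.show()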
"""
res = self.residuals(x, y, batch_size=batch_size)
plot_dist(res, **kwargs)
plt.xlabel("Residual (True - Predicted)")
    def calibration_curve(self, x, y, n=1000, resolution=100, batch_size=None):
r"""Compute the regression calibration curve (Kuleshov et al., 2018).
The regression calibration curve compares the empirical cumulative
probability to the cumulative probability predicted by a regression
model (Kuleshov et al., 2018). First, a vector :math:`p` of :math:`m`
        confidence levels is chosen, which correspond to the predicted
cumulative probabilities:
.. math::
0 \leq p_1 \leq p_2 \leq \ldots \leq p_m \leq 1
        Then, a vector of empirical frequencies :math:`\hat{p}` at each of
        those confidence levels is computed using validation data:
.. math::
\hat{p}_j = \frac{1}{N} \sum_{i=1}^N [ P_M(x_i \leq y_i) \leq p_j ]
where :math:`N` is the number of validation datapoints, :math:`P_M(x_i
\leq y_i)` is the model's predicted cumulative probability of datapoint
:math:`i` (i.e., the percentile along the model's predicted probability
distribution at which the true value of :math:`y_i` falls), and
        :math:`\sum_i [ a_i \leq b_i ]` is just the count of elements of
        :math:`a` which are less than or equal to the corresponding elements
        of :math:`b`.
The calibration curve then plots :math:`p` against :math:`\hat{p}`.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of samples to draw from the model for computing the
predictive percentile. Default = 1000
resolution : int
Number of confidence levels to evaluate at. This corresponds to
the :math:`m` parameter in section 3.5 of (Kuleshov et al., 2018).
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
p : |ndarray|
The predicted cumulative frequencies, :math:`p`.
p_hat : |ndarray|
The empirical cumulative frequencies, :math:`\hat{p}`.
Example
-------
Supposing we have some training data (``x_train`` and ``y_train``) and
validation data (``x_val`` and ``y_val``), and have already fit a model
to the training data,
.. code-block:: python3
model = # some ProbFlow model...
model.fit(x_train, y_train)
Then we can compute the calibration curve with
:meth:`~calibration_curve`:
.. code-block:: python3
p_pred, p_empirical = model.calibration_curve(x_val, y_val)
The returned values can be used directly or plotted against one another
to get the calibration curve (as in Figure 3 in Kuleshov et al., 2018)
.. code-block:: python3
import matplotlib.pyplot as plt
plt.plot(p_pred, p_empirical)
Or, even more simply, just use :meth:`~calibration_curve_plot`.
See also
--------
* :meth:`~calibration_curve_plot`
        * :meth:`~calibration_metric`
References
----------
- Volodymyr Kuleshov, Nathan Fenner, and Stefano Ermon.
`Accurate Uncertainties for Deep Learning Using Calibrated Regression
<https://arxiv.org/abs/1807.00263>`_, 2018.
"""
pred_prc = self.predictive_prc(x, y, n=n, batch_size=batch_size)
p = np.linspace(0, 1, resolution + 2)[1:-1]
p_hat = np.array([np.mean(pred_prc < tp) for tp in p])
return p, p_hat
    def calibration_curve_plot(
self, x, y, n=1000, resolution=100, batch_size=None, **kwargs
):
r"""Plot the regression calibration curve.
See :meth:`~calibration_curve` for more info about the regression
calibration curve.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of samples to draw from the model for computing the
predictive percentile. Default = 1000
resolution : int
Number of confidence levels to evaluate at. This corresponds to
the :math:`m` parameter in section 3.5 of (Kuleshov et al., 2018).
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
**kwargs
            Additional keyword arguments are passed to ``matplotlib.pyplot.plot``
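
        Example
        -------

        A minimal sketch, assuming a fitted model ``model`` and validation
        data ``x_val`` and ``y_val`` (hypothetical names):

        .. code-block:: python3

            import matplotlib.pyplot as plt

            model.calibration_curve_plot(x_val, y_val)
            plt.plot([0, 1], [0, 1], "k--")  # ideal calibration line
            plt.show()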
See also
--------
* :meth:`~calibration_curve`
        * :meth:`~calibration_metric`
"""
p, p_hat = self.calibration_curve(
x, y, n=n, resolution=resolution, batch_size=batch_size
)
plt.plot(p, p_hat, **kwargs)
plt.xlabel("Predicted cumulative probability")
plt.ylabel("Empirical cumulative probability")
def _calibration_metric(self, metric: str, p, p_hat):
if metric == "msce":
return np.mean(np.square(p - p_hat))
elif metric == "rmsce":
return np.sqrt(np.mean(np.square(p - p_hat)))
elif metric == "mace":
return np.mean(np.abs(p - p_hat))
elif metric == "ma":
p0 = np.concatenate([[0.0], p, [1.0]])
p0_hat = np.concatenate([[0.0], p_hat, [1.0]])
return np.trapz(np.abs(p0 - p0_hat), p0)
else:
raise ValueError(f"Unknown calibration metric {metric}")
    def calibration_metric(
self, metric, x, y=None, n=1000, resolution=100, batch_size=None
):
r"""Compute one or more of several calibration metrics
Regression calibration metrics measure the error between a model's
regression calibration curve and the ideal calibration curve - i.e.,
what the curve would be if the model were perfectly calibrated (see
`Kuleshov et al., 2018 <https://arxiv.org/abs/1807.00263>`_ and `Chung
et al., 2020 <https://arxiv.org/abs/2011.09588>`_). First, a vector
        :math:`p` of :math:`m` confidence levels is chosen, which correspond
to the predicted cumulative probabilities:
.. math::
0 \leq p_1 \leq p_2 \leq \ldots \leq p_m \leq 1
        Then, a vector of empirical frequencies :math:`\hat{p}` at each of
        those confidence levels is computed using validation data:
.. math::
\hat{p}_j =
\frac{1}{N} \sum_{i=1}^N [ P_M(x_i \leq y_i) \leq p_j ]
where :math:`N` is the number of validation datapoints, :math:`P_M(x_i
\leq y_i)` is the model's predicted cumulative probability of datapoint
:math:`i` (i.e., the percentile along the model's predicted probability
distribution at which the true value of :math:`y_i` falls), and
        :math:`\sum_i [ a_i \leq b_i ]` is just the count of elements of
        :math:`a` which are less than or equal to the corresponding elements
        of :math:`b`.
Various metrics can be computed from these curves to measure how
accurately the regression model captures uncertainty:
The **mean squared calibration error (MSCE)** is the mean squared error
between the empirical and predicted frequencies,
.. math::
MSCE = \frac{1}{m} \sum_{j=1}^m (p_j - \hat{p}_j)^2
The **root mean squared calibration error (RMSCE)** is just the square
root of the MSCE:
.. math::
RMSCE = \sqrt{\frac{1}{m} \sum_{j=1}^m (p_j - \hat{p}_j)^2}
The **mean absolute calibration error (MACE)** is the mean of the
absolute differences between the empirical and predicted frequencies:
.. math::
MACE = \frac{1}{m} \sum_{j=1}^m | p_j - \hat{p}_j |
        And the **miscalibration area (MA)** is the area between the
        calibration curve and the ideal calibration curve (the identity line
        from (0, 0) to (1, 1)):
        .. math::
            MA = \int_0^1 \left| p_x - \hat{p}_x \right| \, dx
        Note that MA equals MACE in the limit as the number of bins (set by
        the ``resolution`` keyword argument) goes to infinity.
To choose which metric to compute, pass the name of the metric
(``msce``, ``rmsce``, ``mace``, or ``ma``) as the first argument to
this function (or a list of them to compute multiple).
See `Kuleshov et al., 2018 <https://arxiv.org/abs/1807.00263>`_, `Chung
et al., 2020 <https://arxiv.org/abs/2011.09588>`_ and the user guide
page on :doc:`/user_guide/evaluating` for discussions of evaluating
uncertainty estimates using calibration metrics, among other metrics.
Note that calibration is generally less important than accuracy, but
        more important than other metrics like :meth:`~sharpness` and any
        :meth:`~dispersion_metric`.
Parameters
----------
metric : str {'msce', 'rmsce', 'mace', or 'ma'} or List[str]
Which metric(s) to compute (see above for the definition of each
metric). To compute multiple metrics, pass a list of the metric
names you'd like to compute. Available metrics are:
* ``msce``: mean squared calibration error
* ``rmsce``: root mean squared calibration error
* ``mace``: mean absolute calibration error
* ``ma``: miscalibration area
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
y : |ndarray| or |DataFrame| or |Series|
Dependent variable values of the dataset to evaluate (aka the
"target").
n : int
Number of samples to draw from the model for computing the
predictive percentile. Default = 1000
resolution : int
Number of confidence levels to evaluate at. This corresponds to
the :math:`m` parameter in section 3.5 of (Kuleshov et al., 2018).
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
float or Dict[str, float]
The requested calibration metric. If a list of metric names was
passed, will return a dict whose keys are the metrics, and whose
values are the corresponding metric values.
Example
-------
Supposing we have some training data (``x_train`` and ``y_train``) and
validation data (``x_val`` and ``y_val``), and have already fit a model
to the training data,
.. code-block:: python3
model = # some ProbFlow model...
model.fit(x_train, y_train)
Then we can compute different calibration metrics using
        :meth:`~calibration_metric`. For example, to compute the mean
squared calibration error (MSCE):
.. code-block:: pycon
>>> model.calibration_metric("msce", x_val, y_val)
0.123
Or, to compute the mean absolute calibration error (MACE):
.. code-block:: pycon
>>> model.calibration_metric("mace", x_val, y_val)
0.211
To compute multiple metrics at the same time, pass a list of metric
names:
.. code-block:: pycon
>>> model.calibration_metric(["msce", "mace"], x_val, y_val)
{"msce": 0.123, "mace": 0.211}
See also
--------
* :meth:`~calibration_curve`
* :meth:`~calibration_curve_plot`
* :meth:`~sharpness`
* :meth:`~dispersion_metric`
References
----------
- Volodymyr Kuleshov, Nathan Fenner, and Stefano Ermon.
`Accurate Uncertainties for Deep Learning Using Calibrated Regression
<https://arxiv.org/abs/1807.00263>`_, 2018.
- Youngseog Chung, Willie Neiswanger, Ian Char, Jeff Schneider.
`Beyond Pinball Loss: Quantile Methods for Calibrated Uncertainty
Quantification <https://arxiv.org/abs/2011.09588>`_, 2020.
"""
p, p_hat = self.calibration_curve(
x, y, n=n, resolution=resolution, batch_size=batch_size
)
if isinstance(metric, list):
return {m: self._calibration_metric(m, p, p_hat) for m in metric}
else:
return self._calibration_metric(metric, p, p_hat)
    def sharpness(self, x, n=1000, batch_size=None):
r"""Compute the sharpness of the model's uncertainty estimates
The "sharpness" of a model's uncertainty estimates is the root mean of
the estimated variances:
.. math::
SHA = \sqrt{\frac{1}{N} \sum_{i=1}^N \text{Var}(\hat{Y}_i)}
See `Tran et al., 2020 <https://arxiv.org/abs/1912.10066>`_ and the
user guide page on :doc:`/user_guide/evaluating` for discussions of
evaluating uncertainty estimates using sharpness, among other metrics.
Note that the sharpness should generally be one of the later things you
consider - accuracy and calibration usually being more important.
Parameters
----------
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
n : int
Number of samples to draw from the model. Default = 1000
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
float
The sharpness of the model's uncertainty estimates
Example
-------
Supposing we have some training data (``x_train`` and ``y_train``) and
validation data (``x_val`` and ``y_val``), and have already fit a model
to the training data,
.. code-block:: python3
model = # some ProbFlow model...
model.fit(x_train, y_train)
Then we can compute the sharpness of our model's predictions with:
.. code-block:: pycon
>>> model.sharpness(x_val)
0.173
See also
--------
* :meth:`~calibration_metric`
* :meth:`~dispersion_metric`
References
----------
- Kevin Tran, Willie Neiswanger, Junwoong Yoon, Qingyang Zhang, Eric
Xing, Zachary W. Ulissi. `Methods for comparing uncertainty
quantifications for material property predictions
<https://arxiv.org/abs/1912.10066>`_, 2020.
"""
samples = self.predictive_sample(x, n=n, batch_size=batch_size)
return np.sqrt(np.mean(np.var(samples, axis=0)))
def _dispersion_metric(self, metric, samples):
stds = np.std(samples, axis=0)
if metric in ["cv", "cov", "coefficient_of_variation"]:
return np.std(stds) / np.mean(stds)
elif metric in ["qcd", "qcod", "quartile_coefficient_of_dispersion"]:
q1 = np.percentile(stds, 25)
q3 = np.percentile(stds, 75)
return (q3 - q1) / (q3 + q1)
else:
raise ValueError(f"Unknown dispersion metric {metric}")
    def dispersion_metric(self, metric, x, n=1000, batch_size=None):
r"""Compute one or more of several calibration metrics
Dispersion metrics measure how much a model's uncertainty estimates
vary. There are several different dispersion metrics:
The **coefficient of variation** (:math:`C_v`) is the ratio of the
standard deviation to the mean (of the model's uncertainty standard
deviations):
        .. math::
            C_v = \frac{\text{Std}(\sigma_1, \ldots, \sigma_N)}{\text{Mean}(\sigma_1, \ldots, \sigma_N)}
        where :math:`\sigma_i` is the standard deviation of the posterior
        predictive distribution for datapoint :math:`i`.
        The **quartile coefficient of dispersion** (:math:`QCD`) is less
        sensitive to outliers: it is the ratio of the difference between the
        first and third quartiles (of the model's uncertainty standard
        deviations) to their sum:
.. math::
QCD = \frac{Q_3 - Q_1}{Q_3 + Q_1}
See `Tran et al., 2020 <https://arxiv.org/abs/1912.10066>`_ and the
user guide page on :doc:`/user_guide/evaluating` for discussions of
evaluating uncertainty estimates using dispersion metrics, among other
metrics. Note that dispersion metrics should generally be one of the
last things you consider - accuracy, calibration, and sharpness usually
being more important.
Parameters
----------
        metric : str {'cv' or 'qcd'} or List[str]
            Which dispersion metric to compute (see above): ``'cv'`` for the
            coefficient of variation, or ``'qcd'`` for the quartile
            coefficient of dispersion. To compute multiple metrics, pass a
            list of metric names.
x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
Independent variable values of the dataset to evaluate (aka the
"features"). Or a |DataGenerator| for both x and y.
n : int
Number of samples to draw from the model. Default = 1000
batch_size : None or int
Compute using batches of this many datapoints. Default is `None`
(i.e., do not use batching).
Returns
-------
float or Dict[str, float]
The requested dispersion metric. If a list of metric names was
passed, will return a dict whose keys are the metrics, and whose
values are the corresponding metric values.
Example
-------
Supposing we have some training data (``x_train`` and ``y_train``) and
validation data (``x_val`` and ``y_val``), and have already fit a model
to the training data,
.. code-block:: python3
model = # some ProbFlow model...
model.fit(x_train, y_train)
Then we can compute the coefficient of variation of our model's
predictions with:
.. code-block:: pycon
>>> model.dispersion_metric('cv', x_val)
0.732
Or the quartile coefficient of dispersion with:
.. code-block:: pycon
>>> model.dispersion_metric('qcd', x_val)
0.625
See also
--------
* :meth:`~calibration_metric`
* :meth:`~sharpness`
References
----------
- Kevin Tran, Willie Neiswanger, Junwoong Yoon, Qingyang Zhang, Eric
Xing, Zachary W. Ulissi. `Methods for comparing uncertainty
quantifications for material property predictions
<https://arxiv.org/abs/1912.10066>`_, 2020.
"""
samples = self.predictive_sample(x, n=n, batch_size=batch_size)
if isinstance(metric, list):
return {m: self._dispersion_metric(m, samples) for m in metric}
else:
return self._dispersion_metric(metric, samples)