"""Implementation of necessary statistical methods."""
import bottleneck
import numpy as np
import xarray as xr
# PERFORMANCE METRICS
[docs]
def bias(sim, obs, dim="time"):
r"""Return the bias between two time series.
.. math::
Bias=\\frac{1}{N}\\sum_{i=1}^{N}(obs_{i}-sim_{i})
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
bias
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
bias = xr.apply_ufunc(_bias, sim, obs, **kwargs)
bias.name = "bias"
return bias
[docs]
def percentual_bias(sim, obs, dim="time"):
r"""Return the percentual bias between two time series.
.. math::
PBias= 100 * \\frac{\\sum_{i=1}^{N}(sim_{i}-obs_{i})}{\\sum_{i=1}^{N}(obs_{i})}
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
percentual bias
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
pbias = xr.apply_ufunc(_pbias, sim, obs, **kwargs)
pbias.name = "pbias"
return pbias
[docs]
def volumetric_error(sim, obs, dim="time"):
r"""Return the volumetric error between two time series.
.. math::
VE= 1 - \\frac{\\sum_{i=1}^{N}(|sim_{i}-obs_{i}|)}{\\sum_{i=1}^{N}(obs_{i})}
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
volumetric error
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
ve = xr.apply_ufunc(_ve, sim, obs, **kwargs)
ve.name = "ve"
return ve
[docs]
def nashsutcliffe(sim, obs, dim="time"):
r"""Return the Nash-Sutcliffe model efficiency.
Efficiency is based on a simulated and observed time series.
.. math::
NSE = 1-\\frac{\\sum_{i=1}^{N}(sim_{i}-obs_{i})^2}
{\\sum_{i=1}^{N}(obs_{i}-\\bar{obs})^2}
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the Nash-Sutcliffe model efficiency
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
nse = xr.apply_ufunc(_nse, sim, obs, **kwargs)
nse.name = "nse"
return nse
[docs]
def lognashsutcliffe(sim, obs, epsilon=1e-6, dim="time"):
r"""Return the log Nash-Sutcliffe model efficiency.
Efficiency calculation is based on simulated and observed time series.
.. math::
NSE = 1-\\frac{\\sum_{i=1}^{N}(log(sim_{i})-log(obs_{i}))^2}
{\\sum_{i=1}^{N}(log(sim_{i})-log(\\bar{obs})^2}-1)*-1
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
epsilon : float, optional
small value to avoid taking the log of zero (the default is 1e-6)
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the log of the Nash-Sutcliffe model efficiency
"""
obs = np.log(obs + epsilon)
sim = np.log(sim + epsilon)
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
log_nse = xr.apply_ufunc(_nse, sim, obs, **kwargs)
log_nse.name = "log_nse"
return log_nse
[docs]
def pearson_correlation(sim, obs, dim="time"):
"""Return the Pearson correlation coefficient of two time series.
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the pearson correlation coefficient
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
pearsonr = xr.apply_ufunc(_pearson_correlation, sim, obs, **kwargs)
pearsonr.name = "pearson_coef"
return pearsonr
[docs]
def spearman_rank_correlation(sim, obs, dim="time"):
"""Return the spearman rank correlation coefficient of two time series.
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the spearman rank correlation
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
spearmanr = xr.apply_ufunc(_spearman_correlation, sim, obs, **kwargs)
spearmanr.name = "spearmanr_coef"
return spearmanr
[docs]
def kge_non_parametric(sim, obs, dim="time"):
"""Return the Non Parametric Kling-Gupta Efficiency (KGE, 2018).
Calculation is based on the two time series with decomposed scores.
.. ref:
Pool, Vis, and Seibert, 2018 Evaluating model performance: towards
a non-parametric variant of the Kling-Gupta efficiency,
Hydrological Sciences Journal.
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataSet
Non Parametric Kling-Gupta Efficiency (2018) and with decomposed score
"""
cc = spearman_rank_correlation(sim, obs, dim=dim)
cc.name = "kge_np_spearman_rank_correlation_coef"
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
alpha = xr.apply_ufunc(_fdc_alpha, sim, obs, **kwargs)
alpha.name = "kge_np_rel_var"
beta = sim.sum(dim=dim) / obs.sum(dim=dim)
beta.name = "kge_np_bias"
kge = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)
kge.name = "kge_np"
ds_out = xr.merge([kge, cc, alpha, beta])
return ds_out
[docs]
def kge_non_parametric_flood(sim, obs, dim="time"):
"""Return the Non Parametric Kling-Gupta Efficiency (KGE, 2018) of two time series.
this KGE is optimized for flood peaks using Pearson (see Pool et al., 2018).
.. ref:
Pool, Vis, and Seibert, 2018 Evaluating model performance: towards
a non-parametric variant of the Kling-Gupta efficiency,
Hydrological Sciences Journal.
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataSet
Non Parametric Kling-Gupta Efficiency (2018) optimize for flood peaks
using pearson (see Pool et al., 2018) and with decomposed score
"""
cc = pearson_correlation(sim, obs, dim=dim)
cc.name = "kge_np_flood_pearson_coef"
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
alpha = xr.apply_ufunc(_fdc_alpha, sim, obs, **kwargs)
alpha.name = "kge_np_flood_rel_var"
beta = sim.sum(dim=dim) / obs.sum(dim=dim)
beta.name = "kge_np_flood_bias"
kge = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)
kge.name = "kge_np_flood"
ds_out = xr.merge([kge, cc, alpha, beta])
return ds_out
[docs]
def rsquared(sim, obs, dim="time"):
r"""Return the coefficient of determination of two time series.
.. math::
r^2=(\\frac{\\sum ^n _{i=1}(e_i - \\bar{e})(s_i - \\bar{s})}
{\\sqrt{\\sum ^n _{i=1}(e_i - \\bar{e})^2}
\\sqrt{\\sum ^n _{i=1}(s_i - \\bar{s})^2}})^2
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the coefficient of determination
"""
# R2 equals the square of the Pearson correlation coefficient between obs and sim
rsquared = pearson_correlation(sim, obs, dim=dim) ** 2
rsquared.name = "rsquared"
return rsquared
[docs]
def mse(sim, obs, dim="time"):
r"""Return the mean squared error (MSE) between two time series.
.. math::
MSE=\\frac{1}{N}\\sum_{i=1}^{N}(e_{i}-s_{i})^2
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the mean squared error
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
mse = xr.apply_ufunc(_mse, sim, obs, **kwargs)
mse.name = "mse"
return mse
[docs]
def rmse(sim, obs, dim="time"):
r"""Return the root mean squared error between two time series.
.. math::
RMSE=\\sqrt{\\frac{1}{N}\\sum_{i=1}^{N}(e_{i}-s_{i})^2}
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataArray
the root mean squared error
"""
rmse = np.sqrt(mse(sim, obs, dim=dim))
rmse.name = "rmse"
return rmse
[docs]
def rsr(sim, obs, dim="time"):
r"""Return the RMSE-observations standard deviation (RSR) between two time series.
.. math::
RSR=\\frac{RMSE}{STDEVobs}
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
Returns
-------
xarray DataArray
the root squared error
"""
# wrap numpy function
kwargs = dict(
input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float]
)
rsr = xr.apply_ufunc(_rsr, sim, obs, **kwargs)
rsr.name = "rsr"
return rsr
[docs]
def kge(sim, obs, dim="time"):
r"""Return the Kling-Gupta Efficiency (KGE) of two time series.
.. ref:
Gupta, Kling, Yilmaz, Martinez, 2009, Decomposition of the mean
squared error and NSE performance criteria: Implications for improving
hydrological modelling.
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataSet
Kling-Gupta Efficiency and with decomposed scores
"""
cc = pearson_correlation(sim, obs, dim=dim)
cc.name = "kge_pearson_coef"
alpha = sim.std(dim=dim) / obs.std(dim=dim)
alpha.name = "kge_rel_var"
beta = sim.sum(dim=dim) / obs.sum(dim=dim)
beta.name = "kge_bias"
kge = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)
kge.name = "kge"
ds_out = xr.merge([kge, cc, alpha, beta])
return ds_out
[docs]
def kge_2012(sim, obs, dim="time"):
r"""Return the Kling-Gupta Efficiency (KGE, 2012) of two time series.
.. ref:
Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the
upper Danube basin under an ensemble of climate change scenarios.
Journal of Hydrology, 424, 264-277, doi:10.1016/j.jhydrol.2012.01.011.
Parameters
----------
sim : xarray DataArray
simulations time series
obs : xarray DataArray
observations time series
dim : str, optional
name of time dimension in sim and obs (the default is 'time')
Returns
-------
xarray DataSet
Kling-Gupta Efficiency (2012) and with decomposed scores
"""
cc = pearson_correlation(sim, obs, dim=dim)
cc.name = "kge_2012_pearson_coef"
beta = sim.sum(dim=dim) / obs.sum(dim=dim)
beta.name = "kge_2012_bias"
# divide alpha by bias
alpha = (sim.std(dim=dim) * obs.sum(dim=dim)) / (
obs.std(dim=dim) * sim.sum(dim=dim)
)
alpha.name = "kge_2012_rel_var"
kge_2012 = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)
kge_2012.name = "kge_2012"
ds_out = xr.merge([kge_2012, cc, alpha, beta])
return ds_out
# correlation ufunc function
# from http://xarray.pydata.org/en/stable/dask.html#automatic-parallelization
def _covariance(x, y):
return np.nanmean(
(x - np.nanmean(x, axis=-1, keepdims=True))
* (y - np.nanmean(y, axis=-1, keepdims=True)),
axis=-1,
)
def _pearson_correlation(x, y):
return _covariance(x, y) / (np.nanstd(x, axis=-1) * np.nanstd(y, axis=-1))
def _spearman_correlation(x, y):
x_ranks = bottleneck.nanrankdata(x, axis=-1)
y_ranks = bottleneck.nanrankdata(y, axis=-1)
return _pearson_correlation(x_ranks, y_ranks)
# numpy functions
def _fdc_alpha(sim, obs, axis=-1):
fdc_s = np.sort(sim, axis=axis) / (np.nanmean(sim, axis=axis) * len(sim))
fdc_o = np.sort(obs, axis=axis) / (np.nanmean(obs, axis=axis) * len(obs))
return 1 - 0.5 * np.nansum(np.abs(fdc_s - fdc_o), axis=axis)
def _bias(sim, obs, axis=-1):
"""Bias."""
return np.nansum(sim - obs, axis=axis) / np.nansum(np.isfinite(obs), axis=axis)
def _pbias(sim, obs, axis=-1):
"""Percentual bias."""
return 100 * np.nansum(sim - obs, axis=axis) / np.nansum(obs, axis=axis)
def _ve(sim, obs, axis=-1):
"""Volumetric Error."""
ve = 1 - np.nansum(np.absolute(sim - obs), axis=axis) / np.nansum(obs, axis=axis)
return ve
def _mse(sim, obs, axis=-1):
"""Mean squared error."""
mse = np.nansum((obs - sim) ** 2, axis=axis) / np.nansum(
np.isfinite(obs), axis=axis
)
return mse
def _rsr(sim, obs, axis=-1):
"""RMSE-observations standard deviation."""
rmse = np.sqrt(_mse(sim, obs, axis=axis))
stdev = np.sqrt(
np.nansum((obs - np.nanmean(obs, axis=axis)) ** 2, axis=axis)
/ np.nansum(np.isfinite(obs), axis=axis)
)
return rmse / stdev
def _nse(sim, obs, axis=-1):
"""nash-sutcliffe efficiency."""
obs_mean = np.nanmean(obs, axis=axis)
a = np.nansum((sim - obs) ** 2, axis=axis)
b = np.nansum((obs - obs_mean[..., None]) ** 2, axis=axis)
return 1 - a / b