Source code for hydromt.stats.skills

"""Implementation of necessary statistical methods."""

import bottleneck
import numpy as np
import xarray as xr


# PERFORMANCE METRICS
[docs] def bias(sim, obs, dim="time"): r"""Return the bias between two time series. .. math:: Bias=\\frac{1}{N}\\sum_{i=1}^{N}(obs_{i}-sim_{i}) Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray bias """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) bias = xr.apply_ufunc(_bias, sim, obs, **kwargs) bias.name = "bias" return bias
[docs] def percentual_bias(sim, obs, dim="time"): r"""Return the percentual bias between two time series. .. math:: PBias= 100 * \\frac{\\sum_{i=1}^{N}(sim_{i}-obs_{i})}{\\sum_{i=1}^{N}(obs_{i})} Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray percentual bias """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) pbias = xr.apply_ufunc(_pbias, sim, obs, **kwargs) pbias.name = "pbias" return pbias
[docs] def volumetric_error(sim, obs, dim="time"): r"""Return the volumetric error between two time series. .. math:: VE= 1 - \\frac{\\sum_{i=1}^{N}(|sim_{i}-obs_{i}|)}{\\sum_{i=1}^{N}(obs_{i})} Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray volumetric error """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) ve = xr.apply_ufunc(_ve, sim, obs, **kwargs) ve.name = "ve" return ve
[docs] def nashsutcliffe(sim, obs, dim="time"): r"""Return the Nash-Sutcliffe model efficiency. Efficiency is based on a simulated and observed time series. .. math:: NSE = 1-\\frac{\\sum_{i=1}^{N}(sim_{i}-obs_{i})^2} {\\sum_{i=1}^{N}(obs_{i}-\\bar{obs})^2} Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the Nash-Sutcliffe model efficiency """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) nse = xr.apply_ufunc(_nse, sim, obs, **kwargs) nse.name = "nse" return nse
[docs] def lognashsutcliffe(sim, obs, epsilon=1e-6, dim="time"): r"""Return the log Nash-Sutcliffe model efficiency. Efficiency calculation is based on simulated and observed time series. .. math:: NSE = 1-\\frac{\\sum_{i=1}^{N}(log(sim_{i})-log(obs_{i}))^2} {\\sum_{i=1}^{N}(log(sim_{i})-log(\\bar{obs})^2}-1)*-1 Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series epsilon : float, optional small value to avoid taking the log of zero (the default is 1e-6) dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the log of the Nash-Sutcliffe model efficiency """ obs = np.log(obs + epsilon) sim = np.log(sim + epsilon) # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) log_nse = xr.apply_ufunc(_nse, sim, obs, **kwargs) log_nse.name = "log_nse" return log_nse
[docs] def pearson_correlation(sim, obs, dim="time"): """Return the Pearson correlation coefficient of two time series. Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the pearson correlation coefficient """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) pearsonr = xr.apply_ufunc(_pearson_correlation, sim, obs, **kwargs) pearsonr.name = "pearson_coef" return pearsonr
[docs] def spearman_rank_correlation(sim, obs, dim="time"): """Return the spearman rank correlation coefficient of two time series. Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the spearman rank correlation """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) spearmanr = xr.apply_ufunc(_spearman_correlation, sim, obs, **kwargs) spearmanr.name = "spearmanr_coef" return spearmanr
[docs] def kge_non_parametric(sim, obs, dim="time"): """Return the Non Parametric Kling-Gupta Efficiency (KGE, 2018). Calculation is based on the two time series with decomposed scores. .. ref: Pool, Vis, and Seibert, 2018 Evaluating model performance: towards a non-parametric variant of the Kling-Gupta efficiency, Hydrological Sciences Journal. Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataSet Non Parametric Kling-Gupta Efficiency (2018) and with decomposed score """ cc = spearman_rank_correlation(sim, obs, dim=dim) cc.name = "kge_np_spearman_rank_correlation_coef" kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) alpha = xr.apply_ufunc(_fdc_alpha, sim, obs, **kwargs) alpha.name = "kge_np_rel_var" beta = sim.sum(dim=dim) / obs.sum(dim=dim) beta.name = "kge_np_bias" kge = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2) kge.name = "kge_np" ds_out = xr.merge([kge, cc, alpha, beta]) return ds_out
[docs] def kge_non_parametric_flood(sim, obs, dim="time"): """Return the Non Parametric Kling-Gupta Efficiency (KGE, 2018) of two time series. this KGE is optimized for flood peaks using Pearson (see Pool et al., 2018). .. ref: Pool, Vis, and Seibert, 2018 Evaluating model performance: towards a non-parametric variant of the Kling-Gupta efficiency, Hydrological Sciences Journal. Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataSet Non Parametric Kling-Gupta Efficiency (2018) optimize for flood peaks using pearson (see Pool et al., 2018) and with decomposed score """ cc = pearson_correlation(sim, obs, dim=dim) cc.name = "kge_np_flood_pearson_coef" kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) alpha = xr.apply_ufunc(_fdc_alpha, sim, obs, **kwargs) alpha.name = "kge_np_flood_rel_var" beta = sim.sum(dim=dim) / obs.sum(dim=dim) beta.name = "kge_np_flood_bias" kge = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2) kge.name = "kge_np_flood" ds_out = xr.merge([kge, cc, alpha, beta]) return ds_out
[docs] def rsquared(sim, obs, dim="time"): r"""Return the coefficient of determination of two time series. .. math:: r^2=(\\frac{\\sum ^n _{i=1}(e_i - \\bar{e})(s_i - \\bar{s})} {\\sqrt{\\sum ^n _{i=1}(e_i - \\bar{e})^2} \\sqrt{\\sum ^n _{i=1}(s_i - \\bar{s})^2}})^2 Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the coefficient of determination """ # R2 equals the square of the Pearson correlation coefficient between obs and sim rsquared = pearson_correlation(sim, obs, dim=dim) ** 2 rsquared.name = "rsquared" return rsquared
[docs] def mse(sim, obs, dim="time"): r"""Return the mean squared error (MSE) between two time series. .. math:: MSE=\\frac{1}{N}\\sum_{i=1}^{N}(e_{i}-s_{i})^2 Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the mean squared error """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) mse = xr.apply_ufunc(_mse, sim, obs, **kwargs) mse.name = "mse" return mse
[docs] def rmse(sim, obs, dim="time"): r"""Return the root mean squared error between two time series. .. math:: RMSE=\\sqrt{\\frac{1}{N}\\sum_{i=1}^{N}(e_{i}-s_{i})^2} Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataArray the root mean squared error """ rmse = np.sqrt(mse(sim, obs, dim=dim)) rmse.name = "rmse" return rmse
[docs] def rsr(sim, obs, dim="time"): r"""Return the RMSE-observations standard deviation (RSR) between two time series. .. math:: RSR=\\frac{RMSE}{STDEVobs} Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series Returns ------- xarray DataArray the root squared error """ # wrap numpy function kwargs = dict( input_core_dims=[[dim], [dim]], dask="parallelized", output_dtypes=[float] ) rsr = xr.apply_ufunc(_rsr, sim, obs, **kwargs) rsr.name = "rsr" return rsr
[docs] def kge(sim, obs, dim="time"): r"""Return the Kling-Gupta Efficiency (KGE) of two time series. .. ref: Gupta, Kling, Yilmaz, Martinez, 2009, Decomposition of the mean squared error and NSE performance criteria: Implications for improving hydrological modelling. Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataSet Kling-Gupta Efficiency and with decomposed scores """ cc = pearson_correlation(sim, obs, dim=dim) cc.name = "kge_pearson_coef" alpha = sim.std(dim=dim) / obs.std(dim=dim) alpha.name = "kge_rel_var" beta = sim.sum(dim=dim) / obs.sum(dim=dim) beta.name = "kge_bias" kge = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2) kge.name = "kge" ds_out = xr.merge([kge, cc, alpha, beta]) return ds_out
[docs] def kge_2012(sim, obs, dim="time"): r"""Return the Kling-Gupta Efficiency (KGE, 2012) of two time series. .. ref: Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of Hydrology, 424, 264-277, doi:10.1016/j.jhydrol.2012.01.011. Parameters ---------- sim : xarray DataArray simulations time series obs : xarray DataArray observations time series dim : str, optional name of time dimension in sim and obs (the default is 'time') Returns ------- xarray DataSet Kling-Gupta Efficiency (2012) and with decomposed scores """ cc = pearson_correlation(sim, obs, dim=dim) cc.name = "kge_2012_pearson_coef" beta = sim.sum(dim=dim) / obs.sum(dim=dim) beta.name = "kge_2012_bias" # divide alpha by bias alpha = (sim.std(dim=dim) * obs.sum(dim=dim)) / ( obs.std(dim=dim) * sim.sum(dim=dim) ) alpha.name = "kge_2012_rel_var" kge_2012 = 1 - np.sqrt((cc - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2) kge_2012.name = "kge_2012" ds_out = xr.merge([kge_2012, cc, alpha, beta]) return ds_out
# correlation ufunc function # from http://xarray.pydata.org/en/stable/dask.html#automatic-parallelization def _covariance(x, y): return np.nanmean( (x - np.nanmean(x, axis=-1, keepdims=True)) * (y - np.nanmean(y, axis=-1, keepdims=True)), axis=-1, ) def _pearson_correlation(x, y): return _covariance(x, y) / (np.nanstd(x, axis=-1) * np.nanstd(y, axis=-1)) def _spearman_correlation(x, y): x_ranks = bottleneck.nanrankdata(x, axis=-1) y_ranks = bottleneck.nanrankdata(y, axis=-1) return _pearson_correlation(x_ranks, y_ranks) # numpy functions def _fdc_alpha(sim, obs, axis=-1): fdc_s = np.sort(sim, axis=axis) / (np.nanmean(sim, axis=axis) * len(sim)) fdc_o = np.sort(obs, axis=axis) / (np.nanmean(obs, axis=axis) * len(obs)) return 1 - 0.5 * np.nansum(np.abs(fdc_s - fdc_o), axis=axis) def _bias(sim, obs, axis=-1): """Bias.""" return np.nansum(sim - obs, axis=axis) / np.nansum(np.isfinite(obs), axis=axis) def _pbias(sim, obs, axis=-1): """Percentual bias.""" return 100 * np.nansum(sim - obs, axis=axis) / np.nansum(obs, axis=axis) def _ve(sim, obs, axis=-1): """Volumetric Error.""" ve = 1 - np.nansum(np.absolute(sim - obs), axis=axis) / np.nansum(obs, axis=axis) return ve def _mse(sim, obs, axis=-1): """Mean squared error.""" mse = np.nansum((obs - sim) ** 2, axis=axis) / np.nansum( np.isfinite(obs), axis=axis ) return mse def _rsr(sim, obs, axis=-1): """RMSE-observations standard deviation.""" rmse = np.sqrt(_mse(sim, obs, axis=axis)) stdev = np.sqrt( np.nansum((obs - np.nanmean(obs, axis=axis)) ** 2, axis=axis) / np.nansum(np.isfinite(obs), axis=axis) ) return rmse / stdev def _nse(sim, obs, axis=-1): """nash-sutcliffe efficiency.""" obs_mean = np.nanmean(obs, axis=axis) a = np.nansum((sim - obs) ** 2, axis=axis) b = np.nansum((obs - obs_mean[..., None]) ** 2, axis=axis) return 1 - a / b