# noqa: D205,D400
"""
Ensemble reduction.
===================
Ensemble reduction is the process of selecting a subset of members from an ensemble in
order to reduce the volume of computation needed while still covering a good portion of
the simulated climate variability.
"""
import logging
import warnings
from typing import Optional, Tuple, Union

import numpy as np
import scipy.stats
import xarray
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

logger = logging.getLogger("xclim")

# Avoid having to include matplotlib in xclim requirements
try:
    from matplotlib import pyplot as plt

    logger.info("Matplotlib installed. Setting make_graph to True.")
    MPL_INSTALLED = True

except ImportError:
    logger.info("Matplotlib not found. No graph data will be produced.")
    plt = None
    MPL_INSTALLED = False


def kkz_reduce_ensemble(
    data: xarray.DataArray,
    num_select: int,
    *,
    dist_method: str = "euclidean",
    standardize: bool = True,
    **cdist_kwargs,
) -> list:
    """Return a sample of ensemble members using KKZ selection.

    The algorithm selects `num_select` ensemble members spanning the overall range of the ensemble.
    The selection is ordered: smaller groups are always subsets of larger ones for given criteria.
    The first selected member is the one nearest to the centroid of the ensemble; all subsequent members
    are selected so as to maximize the phase-space coverage of the group.
    Algorithm taken from [CannonKKZ]_.

    Parameters
    ----------
    data : xr.DataArray
        Selection criteria data : 2-D xr.DataArray with dimensions 'realization' (N) and 'criteria' (P).
        These are the values used for clustering. Realizations represent the individual original
        ensemble members and criteria the variables/indicators used in the grouping algorithm.
    num_select : int
        The number of members to select.
    dist_method : str
        Any distance metric name accepted by `scipy.spatial.distance.cdist`.
    standardize : bool
        Whether to standardize the input before running the selection or not.
        Standardization consists in translating to a zero mean and scaling to a unit standard deviation.
    cdist_kwargs
        All extra arguments are passed as-is to `scipy.spatial.distance.cdist`, see its docs for more information.

    Returns
    -------
    list
        Selected model indices along the `realization` dimension.

    References
    ----------
    .. [CannonKKZ] Cannon, Alex J. (2015). Selecting GCM Scenarios that Span the Range of Changes in a Multimodel
        Ensemble: Application to CMIP5 Climate Extremes Indices. Journal of Climate, 28(3), 1260-1267.
        https://doi.org/10.1175/JCLI-D-14-00636.1
    .. Katsavounidis, I., Kuo, C.-C. Jay, Zhang, Zhen (1994). A new initialization technique for generalized Lloyd
        iteration. IEEE Signal Processing Letters, 1(10), 144-146. https://doi.org/10.1109/97.329844
    """
    if standardize:
        data = (data - data.mean("realization")) / data.std("realization")

    data = data.transpose("realization", "criteria")
    data["realization"] = np.arange(data.realization.size)

    unselected = list(data.realization.values)
    selected = []

    # Start with the member closest to the ensemble centroid.
    dist0 = cdist(
        data.mean("realization").expand_dims("realization"),
        data,
        metric=dist_method,
        **cdist_kwargs,
    )
    selected.append(unselected.pop(dist0.argmin()))

    for _ in range(1, num_select):
        # For each unselected member, compute the distance to its nearest selected member,
        # then pick the member for which that distance is largest.
        dist = cdist(
            data.isel(realization=selected),
            data.isel(realization=unselected),
            metric=dist_method,
            **cdist_kwargs,
        )
        dist = dist.min(axis=0)
        selected.append(unselected.pop(dist.argmax()))

    return selected
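

# A minimal usage sketch for `kkz_reduce_ensemble` (illustrative only, not part of
# the module): it builds a synthetic 2-D criteria array with the expected
# 'realization' x 'criteria' layout. The `_example_` name is hypothetical.
def _example_kkz_usage():  # pragma: no cover - illustrative sketch
    rng = np.random.default_rng(0)
    crit = xarray.DataArray(
        rng.normal(size=(10, 3)),
        dims=("realization", "criteria"),
        name="selection_criteria",
    )
    # Select the 4 members that best span the ensemble spread.
    return kkz_reduce_ensemble(crit, num_select=4)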

def kmeans_reduce_ensemble(
    data: xarray.DataArray,
    *,
    method: Optional[dict] = None,
    make_graph: bool = MPL_INSTALLED,
    max_clusters: Optional[int] = None,
    variable_weights: Optional[np.ndarray] = None,
    model_weights: Optional[np.ndarray] = None,
    sample_weights: Optional[np.ndarray] = None,
    random_state: Optional[Union[int, np.random.RandomState]] = None,
) -> Tuple[list, np.ndarray, dict]:
    """Return a sample of ensemble members using k-means clustering.

    The algorithm attempts to reduce the total number of ensemble members while maintaining adequate coverage of
    the ensemble uncertainty in an N-dimensional data space. K-means clustering is carried out on the input
    selection criteria data-array in order to group individual ensemble members into a reduced number of similar
    groups. Subsequently, a single representative simulation is retained from each group.

    Parameters
    ----------
    data : xr.DataArray
        Selection criteria data : 2-D xr.DataArray with dimensions 'realization' (N) and 'criteria' (P).
        These are the values used for clustering. Realizations represent the individual original
        ensemble members and criteria the variables/indicators used in the grouping algorithm.
    method : dict, optional
        Dictionary defining the selection method and associated value when required. See Notes.
    max_clusters : Optional[int]
        Maximum number of members to include in the output ensemble selection.
        When using 'rsq_optimize' or 'rsq_cutoff' methods, limit the final selection to a maximum number even if
        method results indicate a higher value. Defaults to N.
    variable_weights : Optional[np.ndarray]
        An array of size P. This weighting can be used to influence the weight of the climate indices
        (criteria dimension) on the clustering itself.
    model_weights : Optional[np.ndarray]
        An array of size N. This weighting can be used to influence which realization is selected
        from within each cluster. This parameter has no influence on the clustering itself.
    sample_weights : Optional[np.ndarray]
        An array of size N. sklearn.cluster.KMeans() sample_weights parameter. This weighting can be
        used to influence the weight of simulations on the clustering itself.
        See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    random_state : Optional[Union[int, np.random.RandomState]]
        sklearn.cluster.KMeans() random_state parameter. Determines random number generation for centroid
        initialization. Use an int to make the randomness deterministic.
        See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    make_graph : bool
        Output a dictionary of inputs for displaying a plot of R² vs. the number of clusters.
        Defaults to True if matplotlib is installed in the runtime environment.

    Notes
    -----
    Parameters for method in call must follow these conventions:

    rsq_optimize
        Calculate the coefficient of determination (R²) of cluster results for n = 1 to N clusters and determine
        an optimal number of clusters that balances cost/benefit tradeoffs. This is the default setting.
        See supporting information S2 text in https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0152495

        method={'rsq_optimize': None}

    rsq_cutoff
        Calculate the coefficient of determination (R²) of cluster results for n = 1 to N clusters and determine
        the minimum number of clusters needed for R² > val.

        val : float between 0 and 1. R² value that must be exceeded by clustering results.

        method={'rsq_cutoff': val}

    n_clusters
        Create a user-determined number of clusters.

        val : integer between 1 and N

        method={'n_clusters': val}

    Returns
    -------
    list
        Selected model indexes (positions)
    np.ndarray
        KMeans clustering results
    dict
        Dictionary of input data for creating the R² profile plot. 'None' when make_graph=False

    References
    ----------
    Casajus et al. 2016. https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0152495

    Examples
    --------
    >>> import xclim
    >>> from xclim.ensembles import create_ensemble, kmeans_reduce_ensemble
    >>> from xclim.indices import hot_spell_frequency

    Start with ensemble datasets for temperature:

    >>> ensTas = create_ensemble(temperature_datasets)

    Calculate selection criteria -- Use annual climate change Δ fields between a future period (2020-2050)
    and a reference period (1990-2019). First, average annual temperature:

    >>> tg = xclim.atmos.tg_mean(tas=ensTas.tas)
    >>> his_tg = tg.sel(time=slice('1990','2019')).mean(dim='time')
    >>> fut_tg = tg.sel(time=slice('2020','2050')).mean(dim='time')
    >>> dtg = fut_tg - his_tg

    Then, hot spell frequency as a second indicator:

    >>> hs = hot_spell_frequency(tasmax=ensTas.tas, window=2, thresh_tasmax='10 degC')
    >>> his_hs = hs.sel(time=slice('1990','2019')).mean(dim='time')
    >>> fut_hs = hs.sel(time=slice('2020','2050')).mean(dim='time')
    >>> dhs = fut_hs - his_hs

    Create a selection criteria xr.DataArray:

    >>> from xarray import concat
    >>> crit = concat((dtg, dhs), dim='criteria')

    Finally, create clusters and select realization ids of the reduced ensemble:

    >>> ids, cluster, fig_data = kmeans_reduce_ensemble(data=crit, method={'rsq_cutoff': 0.9}, random_state=42, make_graph=False)
    >>> ids, cluster, fig_data = kmeans_reduce_ensemble(data=crit, method={'rsq_optimize': None}, random_state=42, make_graph=True)
    """
    if make_graph:
        fig_data = {}
        if max_clusters is not None:
            fig_data["max_clusters"] = max_clusters
    else:
        fig_data = None

    data = data.transpose("realization", "criteria")
    # initialize the variables
    n_sim = np.shape(data)[0]  # number of simulations
    n_idx = np.shape(data)[1]  # number of indicators

    # normalize the data matrix
    z = xarray.DataArray(
        scipy.stats.zscore(data.data, axis=0, ddof=1), coords=data.coords
    )  # ddof=1 to be the same as Matlab's zscore

    if sample_weights is None:
        sample_weights = np.ones(n_sim)
    else:
        # KMeans sample weights of zero cause errors occasionally - set to 1e-15 for now
        sample_weights[sample_weights == 0] = 1e-15
    if model_weights is None:
        model_weights = np.ones(n_sim)
    if variable_weights is None:
        variable_weights = np.ones(shape=(1, n_idx))
    if max_clusters is None:
        max_clusters = n_sim
    if method is None:
        method = {"rsq_optimize": None}

    # normalize the weights (note: I don't know if this is really useful... this was in the MATLAB code)
    sample_weights = sample_weights / np.sum(sample_weights)
    model_weights = model_weights / np.sum(model_weights)
    variable_weights = variable_weights / np.sum(variable_weights)

    z = z * variable_weights
    rsq = _calc_rsq(z, method, make_graph, n_sim, random_state, sample_weights)
    n_clusters = _get_nclust(method, n_sim, rsq, max_clusters)

    if make_graph:
        fig_data["method"] = method
        fig_data["rsq"] = rsq
        fig_data["n_clusters"] = n_clusters
        fig_data["realizations"] = n_sim

    # Final k-means clustering with 1000 initializations (n_init=1000) to avoid
    # instabilities in the choice of final scenarios
    kmeans = KMeans(
        n_clusters=n_clusters, n_init=1000, max_iter=600, random_state=random_state
    )
    # call the 'fit_*' step only once, otherwise everything is computed again
    clusters = kmeans.fit_predict(z, sample_weight=sample_weights)

    # squared distance between each point and each centroid
    d = np.square(kmeans.transform(z))

    out = np.empty(shape=n_clusters)  # prepare an empty array in which to store the results
    r = np.arange(n_sim)

    # in each cluster, find the closest (weighted) simulation and select it
    for i in range(n_clusters):
        # distance to the centroid for all simulations within the cluster 'i'
        d_i = d[clusters == i, i]
        if d_i.shape[0] >= 2:
            if d_i.shape[0] == 2:
                sig = 1
            else:
                # standard deviation of those distances (ddof=1 gives the same as Matlab's std function)
                sig = np.std(d_i, ddof=1)
            # weighted likelihood
            like = scipy.stats.norm.pdf(d_i, 0, sig) * model_weights[clusters == i]
            argmax = np.argmax(like)  # index of the maximum likelihood
        else:
            argmax = 0

        # index of the cluster simulations within the full ensemble
        r_clust = r[clusters == i]
        out[i] = r_clust[argmax]

    out = sorted(out.astype(int))

    return out, clusters, fig_data
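

# A self-contained sketch of `kmeans_reduce_ensemble` on synthetic criteria,
# avoiding the test datasets used in the docstring Examples (illustrative only;
# the `_example_` name is hypothetical, not part of the module).
def _example_kmeans_usage():  # pragma: no cover - illustrative sketch
    rng = np.random.default_rng(42)
    crit = xarray.DataArray(
        rng.normal(size=(25, 4)), dims=("realization", "criteria")
    )
    # Keep the smallest number of clusters explaining at least 90% of the
    # inter-member variance, then return one representative member per cluster.
    ids, clusters, fig_data = kmeans_reduce_ensemble(
        crit, method={"rsq_cutoff": 0.9}, random_state=42, make_graph=False
    )
    return ids, clusters, fig_data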

def _calc_rsq(z, method, make_graph, n_sim, random_state, sample_weights):
    """Sub-function to kmeans_reduce_ensemble. Calculates the R² profile (R² versus number of clusters)."""
    rsq = None
    if list(method.keys())[0] != "n_clusters" or make_graph is True:
        # generate r2 profile data
        sumd = np.zeros(shape=n_sim) + np.nan
        for nclust in range(n_sim):
            # k-means with a limited number of initializations (n_init=15) to keep computation times low
            kmeans = KMeans(
                n_clusters=nclust + 1,
                n_init=15,
                max_iter=300,
                random_state=random_state,
            )
            kmeans = kmeans.fit(z, sample_weight=sample_weights)
            # sum of the squared distances between each simulation and the nearest cluster centroid
            sumd[nclust] = kmeans.inertia_

        # R² of the groups vs. the full ensemble
        rsq = (sumd[0] - sumd) / sumd[0]

    return rsq


def _get_nclust(method=None, n_sim=None, rsq=None, max_clusters=None):
    """Sub-function to kmeans_reduce_ensemble. Determine the number of clusters to create depending on various methods."""
    # if we actually need to find the optimal number of clusters, this is where it is done
    if list(method.keys())[0] == "rsq_cutoff":
        # argmax finds the first occurrence of rsq > rsq_cutoff, but we need to add 1 because of python indexing
        n_clusters = np.argmax(rsq > method["rsq_cutoff"]) + 1
    elif list(method.keys())[0] == "rsq_optimize":
        # create constant benefits curve (one to one)
        onetoone = -1 * (1.0 / (n_sim - 1)) + np.arange(1, n_sim + 1) * (
            1.0 / (n_sim - 1)
        )
        n_clusters = np.argmax(rsq - onetoone) + 1
    elif list(method.keys())[0] == "n_clusters":
        n_clusters = method["n_clusters"]
    else:
        raise ValueError(f"Unknown selection method: {list(method.keys())}")

    if n_clusters > max_clusters:
        warnings.warn(
            f"{n_clusters} clusters has been found to be the optimal number of clusters, but limiting "
            f"to {max_clusters} as required by user-provided max_clusters",
            UserWarning,
            stacklevel=2,
        )
        n_clusters = max_clusters

    return n_clusters
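

# A tiny numeric sketch of the 'rsq_optimize' rule used above, with a
# hypothetical R² profile: the chosen cluster count maximizes the gap between
# the R² curve and a straight line rising from 0 to 1 over n_sim clusters.
def _example_rsq_optimize_rule():  # pragma: no cover - illustrative sketch
    n_sim = 5
    rsq = np.array([0.0, 0.55, 0.85, 0.92, 1.0])  # hypothetical R² profile
    onetoone = -1 * (1.0 / (n_sim - 1)) + np.arange(1, n_sim + 1) * (1.0 / (n_sim - 1))
    # onetoone == [0., 0.25, 0.5, 0.75, 1.]; rsq - onetoone peaks at index 2,
    # so three clusters would be selected here.
    return np.argmax(rsq - onetoone) + 1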

def plot_rsqprofile(fig_data):
    """Create an R² profile plot using kmeans_reduce_ensemble output.

    The R² plot allows evaluation of the proportion of total uncertainty in the original ensemble that is provided
    by the reduced selection.

    Examples
    --------
    >>> from xclim.ensembles import kmeans_reduce_ensemble, plot_rsqprofile
    >>> is_matplotlib_installed()
    >>> crit = xr.open_dataset(path_to_ensemble_file).data
    >>> ids, cluster, fig_data = kmeans_reduce_ensemble(data=crit, method={'rsq_cutoff': 0.9}, random_state=42, make_graph=True)
    >>> plot_rsqprofile(fig_data)
    """
    rsq = fig_data["rsq"]
    n_sim = fig_data["realizations"]
    n_clusters = fig_data["n_clusters"]

    # make a plot of the rsq profile
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, n_sim + 1), rsq, "k-o", label="R²", linewidth=0.8, markersize=4)
    # plt.plot(np.arange(1.5, n_sim + 0.5), np.diff(rsq), 'r', label='ΔR²')
    axes = plt.gca()
    axes.set_xlim([0, fig_data["realizations"]])
    axes.set_ylim([0, 1])
    plt.xlabel("Number of groups")
    plt.ylabel("R²")
    plt.legend(loc="lower right")
    plt.title("R² of groups vs. full ensemble")
    if "rsq_cutoff" in fig_data["method"].keys():
        col = "k--"
        label = f"R² selection > {fig_data['method']['rsq_cutoff']} (n = {n_clusters})"
        if "max_clusters" in fig_data.keys():
            if rsq[n_clusters - 1] < fig_data["method"]["rsq_cutoff"]:
                col = "r--"
                label = (
                    f"R² selection = {rsq[n_clusters - 1].round(2)} (n = {n_clusters}) :"
                    f" Max cluster set to {fig_data['max_clusters']}"
                )
            else:
                label = (
                    f"R² selection > {fig_data['method']['rsq_cutoff']} (n = {n_clusters}) :"
                    f" Max cluster set to {fig_data['max_clusters']}"
                )

        plt.plot(
            (0, n_clusters, n_clusters),
            (rsq[n_clusters - 1], rsq[n_clusters - 1], 0),
            col,
            label=label,
            linewidth=0.75,
        )
        plt.legend(loc="lower right")
    elif "rsq_optimize" in fig_data["method"].keys():
        onetoone = -1 * (1.0 / (n_sim - 1)) + np.arange(1, n_sim + 1) * (
            1.0 / (n_sim - 1)
        )
        plt.plot(
            range(1, n_sim + 1),
            onetoone,
            color=[0.25, 0.25, 0.75],
            label="Theoretical constant increase in R²",
            linewidth=0.5,
        )
        plt.plot(
            range(1, n_sim + 1),
            rsq - onetoone,
            color=[0.75, 0.25, 0.25],
            label="Real benefits (R² - theoretical)",
            linewidth=0.5,
        )
        col = "k--"
        label = f"Optimized R² cost / benefit (n = {n_clusters})"
        if "max_clusters" in fig_data.keys():
            opt = rsq - onetoone
            imax = np.where(opt == opt.max())[0]
            if rsq[n_clusters - 1] < rsq[imax]:
                col = "r--"
                label = (
                    f"R² selection = {rsq[n_clusters - 1].round(2)} (n = {n_clusters}) :"
                    f" Max cluster set to {fig_data['max_clusters']}"
                )

        plt.plot(
            (0, n_clusters, n_clusters),
            (rsq[n_clusters - 1], rsq[n_clusters - 1], 0),
            col,
            linewidth=0.75,
            label=label,
        )
        plt.legend(loc="center right")
    else:
        plt.plot(
            (0, n_clusters, n_clusters),
            (rsq[n_clusters - 1], rsq[n_clusters - 1], 0),
            "k--",
            label=f"n = {n_clusters} (R² selection = {rsq[n_clusters - 1].round(2)})",
            linewidth=0.75,
        )
        plt.legend(loc="lower right")
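

# A sketch of the full graphing workflow on synthetic criteria (illustrative
# only; the `_example_` name is hypothetical). The plot is produced only when
# matplotlib is available, mirroring the MPL_INSTALLED guard above.
def _example_plot_workflow():  # pragma: no cover - illustrative sketch
    rng = np.random.default_rng(0)
    crit = xarray.DataArray(
        rng.normal(size=(20, 3)), dims=("realization", "criteria")
    )
    ids, clusters, fig_data = kmeans_reduce_ensemble(
        crit, method={"rsq_optimize": None}, random_state=0, make_graph=MPL_INSTALLED
    )
    if fig_data is not None:
        plot_rsqprofile(fig_data)
        plt.show()
    return ids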