
"""
Ensemble Reduction
==================

Ensemble reduction is the process of selecting a subset of members from an ensemble in order to reduce the volume of
computation needed while still covering a good portion of the simulated climate variability.
"""

from __future__ import annotations

from warnings import warn

import numpy as np
import pandas as pd
import scipy.stats
import xarray
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

# Avoid having to include matplotlib in xclim requirements
try:
    from matplotlib import pyplot as plt  # noqa

    MPL_INSTALLED = True
except ImportError:
    plt = None
    MPL_INSTALLED = False


def make_criteria(ds: xarray.Dataset | xarray.DataArray):
    """Reshape the input into a 2D criteria DataArray.

    The reshaping preserves the "realization" dimension but stacks all other dimensions and variables into a new
    "criteria" dimension, as expected by functions :py:func:`xclim.ensembles._reduce.kkz_reduce_ensemble` and
    :py:func:`xclim.ensembles._reduce.kmeans_reduce_ensemble`.

    Parameters
    ----------
    ds : Dataset or DataArray
        Must at least have a "realization" dimension.
        All values are considered independent "criteria" for the ensemble reduction.
        If a Dataset, variables may have different sizes, but all must include the "realization" dimension.

    Returns
    -------
    crit : DataArray
        Same data, reshaped. The old coordinates (and variables) are available as a multi-index.

    Notes
    -----
    One can get back to the original dataset with

    .. code-block:: python

        crit = make_criteria(ds)
        ds2 = crit.unstack("criteria").to_dataset("variables")

    `ds2` will have all variables with the same dimensions, so if the original dataset had variables with different
    dimensions, the added dimensions are filled with NaNs.
    The `to_dataset` part can be skipped if the original input was a DataArray.
    """

    def _make_crit(da):
        """Variable version: stack the non-realization dims."""
        return da.stack(criteria=set(da.dims) - {"realization"})

    if isinstance(ds, xarray.Dataset):
        # When variables have different sets of dims, a missing dim on one variable results in duplicated values
        # when a simple stack is done. To avoid that: stack each variable independently, adding a new "variables" dim.
        stacked = {
            da.name: _make_crit(da.expand_dims(variables=[da.name]))
            for da in ds.data_vars.values()
        }
        # Get the names of all stacked coords
        stacked_coords = set.union(
            *[set(da.indexes["criteria"].names) for da in stacked.values()]
        )
        # Concat the variables, dropping the old stacked index and related coords
        crit = xarray.concat(
            [
                da.reset_index("criteria").drop_vars(stacked_coords, errors="ignore")
                for da in stacked.values()
            ],
            "criteria",
        )
        # Reconstruct proper stacked coordinates. When a variable is missing one of the coords, fill with NaNs.
        coords = [
            (
                crd,
                np.concatenate(
                    [
                        (
                            da[crd].values
                            if crd in da.coords
                            else [np.nan] * da.criteria.size
                        )
                        for da in stacked.values()
                    ],
                ),
            )
            for crd in stacked_coords
        ]
        crit = crit.assign_coords(
            xarray.Coordinates.from_pandas_multiindex(
                pd.MultiIndex.from_arrays(
                    [arr for name, arr in coords], names=[name for name, arr in coords]
                ),
                "criteria",
            )
        )
        # The previous ops kept the first variable's attributes; replace them with the original dataset's.
        crit.attrs = ds.attrs
    else:
        # Easy peasy, skip all the convoluted stuff
        crit = _make_crit(ds)
    return crit.rename("criteria")
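
# Illustrative sketch (not part of the original module): how `make_criteria` flattens a small
# synthetic Dataset with mixed dimensions into the 2-D (realization, criteria) array expected by
# the reduction functions below. All names in this helper are hypothetical, for demonstration only.
def _example_make_criteria():  # pragma: no cover
    ds = xarray.Dataset(
        {
            "tg_delta": (("realization", "lat"), np.random.rand(5, 3)),
            "hs_delta": (("realization", "season"), np.random.rand(5, 4)),
        },
        coords={"lat": [40, 45, 50], "season": ["DJF", "MAM", "JJA", "SON"]},
    )
    crit = make_criteria(ds)
    # `crit` has dims ("realization", "criteria"), here 5 x 7: "tg_delta" contributes one
    # criterion per latitude (3) and "hs_delta" one per season (4). Coords that a variable
    # lacks (e.g. "season" for "tg_delta") are NaN in the criteria multi-index.
    return crit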
def kkz_reduce_ensemble(
    data: xarray.DataArray,
    num_select: int,
    *,
    dist_method: str = "euclidean",
    standardize: bool = True,
    **cdist_kwargs,
) -> list:
    r"""Return a sample of ensemble members using KKZ selection.

    The algorithm selects `num_select` ensemble members spanning the overall range of the ensemble.
    The selection is ordered: smaller groups are always subsets of larger ones for given criteria.
    The first selected member is the one nearest to the centroid of the ensemble; all subsequent members are selected
    in a way maximizing the phase-space coverage of the group.
    Algorithm taken from :cite:t:`cannon_selecting_2015`.

    Parameters
    ----------
    data : xr.DataArray
        Selection criteria data : 2-D xr.DataArray with dimensions 'realization' (N) and 'criteria' (P).
        These are the values used for clustering. Realizations represent the individual original ensemble members
        and criteria the variables/indicators used in the grouping algorithm.
    num_select : int
        The number of members to select.
    dist_method : str
        Any distance metric name accepted by `scipy.spatial.distance.cdist`.
    standardize : bool
        Whether to standardize the input before running the selection.
        Standardization consists in translating the data so it has a zero mean and scaling it so it has a unit
        standard deviation.
    \*\*cdist_kwargs
        All extra arguments are passed as-is to `scipy.spatial.distance.cdist`; see its docs for more information.

    Returns
    -------
    list
        Selected model indices along the `realization` dimension.

    References
    ----------
    :cite:cts:`cannon_selecting_2015,katsavounidis_new_1994`
    """
    if standardize:
        data = (data - data.mean("realization")) / data.std("realization")

    data = data.transpose("realization", "criteria")
    data["realization"] = np.arange(data.realization.size)

    unselected = list(data.realization.values)
    selected = []

    # The first member is the one closest to the ensemble centroid
    dist0 = cdist(
        data.mean("realization").expand_dims("realization"),
        data,
        metric=dist_method,
        **cdist_kwargs,
    )
    selected.append(unselected.pop(dist0.argmin()))

    # Each subsequent member maximizes the minimal distance to the already-selected set
    for _ in range(1, num_select):
        dist = cdist(
            data.isel(realization=selected),
            data.isel(realization=unselected),
            metric=dist_method,
            **cdist_kwargs,
        )
        dist = dist.min(axis=0)
        selected.append(unselected.pop(dist.argmax()))
    return selected
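
# Illustrative sketch (not part of the original module): KKZ selection on random criteria,
# demonstrating the nesting property stated in the docstring. Names here are hypothetical.
def _example_kkz():  # pragma: no cover
    rng = np.random.default_rng(0)
    crit = xarray.DataArray(
        rng.normal(size=(25, 4)), dims=("realization", "criteria")
    )
    ids = kkz_reduce_ensemble(crit, num_select=5)
    # Selections are nested: the first 3 picks of a 5-member selection
    # are identical to a 3-member selection on the same criteria.
    assert ids[:3] == kkz_reduce_ensemble(crit, num_select=3)
    return ids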
def kmeans_reduce_ensemble(
    data: xarray.DataArray,
    *,
    method: dict | None = None,
    make_graph: bool = MPL_INSTALLED,
    max_clusters: int | None = None,
    variable_weights: np.ndarray | None = None,
    model_weights: np.ndarray | None = None,
    sample_weights: np.ndarray | None = None,
    random_state: int | np.random.RandomState | None = None,
) -> tuple[list, np.ndarray, dict]:
    """Return a sample of ensemble members using k-means clustering.

    The algorithm attempts to reduce the total number of ensemble members while maintaining adequate coverage of
    the ensemble uncertainty in an N-dimensional data space. K-means clustering is carried out on the input
    selection criteria data-array in order to group individual ensemble members into a reduced number of similar
    groups. Subsequently, a single representative simulation is retained from each group.

    Parameters
    ----------
    data : xr.DataArray
        Selection criteria data : 2-D xr.DataArray with dimensions 'realization' (N) and 'criteria' (P).
        These are the values used for clustering. Realizations represent the individual original ensemble members
        and criteria the variables/indicators used in the grouping algorithm.
    method : dict, optional
        Dictionary defining the selection method and its associated value, when required. See Notes.
    max_clusters : int, optional
        Maximum number of members to include in the output ensemble selection.
        When using the 'rsq_optimize' or 'rsq_cutoff' methods, limit the final selection to a maximum number even if
        the method results indicate a higher value. Defaults to N.
    variable_weights : np.ndarray, optional
        An array of size P. This weighting can be used to influence the weight of the climate indices
        (criteria dimension) on the clustering itself.
    model_weights : np.ndarray, optional
        An array of size N. This weighting can be used to influence which realization is selected from within each
        cluster. This parameter has no influence on the clustering itself.
    sample_weights : np.ndarray, optional
        An array of size N. The `sample_weight` parameter of sklearn.cluster.KMeans().
        This weighting can be used to influence the weight of simulations on the clustering itself.
        See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    random_state : int or np.random.RandomState, optional
        The `random_state` parameter of sklearn.cluster.KMeans(). Determines random number generation for centroid
        initialization. Use to make the randomness deterministic.
        See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    make_graph : bool
        Whether to output a dictionary of data needed to plot R² vs. the number of clusters.
        Defaults to True if matplotlib is installed in the runtime environment.

    Notes
    -----
    Parameters for `method` must follow these conventions:

    rsq_optimize
        Calculate the coefficient of determination (R²) of the cluster results for n = 1 to N clusters and determine
        an optimal number of clusters that balances cost/benefit tradeoffs. This is the default setting.
        See supporting information S2 text in :cite:t:`casajus_objective_2016`.

        method={'rsq_optimize': None}

    rsq_cutoff
        Calculate the coefficient of determination (R²) of the cluster results for n = 1 to N clusters and determine
        the minimum number of clusters needed for R² > val.

        val : float between 0 and 1. R² value that must be exceeded by the clustering results.

        method={'rsq_cutoff': val}

    n_clusters
        Create a user-determined number of clusters.

        val : integer between 1 and N

        method={'n_clusters': val}

    Returns
    -------
    list
        Selected model indexes (positions).
    np.ndarray
        KMeans clustering results.
    dict
        Dictionary of input data for creating the R² profile plot. `None` when make_graph=False.

    References
    ----------
    :cite:cts:`casajus_objective_2016`

    Examples
    --------
    .. code-block:: python

        import xclim
        from xclim.ensembles import create_ensemble, kmeans_reduce_ensemble
        from xclim.indices import hot_spell_frequency

        # Start with ensemble datasets for temperature:
        ensTas = create_ensemble(temperature_datasets)

        # Calculate selection criteria -- Use annual climate change Δ fields between the
        # 2020-2050 and 1990-2019 normals. First, the average annual temperature:
        tg = xclim.atmos.tg_mean(tas=ensTas.tas)
        his_tg = tg.sel(time=slice("1990", "2019")).mean(dim="time")
        fut_tg = tg.sel(time=slice("2020", "2050")).mean(dim="time")
        dtg = fut_tg - his_tg

        # Then, hot spell frequency as a second indicator:
        hs = hot_spell_frequency(tasmax=ensTas.tas, window=2, thresh_tasmax="10 degC")
        his_hs = hs.sel(time=slice("1990", "2019")).mean(dim="time")
        fut_hs = hs.sel(time=slice("2020", "2050")).mean(dim="time")
        dhs = fut_hs - his_hs

        # Create a selection criteria xr.DataArray:
        from xarray import concat

        crit = concat((dtg, dhs), dim="criteria")

        # Finally, create clusters and select the realization ids of the reduced ensemble:
        ids, cluster, fig_data = kmeans_reduce_ensemble(
            data=crit, method={"rsq_cutoff": 0.9}, random_state=42, make_graph=False
        )
        ids, cluster, fig_data = kmeans_reduce_ensemble(
            data=crit, method={"rsq_optimize": None}, random_state=42, make_graph=True
        )
    """
    if make_graph:
        fig_data = {}
        if max_clusters is not None:
            fig_data["max_clusters"] = max_clusters
    else:
        fig_data = None

    data = data.transpose("realization", "criteria")
    # Initialize the variables
    n_sim = np.shape(data)[0]  # number of simulations
    n_idx = np.shape(data)[1]  # number of indicators

    # Normalize the data matrix
    std = data.std(dim="realization", ddof=1)
    z = (data - data.mean(dim="realization")) / std.where(std > 0)

    if sample_weights is None:
        sample_weights = np.ones(n_sim)
    else:
        # KMeans sample weights of zero cause errors occasionally - set to 1e-15 for now
        sample_weights[sample_weights == 0] = 1e-15
    if model_weights is None:
        model_weights = np.ones(n_sim)
    if variable_weights is None:
        variable_weights = np.ones(shape=(1, n_idx))
    if max_clusters is None:
        max_clusters = n_sim
    if method is None:
        method = {"rsq_optimize": None}

    # Normalize the weights (note: I don't know if this is really useful... this was in the MATLAB code)
    sample_weights = sample_weights / np.sum(sample_weights)
    model_weights = model_weights / np.sum(model_weights)
    variable_weights = variable_weights / np.sum(variable_weights)

    z = z * variable_weights
    rsq = _calc_rsq(z, method, make_graph, n_sim, random_state, sample_weights)
    n_clusters = _get_nclust(method, n_sim, rsq, max_clusters)

    if make_graph:
        fig_data["method"] = method
        fig_data["rsq"] = rsq
        fig_data["n_clusters"] = n_clusters
        fig_data["realizations"] = n_sim

    # Final k-means clustering with 1000 random initializations to avoid instabilities in the choice of final scenarios
    kmeans = KMeans(
        n_clusters=n_clusters, n_init=1000, max_iter=600, random_state=random_state
    )
    # We call a 'fit_' method only once, otherwise it computes everything again
    clusters = kmeans.fit_predict(z, sample_weight=sample_weights)

    # Squared distance between each point and each centroid
    d = np.square(kmeans.transform(z))

    out = np.empty(shape=n_clusters)  # prepare an empty array in which to store the results
    r = np.arange(n_sim)

    # In each cluster, find the closest (weighted) simulation and select it
    for i in range(n_clusters):
        # Distance to the centroid for all simulations within the cluster 'i'
        d_i = d[clusters == i, i]
        if d_i.shape[0] >= 2:
            if d_i.shape[0] == 2:
                sig = 1
            else:
                # Standard deviation of those distances (ddof=1 gives the same result as MATLAB's std function)
                sig = np.std(d_i, ddof=1)
            # Weighted likelihood
            like = scipy.stats.norm.pdf(d_i, 0, sig) * model_weights[clusters == i]
            argmax = np.argmax(like)  # index of the maximum likelihood
        else:
            argmax = 0
        # Index of the cluster's simulations within the full ensemble
        r_clust = r[clusters == i]
        out[i] = r_clust[argmax]

    out = sorted(out.astype(int))
    return out, clusters, fig_data
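
# Illustrative sketch (not part of the original module): k-means reduction of a synthetic
# 25-member ensemble with a fixed number of clusters. Names here are hypothetical.
def _example_kmeans():  # pragma: no cover
    rng = np.random.default_rng(42)
    crit = xarray.DataArray(
        rng.normal(size=(25, 4)), dims=("realization", "criteria")
    )
    ids, clusters, fig_data = kmeans_reduce_ensemble(
        crit, method={"n_clusters": 5}, random_state=42, make_graph=False
    )
    # One representative member per cluster: 5 sorted positions along 'realization',
    # plus the cluster label of each of the 25 original members.
    assert len(ids) == 5 and len(clusters) == 25
    return ids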
def _calc_rsq(z, method, make_graph, n_sim, random_state, sample_weights):
    """Sub-function to kmeans_reduce_ensemble. Calculates the R² profile (R² versus the number of clusters)."""
    rsq = None
    if list(method.keys())[0] != "n_clusters" or make_graph is True:
        # Generate the R² profile data
        sumd = np.zeros(shape=n_sim) + np.nan
        for nclust in range(n_sim):
            # This is k-means with only 15 initializations, to limit the computation time
            kmeans = KMeans(
                n_clusters=nclust + 1,
                n_init=15,
                max_iter=300,
                random_state=random_state,
            )
            kmeans = kmeans.fit(z, sample_weight=sample_weights)
            # Sum of the squared distances between each simulation and the nearest cluster centroid
            sumd[nclust] = kmeans.inertia_

        # R² of the groups vs. the full ensemble
        rsq = (sumd[0] - sumd) / sumd[0]

    return rsq
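
# Illustrative sketch (not part of the original module): the R² above is 1 - inertia(n) / inertia(1),
# i.e. the fraction of the total ensemble spread explained by the clustering. A toy check with
# hypothetical within-cluster sums of squares for n = 1..4:
def _example_rsq_formula():  # pragma: no cover
    sumd = np.array([100.0, 40.0, 10.0, 0.0])
    rsq = (sumd[0] - sumd) / sumd[0]
    # One cluster explains nothing (R² = 0); n = N clusters explain everything (R² = 1).
    assert rsq[0] == 0.0 and rsq[-1] == 1.0
    return rsq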
def _get_nclust(method=None, n_sim=None, rsq=None, max_clusters=None):
    """Sub-function to kmeans_reduce_ensemble. Determine the number of clusters to create depending on the chosen method."""
    # If we actually need to find the optimal number of clusters, this is where it is done
    if list(method.keys())[0] == "rsq_cutoff":
        # argmax finds the first occurrence of rsq > rsq_cutoff, but we need to add 1 because of python indexing
        n_clusters = np.argmax(rsq > method["rsq_cutoff"]) + 1
    elif list(method.keys())[0] == "rsq_optimize":
        # Create the constant benefits curve (one-to-one)
        onetoone = -1 * (1.0 / (n_sim - 1)) + np.arange(1, n_sim + 1) * (
            1.0 / (n_sim - 1)
        )
        n_clusters = np.argmax(rsq - onetoone) + 1
    elif list(method.keys())[0] == "n_clusters":
        n_clusters = method["n_clusters"]
    else:
        raise ValueError(f"Unknown selection method : {list(method.keys())}")
    if n_clusters > max_clusters:
        warn(
            f"{n_clusters} clusters were found to be the optimal number of clusters, but limiting "
            f"to {max_clusters} as required by the user-provided max_clusters",
            UserWarning,
            stacklevel=2,
        )
        n_clusters = max_clusters
    return n_clusters
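
# Illustrative sketch (not part of the original module): the 'rsq_optimize' rule picks the cluster
# count where the R² profile most exceeds a straight "constant benefit" line rising from 0 (1 cluster)
# to 1 (N clusters). With a hypothetical toy profile, the arithmetic is easy to follow:
def _example_rsq_optimize():  # pragma: no cover
    rsq = np.array([0.0, 0.55, 0.75, 0.85, 0.92, 0.97, 1.0])  # hypothetical R² for n = 1..7
    n = _get_nclust(method={"rsq_optimize": None}, n_sim=7, rsq=rsq, max_clusters=7)
    # onetoone = [0, 1/6, 2/6, ...]; rsq - onetoone peaks at n = 3 (0.75 - 2/6 ≈ 0.417).
    assert n == 3
    return n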
def plot_rsqprofile(fig_data):
    """Create an R² profile plot using kmeans_reduce_ensemble output.

    The R² plot allows the evaluation of the proportion of total uncertainty in the original ensemble that is
    provided by the reduced selection.

    Examples
    --------
    >>> from xclim.ensembles import kmeans_reduce_ensemble, plot_rsqprofile
    >>> is_matplotlib_installed()
    >>> crit = xr.open_dataset(path_to_ensemble_file).data
    >>> ids, cluster, fig_data = kmeans_reduce_ensemble(
    ...     data=crit, method={"rsq_cutoff": 0.9}, random_state=42, make_graph=True
    ... )
    >>> plot_rsqprofile(fig_data)
    """
    rsq = fig_data["rsq"]
    n_sim = fig_data["realizations"]
    n_clusters = fig_data["n_clusters"]
    # Make a plot of the R² profile
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, n_sim + 1), rsq, "k-o", label="R²", linewidth=0.8, markersize=4)
    # plt.plot(np.arange(1.5, n_sim + 0.5), np.diff(rsq), 'r', label='ΔR²')
    axes = plt.gca()
    axes.set_xlim([0, fig_data["realizations"]])
    axes.set_ylim([0, 1])
    plt.xlabel("Number of groups")
    plt.ylabel("R²")
    plt.legend(loc="lower right")
    plt.title("R² of groups vs. full ensemble")
    if "rsq_cutoff" in fig_data["method"].keys():
        col = "k--"
        label = f"R² selection > {fig_data['method']['rsq_cutoff']} (n = {n_clusters})"
        if "max_clusters" in fig_data.keys():
            if rsq[n_clusters - 1] < fig_data["method"]["rsq_cutoff"]:
                col = "r--"
                label = (
                    f"R² selection = {rsq[n_clusters - 1].round(2)} (n = {n_clusters}) :"
                    f" Max cluster set to {fig_data['max_clusters']}"
                )
            else:
                label = (
                    f"R² selection > {fig_data['method']['rsq_cutoff']} (n = {n_clusters}) :"
                    f" Max cluster set to {fig_data['max_clusters']}"
                )
        plt.plot(
            (0, n_clusters, n_clusters),
            (rsq[n_clusters - 1], rsq[n_clusters - 1], 0),
            col,
            label=label,
            linewidth=0.75,
        )
        plt.legend(loc="lower right")
    elif "rsq_optimize" in fig_data["method"].keys():
        onetoone = -1 * (1.0 / (n_sim - 1)) + np.arange(1, n_sim + 1) * (
            1.0 / (n_sim - 1)
        )
        plt.plot(
            range(1, n_sim + 1),
            onetoone,
            color=[0.25, 0.25, 0.75],
            label="Theoretical constant increase in R²",
            linewidth=0.5,
        )
        plt.plot(
            range(1, n_sim + 1),
            rsq - onetoone,
            color=[0.75, 0.25, 0.25],
            label="Real benefits (R² - theoretical)",
            linewidth=0.5,
        )
        col = "k--"
        label = f"Optimized R² cost / benefit (n = {n_clusters})"
        if "max_clusters" in fig_data.keys():
            opt = rsq - onetoone
            imax = np.where(opt == opt.max())[0]
            if rsq[n_clusters - 1] < rsq[imax]:
                col = "r--"
                label = (
                    f"R² selection = {rsq[n_clusters - 1].round(2)} (n = {n_clusters}) :"
                    f" Max cluster set to {fig_data['max_clusters']}"
                )
        plt.plot(
            (0, n_clusters, n_clusters),
            (rsq[n_clusters - 1], rsq[n_clusters - 1], 0),
            col,
            linewidth=0.75,
            label=label,
        )
        plt.legend(loc="center right")
    else:
        plt.plot(
            (0, n_clusters, n_clusters),
            (rsq[n_clusters - 1], rsq[n_clusters - 1], 0),
            "k--",
            label=f"n = {n_clusters} (R² selection = {rsq[n_clusters - 1].round(2)})",
            linewidth=0.75,
        )
        plt.legend(loc="lower right")