# Source code for earthkit.transforms.climatology._aggregate

# Copyright 2024-, European Centre for Medium Range Weather Forecasts.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import typing as T

import xarray as xr
from earthkit.utils.decorators import format_handler

from earthkit.transforms import _tools
from earthkit.transforms._aggregate import reduce as _reduce
from earthkit.transforms._tools import groupby_time
from earthkit.transforms.temporal import reduce as _temporal_reduce



[docs]
@format_handler()
@_tools.time_dim_decorator
@_tools.groupby_kwargs_decorator(climatology=True)
@_tools.season_order_decorator
def reduce(
    dataarray: xr.Dataset | xr.DataArray,
    time_dim: str | None = None,
    how: str | T.Callable | None = "mean",
    groupby_kwargs: dict | None = None,
    climatology_range: tuple | list | None = None,
    **reduce_kwargs,
):
    """Reduce data over its time dimension, optionally grouped by a time frequency.

    The data is first restricted to `climatology_range` (when both ends are
    given). If `groupby_kwargs` carries a non-None ``"frequency"`` the selection
    is grouped with `groupby_time` before being reduced; otherwise the whole
    selection is reduced along `time_dim` in one go.

    Parameters
    ----------
    dataarray : xarray.DataArray or xarray.Dataset
        Data to reduce. It is indexed and reduced along `time_dim`.
    time_dim : str (optional)
        Name of the time dimension. NOTE(review): the applied decorators
        presumably detect it when it is not supplied - confirm in `_tools`.
    how : str or callable
        Reduction method, passed through unchanged as `how` to
        `earthkit.transforms._aggregate.reduce`. Default is ``"mean"``.
    groupby_kwargs : dict (optional)
        Keyword arguments forwarded to `groupby_time`. Grouping only happens
        when its ``"frequency"`` entry is present and not None.
        NOTE(review): keywords such as `frequency` and `bin_widths` are
        presumably gathered into this dict by the decorators - confirm.
    climatology_range : list or tuple (optional)
        Two items ``(start, end)`` used as ``slice(start, end)`` along
        `time_dim`. If either item is None the range is ignored and the
        full data is used.
    **reduce_kwargs :
        Any other kwargs, forwarded to `earthkit.transforms._aggregate.reduce`.

    Returns
    -------
    The object returned by `earthkit.transforms._aggregate.reduce`.

    Raises
    ------
    ValueError
        If `climatology_range` is not None and cannot be unpacked into exactly
        two items.

    """
    selection = dataarray
    if climatology_range is not None:
        # Unpacking doubles as validation: anything but a two-item iterable fails here.
        try:
            range_start, range_end = climatology_range
        except (TypeError, ValueError) as exc:
            raise ValueError(
                "climatology_range must be a sequence of exactly two items (start, end), or None."
            ) from exc
        # Only a fully specified range narrows the data; a half-open one is ignored.
        if range_start is not None and range_end is not None:
            selection = dataarray.sel({time_dim: slice(range_start, range_end)})

    grouping = groupby_kwargs or {}
    if grouping.get("frequency") is None:
        # No grouping requested: collapse the whole selection along time.
        return _reduce(selection, how=how, dim=time_dim, **reduce_kwargs)

    grouped = groupby_time(selection, time_dim=time_dim, **grouping)
    return _reduce(grouped, how=how, dim=time_dim, **reduce_kwargs)




[docs]
def mean(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the climatological mean.

    Thin wrapper around `reduce` that forces ``how="mean"``; a ``how`` value
    supplied by the caller is overwritten.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `reduce`; the first one is
        the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `reduce` (for example `time_dim`,
        `climatology_range`, `groupby_kwargs` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `reduce` for the forwarded arguments.

    """
    forwarded = {**_kwargs, "how": "mean"}
    return reduce(*_args, **forwarded)




[docs]
def median(*_args, **_kwargs) -> xr.DataArray:
    """Calculate the climatological median.

    Implemented as the 0.5 quantile: `quantiles` is called with ``q=[0.5]``
    and the single entry along the resulting ``quantile`` dimension is
    selected by position.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `quantiles`; the first one
        is the data object.
    **_kwargs :
        Keyword arguments forwarded to `quantiles` (for example `time_dim`,
        `climatology_range` and `groupby_kwargs`). `q` is supplied by this
        wrapper.

    Returns
    -------
    xarray.DataArray
        The first (and only) slice of the `quantiles` result along
        ``quantile``.

    """
    return quantiles(*_args, q=[0.5], **_kwargs).isel(quantile=0)




[docs]
def min(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the climatological minimum.

    Thin wrapper around `reduce` that forces ``how="min"``; a ``how`` value
    supplied by the caller is overwritten.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `reduce`; the first one is
        the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `reduce` (for example `time_dim`,
        `climatology_range`, `groupby_kwargs` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `reduce` for the forwarded arguments.

    """
    forwarded = {**_kwargs, "how": "min"}
    return reduce(*_args, **forwarded)




[docs]
def max(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the climatological maximum.

    Thin wrapper around `reduce` that forces ``how="max"``; a ``how`` value
    supplied by the caller is overwritten.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `reduce`; the first one is
        the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `reduce` (for example `time_dim`,
        `climatology_range`, `groupby_kwargs` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `reduce` for the forwarded arguments.

    """
    forwarded = {**_kwargs, "how": "max"}
    return reduce(*_args, **forwarded)




[docs]
def std(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the climatological standard deviation.

    Thin wrapper around `reduce` that forces ``how="std"``; a ``how`` value
    supplied by the caller is overwritten.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `reduce`; the first one is
        the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `reduce` (for example `time_dim`,
        `climatology_range`, `groupby_kwargs` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `reduce` for the forwarded arguments.

    """
    forwarded = {**_kwargs, "how": "std"}
    return reduce(*_args, **forwarded)




[docs]
def daily_reduce(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Reduce the data to the daily climatology of the provided "how" method.

    Thin wrapper around `reduce` that forces ``frequency="dayofyear"``; a
    ``frequency`` value supplied by the caller is overwritten.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `reduce`; the first one is
        the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `reduce`, including `how` (the
        reduction method; `reduce` defaults it to ``"mean"``), `time_dim`,
        `climatology_range` and extra reduction kwargs.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `reduce` for the forwarded arguments.

    """
    forwarded = {**_kwargs, "frequency": "dayofyear"}
    return reduce(*_args, **forwarded)




[docs]
def daily_mean(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the daily climatological mean.

    Forces ``how="mean"`` (overwriting any caller-supplied ``how``) and
    delegates to `daily_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `daily_reduce`; the first
        one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `daily_reduce` (for example `time_dim`
        and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `daily_reduce` for the forwarded arguments.

    """
    return daily_reduce(*_args, **{**_kwargs, "how": "mean"})




[docs]
def daily_median(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the daily climatological median.

    Forces ``how="median"`` (overwriting any caller-supplied ``how``) and
    delegates to `daily_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `daily_reduce`; the first
        one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `daily_reduce` (for example `time_dim`
        and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `daily_reduce` for the forwarded arguments.

    """
    return daily_reduce(*_args, **{**_kwargs, "how": "median"})




[docs]
def daily_min(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the daily climatological min.

    Forces ``how="min"`` (overwriting any caller-supplied ``how``) and
    delegates to `daily_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `daily_reduce`; the first
        one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `daily_reduce` (for example `time_dim`
        and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `daily_reduce` for the forwarded arguments.

    """
    return daily_reduce(*_args, **{**_kwargs, "how": "min"})




[docs]
def daily_max(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the daily climatological max.

    Forces ``how="max"`` (overwriting any caller-supplied ``how``) and
    delegates to `daily_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `daily_reduce`; the first
        one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `daily_reduce` (for example `time_dim`
        and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `daily_reduce` for the forwarded arguments.

    """
    return daily_reduce(*_args, **{**_kwargs, "how": "max"})




[docs]
def daily_std(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the daily climatological standard deviation.

    Forces ``how="std"`` (overwriting any caller-supplied ``how``) and
    delegates to `daily_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `daily_reduce`; the first
        one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `daily_reduce` (for example `time_dim`
        and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `daily_reduce` for the forwarded arguments.

    """
    return daily_reduce(*_args, **{**_kwargs, "how": "std"})




[docs]
def monthly_reduce(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Reduce the data to the monthly climatology of the provided "how" method.

    Thin wrapper around `reduce` that forces ``frequency="month"``; a
    ``frequency`` value supplied by the caller is overwritten.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `reduce`; the first one is
        the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `reduce`, including `how` (the
        reduction method; `reduce` defaults it to ``"mean"``), `time_dim`,
        `climatology_range` and extra reduction kwargs.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `reduce` for the forwarded arguments.

    """
    forwarded = {**_kwargs, "frequency": "month"}
    return reduce(*_args, **forwarded)




[docs]
def monthly_mean(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the monthly climatological mean.

    Forces ``how="mean"`` (overwriting any caller-supplied ``how``) and
    delegates to `monthly_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `monthly_reduce`; the
        first one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `monthly_reduce` (for example
        `time_dim` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `monthly_reduce` for the forwarded arguments.

    """
    return monthly_reduce(*_args, **{**_kwargs, "how": "mean"})




[docs]
def monthly_median(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the monthly climatological median.

    Forces ``how="median"`` (overwriting any caller-supplied ``how``) and
    delegates to `monthly_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `monthly_reduce`; the
        first one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `monthly_reduce` (for example
        `time_dim` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `monthly_reduce` for the forwarded arguments.

    """
    return monthly_reduce(*_args, **{**_kwargs, "how": "median"})




[docs]
def monthly_min(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the monthly climatological min.

    Forces ``how="min"`` (overwriting any caller-supplied ``how``) and
    delegates to `monthly_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `monthly_reduce`; the
        first one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `monthly_reduce` (for example
        `time_dim` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `monthly_reduce` for the forwarded arguments.

    """
    return monthly_reduce(*_args, **{**_kwargs, "how": "min"})




[docs]
def monthly_max(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the monthly climatological max.

    Forces ``how="max"`` (overwriting any caller-supplied ``how``) and
    delegates to `monthly_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `monthly_reduce`; the
        first one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `monthly_reduce` (for example
        `time_dim` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `monthly_reduce` for the forwarded arguments.

    """
    return monthly_reduce(*_args, **{**_kwargs, "how": "max"})




[docs]
def monthly_std(*_args, **_kwargs) -> xr.Dataset | xr.DataArray:
    """Calculate the monthly climatological standard deviation.

    Forces ``how="std"`` (overwriting any caller-supplied ``how``) and
    delegates to `monthly_reduce`.

    Parameters
    ----------
    *_args :
        Positional arguments forwarded unchanged to `monthly_reduce`; the
        first one is the data object to reduce.
    **_kwargs :
        Keyword arguments forwarded to `monthly_reduce` (for example
        `time_dim` and extra reduction kwargs).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        The result of `monthly_reduce` for the forwarded arguments.

    """
    return monthly_reduce(*_args, **{**_kwargs, "how": "std"})




[docs]
@format_handler()
@_tools.time_dim_decorator
@_tools.groupby_kwargs_decorator(climatology=True)
@_tools.season_order_decorator
def quantiles(
    dataarray: xr.Dataset | xr.DataArray,
    q: float | list,
    time_dim: str | None = None,
    groupby_kwargs: dict | None = None,
    climatology_range: tuple | list | None = None,
    **reduce_kwargs,
) -> xr.DataArray:
    """Calculate a set of climatological quantiles.

    The data is restricted to `climatology_range` (when both ends are given),
    grouped with `groupby_time`, and each requested quantile is computed per
    group along `time_dim`. The per-quantile results are concatenated along a
    ``quantile`` dimension.

    Parameters
    ----------
    dataarray : xarray.DataArray or xarray.Dataset
        Data over which to calculate the climatological quantiles. It is
        indexed, rechunked and reduced along `time_dim`.
    q : float | list
        The quantile, or list/tuple of quantiles, to calculate. Any value that
        is not a list or tuple is treated as a single quantile.
    time_dim : str (optional)
        Name of the time dimension. NOTE(review): the applied decorators
        presumably detect it when it is not supplied - confirm in `_tools`.
    groupby_kwargs : dict (optional)
        Keyword arguments forwarded to `groupby_time`. If it has no
        ``"frequency"`` key, ``"year"`` is used. The dict passed by the caller
        is not modified.
    climatology_range : list or tuple (optional)
        Two items ``(start, end)`` used as ``slice(start, end)`` along
        `time_dim`. If either item is None the range is ignored and the
        full data is used.
    **reduce_kwargs :
        Any other kwargs, forwarded to the grouped object's ``quantile`` method.

    Returns
    -------
    xarray.DataArray
        Per-quantile results concatenated along ``quantile``, in the order the
        quantiles were given.

    Raises
    ------
    ValueError
        If `climatology_range` is not None and cannot be unpacked into exactly
        two items.

    """
    # Validate and normalize climatology_range if provided
    if climatology_range is not None:
        try:
            start, end = climatology_range  # expect exactly two items
        except (TypeError, ValueError) as exc:
            raise ValueError(
                "climatology_range must be a sequence of exactly two items (start, end), or None."
            ) from exc
        climatology_range = (start, end)

    # Only a fully specified range narrows the data; otherwise use everything.
    if climatology_range is not None and all(c_r is not None for c_r in climatology_range):
        selection = dataarray.sel({time_dim: slice(*climatology_range)})
    else:
        selection = dataarray

    # Build a fresh dict so a caller-supplied groupby_kwargs is never mutated;
    # an explicit "frequency" entry still takes precedence over the default.
    grouping = {"frequency": "year", **(groupby_kwargs or {})}

    # Chunk size -1 keeps the whole time dimension in a single chunk before grouping.
    grouped_data = groupby_time(selection.chunk({time_dim: -1}), time_dim=time_dim, **grouping)

    if not isinstance(q, (list, tuple)):
        q = [q]
    results = [grouped_data.quantile(q=quantile, dim=time_dim, **reduce_kwargs) for quantile in q]
    return xr.concat(results, dim="quantile")




[docs]
def percentiles(
    dataarray: xr.Dataset | xr.DataArray,
    p: float | list,
    **_kwargs,
) -> xr.DataArray:
    """Calculate a set of climatological percentiles.

    Each percentile is scaled by ``1e-2`` into a quantile and passed to
    `quantiles`. The ``quantile`` dimension of the result is then relabelled
    as ``percentile``, carrying the original percentile values, and the
    ``quantile`` variable is dropped.

    Parameters
    ----------
    dataarray : xarray.DataArray or xarray.Dataset
        Data over which to calculate the climatological percentiles.
    p : float | list
        The percentile, or list/tuple of percentiles, to calculate. Any value
        that is not a list or tuple is treated as a single percentile.
    **_kwargs :
        Keyword arguments forwarded unchanged to `quantiles` (for example
        `time_dim`, `climatology_range` and `groupby_kwargs`).

    Returns
    -------
    xarray.DataArray
        The `quantiles` result indexed by ``percentile`` instead of
        ``quantile``.

    """
    percentile_values = p if isinstance(p, (list, tuple)) else [p]
    quantile_values = [value * 1e-2 for value in percentile_values]
    by_quantile = quantiles(dataarray, quantile_values, **_kwargs)
    # Attach the percentile labels to the existing axis, make them the
    # dimension coordinate, then discard the now-redundant quantile variable.
    return (
        by_quantile.assign_coords(percentile=("quantile", percentile_values))
        .swap_dims({"quantile": "percentile"})
        .drop_vars("quantile")
    )




[docs]
@format_handler()
def anomaly(
    dataarray: xr.Dataset | xr.DataArray,
    climatology: xr.Dataset | xr.DataArray,
    **_kwargs,
) -> xr.Dataset | xr.DataArray:
    """Calculate the anomaly from a reference climatology.

    A DataArray is handed straight to `_anomaly_dataarray`. For a Dataset,
    `_anomaly_dataarray` is applied to every data variable in turn and the
    results are collected in a new Dataset that carries the input's
    attributes; each result is stored under the name of the returned
    DataArray.

    Parameters
    ----------
    dataarray : xarray.DataArray or xarray.Dataset
        Data for which to calculate the anomaly.
    climatology : xarray.DataArray or xarray.Dataset
        Reference climatology, passed unchanged to `_anomaly_dataarray` for
        every variable.
    **_kwargs :
        Keyword arguments forwarded unchanged to `_anomaly_dataarray` (its
        signature includes `time_dim`, `groupby_kwargs`, `relative`,
        `climatology_how_tag` and `how_label`).

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        A Dataset when the input is a Dataset, otherwise the result of
        `_anomaly_dataarray`.

    """
    if not isinstance(dataarray, xr.Dataset):
        return _anomaly_dataarray(dataarray, climatology, **_kwargs)

    # Start from an empty Dataset that keeps the global attributes of the input.
    anomalies = xr.Dataset().assign_attrs(dataarray.attrs)
    for name in dataarray.data_vars:
        variable_anomaly = _anomaly_dataarray(dataarray[name], climatology, **_kwargs)
        # Key by the returned name, which may differ from the input variable name.
        anomalies[variable_anomaly.name] = variable_anomaly
    return anomalies



@_tools.time_dim_decorator
@_tools.groupby_kwargs_decorator(climatology=True)
@_tools.season_order_decorator
def _anomaly_dataarray(
    dataarray: xr.DataArray,
    climatology: xr.Dataset | xr.DataArray,
    time_dim: str | None = None,
    groupby_kwargs: dict | None = None,
    relative: bool = False,
    climatology_how_tag: str = "",
    how_label: str | None = None,
    **reduce_kwargs,
) -> xr.DataArray:
    """Calculate the anomaly of a single DataArray from a reference climatology.

    Parameters
    ----------
    dataarray : xarray.DataArray
        The DataArray over which to calculate the anomaly from the reference
        climatology. Must contain a time dimension indicated by time_dim.
    climatology : xarray.DataArray or xarray.Dataset
        Reference climatology data against which the anomaly is to be calculated.
        If a Dataset is provided, the variable to use is selected by matching the
        name of `dataarray`, see `climatology_how_tag`.
    time_dim : str (optional)
        Name of the time dimension in the data object, default behaviour is to detect the
        time dimension from the input object
    groupby_kwargs : dict (optional)
        Any other kwargs that are accepted by `earthkit.transforms.aggregate.groupby_time`.
        The `frequency` entry controls the frequency of the returned anomaly:
        if absent (or None) no temporal reduction is applied; the special value
        `climatology` is replaced by the frequency deduced from the dimensions of the
        climatology. The dictionary provided by the caller is not modified.
    relative : bool (optional)
        Return the relative anomaly, i.e. the percentage change w.r.t the climatological period
    climatology_how_tag : str (optional)
        Tag to identify the climatology variable in the climatology dataset, the variable
        looked up is `<name of dataarray>_<climatology_how_tag>`.
    how_label : str (optional)
        Label to append to the variable name of the anomaly dataarray
    **reduce_kwargs :
        Any other kwargs, passed to the temporal reduction. `how` defaults to "mean".

    Returns
    -------
    xarray.DataArray

    Raises
    ------
    KeyError
        If several variables of the climatology dataset match the name of `dataarray`
        and `climatology_how_tag` does not identify one of them.
    ValueError
        If no variable of the climatology dataset matches the name of `dataarray`.

    """
    reduce_kwargs.setdefault("how", "mean")
    var_name = dataarray.name
    if isinstance(climatology, xr.Dataset):
        if var_name in climatology:
            climatology_da = climatology[var_name]
        else:
            # No exact match: fall back to variables whose name contains the name of the dataarray
            potential_clim_vars = [c_var for c_var in climatology.data_vars if str(var_name) in str(c_var)]
            if len(potential_clim_vars) == 1:
                climatology_da = climatology[potential_clim_vars[0]]
            elif f"{var_name}_{climatology_how_tag}" in potential_clim_vars:
                climatology_da = climatology[f"{var_name}_{climatology_how_tag}"]
            elif len(potential_clim_vars) > 1:
                raise KeyError(
                    "Multiple potential climatologies found in climatology dataset, "
                    "please identify appropriate statistic with `climatology_how_tag`.\n"
                    f"Potential climatology variables found: {potential_clim_vars}"
                )
            else:
                raise ValueError(
                    "Could not find a variable in the climatology dataset that matches "
                    f"the name of the anomaly dataarray: {var_name}"
                )
    else:
        climatology_da = climatology

    # The frequency of the climatology is deduced from its dimensions; if none of the
    # known climatology frequencies is a dimension, it is treated as a "year" climatology.
    # This is somewhat hardcoded, but it is best practice, so for now it can stay here
    for clim_freq in _tools.VALID_CLIMATOLOGY_FREQUENCIES:
        if clim_freq in climatology_da.dims:
            break
    else:
        clim_freq = "year"

    # Work on a copy: the frequency may be overwritten below and the same dictionary
    # object can be shared between calls (e.g. one call per variable of a Dataset)
    groupby_kwargs = dict(groupby_kwargs or {})
    if groupby_kwargs.get("frequency") == "climatology":
        groupby_kwargs["frequency"] = clim_freq

    frequency = groupby_kwargs.get("frequency")
    # groupby_time is always called with the frequency of the climatology, hence the
    # requested output frequency must not be forwarded to it
    clim_groupby_kwargs = {k: v for k, v in groupby_kwargs.items() if k != "frequency"}

    if frequency == "year":
        # Annual anomalies are simpler: reduce the data first, then take the difference
        anomaly_array = (
            _temporal_reduce(dataarray, time_dim=time_dim, **groupby_kwargs, **reduce_kwargs) - climatology_da
        )
        if relative:
            anomaly_array = (anomaly_array / climatology_da) * 100.0
    else:
        if clim_freq == "year":
            # The climatology has no sub-annual dimension, so we can just take the difference
            anomaly_array = dataarray - climatology_da
            if relative:
                anomaly_array = (anomaly_array / climatology_da) * 100.0
        else:
            # Need to group the dataarray to the same frequency as the climatology before taking
            # the difference, and then broadcast back to the original dataarray dimensions
            anomaly_array = (
                groupby_time(dataarray, time_dim=time_dim, frequency=clim_freq, **clim_groupby_kwargs) - climatology_da
            )
            if relative:
                anomaly_array = (
                    groupby_time(anomaly_array, time_dim=time_dim, frequency=clim_freq, **clim_groupby_kwargs)
                    / climatology_da
                    * 100.0
                )
            # Safeguard to make sure the anomaly has the dimensions of the input data
            anomaly_array = anomaly_array.broadcast_like(dataarray)

        # Only reduce in time if an output frequency was requested, otherwise the anomaly
        # is returned on the time steps of the input data
        if frequency is not None:
            anomaly_array = _temporal_reduce(anomaly_array, time_dim=time_dim, **groupby_kwargs, **reduce_kwargs)

    if relative:
        name_tag = "relative anomaly"
        update_attrs = {"units": "%"}
    else:
        name_tag = "anomaly"
        update_attrs = {}

    return _update_anomaly_array(anomaly_array, dataarray, var_name, name_tag, update_attrs, how_label=how_label)


def _update_anomaly_array(anomaly_array, original_array, var_name, name_tag, update_attrs, how_label=None):
    """Name the anomaly array and attach attributes derived from the original array.

    Parameters
    ----------
    anomaly_array :
        Array holding the anomaly values, must provide `rename` and `assign_attrs`.
    original_array :
        Array the anomaly was calculated from, its `attrs` are used as the base attributes.
    var_name :
        Name to give to the anomaly array, it is converted to a string.
    name_tag : str
        Tag appended to the `standard_name` (spaces replaced by underscores) and
        `long_name` attributes, if they are present.
    update_attrs : dict
        Attributes which take precedence over those of `original_array`.
    how_label : str (optional)
        If not None, it is appended to `var_name`, separated by an underscore.

    Returns
    -------
    The renamed array with the merged attributes assigned.

    """
    new_name = f"{var_name}" if how_label is None else f"{var_name}_{how_label}"

    # Merge into a new dictionary, entries of update_attrs win over the original attributes
    merged_attrs = {**original_array.attrs, **update_attrs}
    if "long_name" in merged_attrs:
        merged_attrs["long_name"] += f" {name_tag}"
    if "standard_name" in merged_attrs:
        merged_attrs["standard_name"] += f"_{name_tag.replace(' ', '_')}"

    return anomaly_array.rename(new_name).assign_attrs(merged_attrs)



[docs]
@_tools.time_dim_decorator
@_tools.groupby_kwargs_decorator(climatology=True)
@_tools.season_order_decorator
def relative_anomaly(*_args, **_kwargs):
    """Calculate the relative anomaly from a reference climatology, i.e. percentage change.

    This is a convenience wrapper: all arguments are forwarded to `anomaly`
    together with `relative=True`. As `relative` is set here, it must not be
    provided by the caller.

    Parameters
    ----------
    dataarray : xarray.DataArray or xarray.Dataset
        The data over which to calculate the anomaly from the reference
        climatology. Must contain a `time` dimension.
    climatology : xarray.DataArray or xarray.Dataset
        Reference climatology data against which the anomaly is to be calculated.
    frequency : str (optional)
        Valid options are `day`, `week` and `month`.
    bin_widths : int or list (optional)
        If `bin_widths` is an `int`, it defines the width of each group bin on
        the frequency provided by `frequency`. If `bin_widths` is a sequence
        it defines the edges of each bin, allowing for non-uniform bin widths.
    time_dim : str (optional)
        Name of the time dimension in the data object, default behaviour is to detect the
        time dimension from the input object
    **_kwargs :
        Any other kwargs that are accepted by `anomaly`

    Returns
    -------
    xarray.DataArray or xarray.Dataset
        The output of `anomaly` called with `relative=True`.

    """
    return anomaly(*_args, relative=True, **_kwargs)




[docs]
@_tools.time_dim_decorator
@_tools.groupby_kwargs_decorator(climatology=True)
@format_handler()
def auto_anomaly(
    dataarray: xr.Dataset | xr.DataArray,
    *_args,
    climatology_range: tuple | None = None,
    climatology_how: str = "mean",
    climatology_frequency: str | None = None,
    relative: bool = False,
    **_kwargs,
):
    """Calculate the anomaly from a climatology derived from the input data itself.

    The reference climatology is first computed from `dataarray` with `reduce`,
    then the anomaly of `dataarray` with respect to that climatology is returned
    by `anomaly`.

    Parameters
    ----------
    dataarray : xarray.DataArray or xarray.Dataset
        The data over which to calculate the anomaly from the reference
        climatology. Must contain a `time` dimension.
    climatology_range : (list or tuple, optional)
        Start and end year of the period to be used for the reference climatology. Default
        is to use the entire time-series.
    climatology_how : string
        Method used to calculate climatology, default is "mean". It is passed as `how`
        to `reduce`.
    climatology_frequency : str (optional)
        Frequency of the climatology, passed as `frequency` to `reduce`.
        Valid options are None (default), `dayofyear`, `weekofyear` and `month`. If None,
        the climatology is calculated over all time-steps
        and the anomaly is returned on the same frequency as the input data.
    frequency : str (optional)
        Valid options are `day`, `week`, `month` and `year`. The default is to return the anomaly on the
        same frequency as the input data.
    bin_widths : int or list (optional)
        If `bin_widths` is an `int`, it defines the width of each group bin on
        the frequency provided by `frequency`. If `bin_widths` is a sequence
        it defines the edges of each bin, allowing for non-uniform bin widths.
    time_dim : str (optional)
        Name of the time dimension in the data object, default behaviour is to detect the
        time dimension from the input object
    relative : bool (optional)
        Return the relative anomaly, i.e. the percentage change w.r.t the climatological period
    **_kwargs :
        Any other kwargs, they are forwarded to `anomaly` only and are not used for
        the calculation of the climatology.

    Returns
    -------
    xarray.DataArray or xarray.Dataset
        The output of `anomaly`.

    """
    # Settings which only concern the calculation of the reference climatology
    climatology_options = {
        "how": climatology_how,
        "climatology_range": climatology_range,
        "frequency": climatology_frequency,
    }
    reference_climatology = reduce(dataarray, *_args, **climatology_options)

    return anomaly(dataarray, reference_climatology, *_args, relative=relative, **_kwargs)