# Copyright 2024-, European Centre for Medium Range Weather Forecasts.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import logging
import typing as T
from copy import deepcopy
import geopandas as gpd
import pandas as pd
import xarray as xr
from earthkit.utils.decorators import format_handler
from numpy import ndarray
from earthkit.transforms._tools import (
array_namespace_from_object,
ensure_list,
get_how_xp,
get_spatial_info,
standard_weights,
)
logger = logging.getLogger(__name__)
def _transform_from_latlon(lat, lon):
"""Return an Affine transformation of input 1D arrays of lat and lon.
This assumes that both lat and lon are regular and contiguous.
Parameters
----------
lat : list
arrays or lists of latitude and longitude
lon : list
arrays or lists of latitude and longitude
"""
from affine import Affine
trans = Affine.translation(lon[0] - (lon[1] - lon[0]) / 2, lat[0] - (lat[1] - lat[0]) / 2)
scale = Affine.scale(lon[1] - lon[0], lat[1] - lat[0])
return trans * scale
def rasterize(
shape_list: T.List,
coords: xr.core.coordinates.Coordinates,
lat_key: str = "latitude",
lon_key: str = "longitude",
**kwargs,
) -> xr.DataArray:
"""Rasterize a list of geometries onto the given xarray coordinates.
This only works for regular and contiguous latitude and longitude grids.
Parameters
----------
shape_list : list
List of geometries
coords : xarray.coords
Coordinates of dataarray to be masked
lat_key : str, optional
name of the latitude variables in the coordinates object
lon_key : str, optional
name of the longitude variables in the coordinates object
dtype : type, optional
datatype of the returned mask, default is `int`
kwargs:
Any other kwargs accepted by rasterio.features.rasterize
Returns
-------
xarray.DataArray
A mask where points not inside the shape_list are set to `fill` value
"""
from rasterio import features
transform = _transform_from_latlon(coords[lat_key], coords[lon_key])
out_shape = (len(coords[lat_key]), len(coords[lon_key]))
raster = features.rasterize(shape_list, out_shape=out_shape, transform=transform, **kwargs)
spatial_coords = {lat_key: coords[lat_key], lon_key: coords[lon_key]}
return xr.DataArray(raster, coords=spatial_coords, dims=(lat_key, lon_key))
def mask_contains_points(
shape_list: T.List,
coords: xr.core.coordinates.Coordinates,
lat_key: str = "latitude",
lon_key: str = "longitude",
**_kwargs,
) -> xr.DataArray:
"""Return a mask array for the spatial points of data that lie within shapes in shape_list.
Function uses shapely.contains_xy and handles Polygon, MultiPolygon, and polygons with holes.
It was initially included for use with irregular data but has been constructed to also accept
regular data and return in the same format as the rasterize function.
Parameters
----------
shape_list :
List of geometries
coords : xarray.coords
Coordinates of dataarray to be masked
lat_key : str, optional
name of the latitude variables in the coordinates object
lon_key : str, optional
name of the longitude variables in the coordinates object
dtype : type, optional
datatype of the returned mask, default is `int`
Returns
-------
xarray.DataArray
A mask where points not inside the shape_list are set to `fill` value
"""
import shapely
xp = array_namespace_from_object(coords[lat_key])
lat_dims = coords[lat_key].dims
lon_dims = coords[lon_key].dims
# Assert that latitude and longitude have the same dimensions
# (irregular data, e.g. x,y or obs)
# or the dimensions are themselves (regular data) but we will probably
# just use the rasterize function for the regular case
assert (lat_dims == lon_dims) or (lat_dims == (lat_key,) and lon_dims == (lon_key,))
if lat_dims == (lat_key,) and lon_dims == (lon_key,):
lon_full, lat_full = xp.meshgrid(
coords[lon_key].data,
coords[lat_key].data,
)
else:
lon_full, lat_full = (
coords[lon_key].data,
coords[lat_key].data,
)
# get spatial dims and create output array:
spatial_dims = list(dict.fromkeys(lat_dims + lon_dims))
outdata_shape = tuple(len(coords[dim]) for dim in spatial_dims)
outdata = xp.full(outdata_shape, xp.nan)
# loop over shapes and mask any point that is in the shape
for shape in shape_list:
inside = shapely.contains_xy(shape, lon_full.flat, lat_full.flat)
outdata.flat[inside] = True
out_coords = {coord: coords[coord] for coord in spatial_dims}
outarray = xr.DataArray(outdata, coords=out_coords, dims=spatial_dims)
return outarray
def _geopandas_to_shape_list(geodataframe):
"""Iterate over rows of a geodataframe."""
return [row[1]["geometry"] for row in geodataframe.iterrows()]
def _array_mask_iterator(mask_arrays):
"""Iterate over mask arrays."""
for mask_array in mask_arrays:
yield mask_array > 0
def _shape_mask_iterator(shapes, target, regular=True, **kwargs):
"""Iterate over shape mask methods."""
if isinstance(shapes, gpd.GeoDataFrame):
shapes = _geopandas_to_shape_list(shapes)
if regular:
mask_function = rasterize
else:
mask_function = mask_contains_points
for shape in shapes:
shape_da = mask_function([shape], target.coords, **kwargs)
yield shape_da
def shapes_to_masks(shapes: gpd.GeoDataFrame | list[gpd.GeoDataFrame], target, regular=True, **kwargs):
"""Create a list of masked dataarrays, if possible use the shape_mask_iterator.
Parameters
----------
shapes :
A geodataframe or list of geodataframes containing the polygons for masks
target :
A dataarray to to create a mask for, only the geospatial coordinates are used
regular :
If True, data is on a regular grid so use rasterize method, if False use mask_contains_points
all_touched :
If True, all pixels touched by geometries will be considered in,
if False, only pixels whose center is within. Default is False.
Only valid for regular data.
kwargs:
kwargs accepted by the masking methods, rasterize or mask_contains_points
Returns
-------
list[xarray.DataArray]
A list of masks where points inside each geometry are 1, and those outside are xp.nan
"""
if isinstance(shapes, gpd.GeoDataFrame):
shapes = _geopandas_to_shape_list(shapes)
if regular:
mask_function = rasterize
else:
mask_function = mask_contains_points
return [mask_function([shape], target.coords, **kwargs) for shape in shapes]
def shapes_to_mask(shapes, target, regular=True, **kwargs):
"""Create a single masked dataarray based on all features in shapes.
If possible use the shape_mask_iterator.
Parameters
----------
shapes :
A geodataframe or list of geodataframes containing the polygons for masks
target :
A dataarray to to create a mask for, only the geospatial coordinates are used
regular :
If True, data is on a regular grid so use rasterize method,
if False use mask_contains_points
all_touched :
If True, all pixels touched by geometries will be considered in,
if False, only pixels whose center is within. Default is False.
Only valid for regular data.
kwargs:
kwargs accepted by the masking methods, rasterize or mask_contains_points
Returns
-------
xarray.DataArray
A mask where points inside any geometry are 1, and those outside are xp.nan
"""
if isinstance(shapes, gpd.GeoDataFrame):
shapes = _geopandas_to_shape_list(shapes)
if regular:
mask_function = rasterize
else:
mask_function = mask_contains_points
return mask_function(shapes, target.coords, **kwargs)
def get_mask_dim_index(
mask_dim: str | None | T.Dict[str, T.Any],
geodataframe: gpd.geodataframe.GeoDataFrame,
default_index_name: str = "index",
):
if isinstance(mask_dim, str):
if mask_dim in geodataframe:
mask_dim_index = pd.Index(geodataframe[mask_dim])
else:
mask_dim_index = geodataframe.index.rename(mask_dim)
elif isinstance(mask_dim, dict):
assert len(mask_dim) == 1, "If provided as a dictionary, mask_dim should have only one key value pair"
_mask_dim, _mask_dim_values = mask_dim.items()
mask_dim_index = pd.Index(_mask_dim_values, name=_mask_dim)
elif mask_dim is None:
# Use the index of the data frame
mask_dim_index = geodataframe.index
if mask_dim_index.name is None:
mask_dim_index = mask_dim_index.rename(default_index_name)
else:
raise ValueError("Unrecognised mask_dim format")
return mask_dim_index
def _area_to_geodataframe(area: dict) -> gpd.GeoDataFrame:
"""Convert an area dictionary to a GeoDataFrame with a bounding box polygon.
Parameters
----------
area : dict
Dictionary with keys ``"north"``, ``"south"``, ``"east"``, ``"west"``
defining the bounding box.
Returns
-------
gpd.GeoDataFrame
A GeoDataFrame containing a single bounding-box polygon.
"""
from shapely.geometry import box
required_keys = {"north", "south", "east", "west"}
missing = required_keys - set(area)
if missing:
raise ValueError(f"area dictionary is missing required keys: {missing}")
if area["west"] > area["east"]:
raise ValueError("Areas that cross the anti-meridian (where 'west' > 'east') are not currently supported.")
polygon = box(area["west"], area["south"], area["east"], area["north"])
return gpd.GeoDataFrame(geometry=[polygon])
def area_to_geodataframe_decorator(func):
"""Decorator that converts an ``area`` kwarg to a ``geodataframe`` kwarg.
If ``area`` is provided as a dictionary with keys
``{"north", "south", "east", "west"}``, it is converted to a
`geopandas.GeoDataFrame` with a single bounding-box polygon and passed
as the ``geodataframe`` argument.
Raises ``ValueError`` if both ``area`` and ``geodataframe`` are provided.
.. note::
Areas that cross the anti-meridian (where ``west > east``) are not
currently supported.
"""
@functools.wraps(func)
def wrapper(*args, area: dict | None = None, **kwargs):
positional_geodataframe = args[1] if len(args) >= 2 else None
keyword_geodataframe = kwargs.get("geodataframe")
if area is not None and (positional_geodataframe is not None or keyword_geodataframe is not None):
raise ValueError("Only one of 'area' or 'geodataframe' may be provided, not both.")
if area is not None:
area_geodataframe = _area_to_geodataframe(area)
if len(args) >= 2:
args = (args[0], area_geodataframe, *args[2:])
kwargs.pop("geodataframe", None)
else:
kwargs["geodataframe"] = area_geodataframe
return func(*args, **kwargs)
return wrapper
[docs]
@format_handler()
@area_to_geodataframe_decorator
def mask(
dataarray: xr.Dataset | xr.DataArray,
geodataframe: gpd.geodataframe.GeoDataFrame | None = None,
mask_dim: str | None = None,
lat_key: str | None = None,
lon_key: str | None = None,
chunk: bool = True,
union_geometries: bool = False,
area: dict | None = None,
**mask_kwargs,
) -> xr.Dataset | xr.DataArray:
"""Apply multiple shape masks to some gridded data.
Each feature in shape is treated as an individual mask to apply to
data. The data provided is returned with an additional dimension equal in
length to the number of features in the shape object, this can result in very
large files which will slow down your script. It may be better to loop
over individual features, or directly apply the mask with the shapes.reduce.
Parameters
----------
dataarray :
Xarray data object (must have geospatial coordinates).
geodataframe :
Geopandas Dataframe containing the polygons for aggregations.
Either ``geodataframe`` or ``area`` must be provided, but not both.
area : dict, optional
Dictionary with keys ``"north"``, ``"south"``, ``"east"``, ``"west"``
defining a bounding box. Converted to a single-polygon GeoDataFrame
internally. Areas that cross the anti-meridian (``west > east``) are
not currently supported.
mask_dim :
dimension that will be created to accommodate the masked arrays, default is the index
of the geodataframe
all_touched :
If True, all pixels touched by geometries will be considered in,
if False, only pixels whose center is within. Default is False.
Only valid for regular data.
lat_key :
key for latitude variable, default behaviour is to detect variable keys.
lon_key :
key for longitude variable, default behaviour is to detect variable keys.
chunk : bool
Boolean to indicate whether to use chunking, default = `True`.
This is advised as spatial.masks can create large results. If you are working with small
arrays, or you have implemented you own chunking rules you may wish to disable it.
union_geometries : bool
Boolean to indicate whether to union all geometries before masking.
Default is `False`, which will apply each geometry in the geodataframe as a separate mask.
mask_kwargs :
Any kwargs to pass into the mask method
Returns
-------
xarray.Dataset | xarray.DataArray
A masked data array with dimensions [feature_id] + [data.dims].
Each slice of layer corresponds to a feature in layer.
"""
if geodataframe is None:
raise ValueError("Either 'geodataframe' or 'area' must be provided.")
spatial_info = get_spatial_info(dataarray, lat_key=lat_key, lon_key=lon_key)
# Get spatial info required by mask functions:
mask_kwargs = {**mask_kwargs, **{key: spatial_info[key] for key in ["lat_key", "lon_key", "regular"]}}
mask_dim_index = get_mask_dim_index(mask_dim, geodataframe)
if union_geometries:
loop_masks = [shapes_to_mask(geodataframe, dataarray, **mask_kwargs)]
else:
loop_masks = _shape_mask_iterator(geodataframe, dataarray, **mask_kwargs)
masked_arrays = []
for this_mask in loop_masks:
this_masked_array = dataarray.where(this_mask > 0)
if chunk:
this_masked_array = this_masked_array.chunk()
masked_arrays.append(this_masked_array.copy())
if union_geometries:
out = masked_arrays[0]
else:
# TODO: remove ignore type if xarray concat typing is updated
out = xr.concat(masked_arrays, dim=mask_dim_index.name) # type: ignore
if chunk:
out = out.chunk({mask_dim_index.name: 1})
out = out.assign_coords({mask_dim_index.name: mask_dim_index})
out.attrs.update(geodataframe.attrs)
return out
[docs]
@format_handler()
@area_to_geodataframe_decorator
def reduce(
dataarray: xr.Dataset | xr.DataArray,
geodataframe: gpd.GeoDataFrame | None = None,
mask_arrays: xr.DataArray | list[xr.DataArray] | None = None,
area: dict | None = None,
**kwargs,
) -> xr.Dataset | xr.DataArray:
"""Apply a shape object to an xarray.DataArray object using the specified 'how' method.
Geospatial coordinates are reduced to a dimension representing the list of features in the shape object.
Parameters
----------
dataarray :
Xarray data object (must have geospatial coordinates).
geodataframe :
Geopandas Dataframe containing the polygons for aggregations.
Cannot be provided together with ``area``.
area : dict, optional
Dictionary with keys ``"north"``, ``"south"``, ``"east"``, ``"west"``
defining a bounding box. Converted to a single-polygon GeoDataFrame
internally. Areas that cross the anti-meridian (``west > east``) are
not currently supported.
mask_arrays :
precomputed mask array[s], if provided this will be used instead of creating a new mask.
They must be on the same spatial grid as the dataarray.
how :
method used to apply mask. Default='mean', which calls xp.nanmean
weights :
Provide weights for aggregation, also accepts recognised keys for weights, e.g.
'latitude'
lat_key/lon_key :
key for latitude/longitude variable, default behaviour is to detect variable keys.
extra_reduce_dims :
any additional dimensions to aggregate over when reducing over spatial dimensions
mask_dim :
dimension that will be created after the reduction of the spatial dimensions, default is the index
of the dataframe
all_touched :
If True, all pixels touched by geometries will be considered in,
if False, only pixels whose center is within. Default is False.
Only valid for regular data.
mask_kwargs :
Any kwargs to pass into the mask method
mask_arrays :
precomputed mask array[s], if provided this will be used instead of creating a new mask.
They must be on the same spatial grid as the dataarray.
return_as :
what format to return the data object, `pandas` or `xarray`. Work In Progress
compact :
If True, return a compact pandas.DataFrame with the reduced data as a new column.
If False, return a fully expanded pandas.DataFrame.
Only valid if return_as is `pandas`
how_label :
label to append to variable name in returned object, default is not to append
kwargs :
kwargs recognised by the how function
Returns
-------
xarray.Dataset | xarray.DataArray
A data array with dimensions `features` + `data.dims not in 'lat','lon'`.
Each slice of layer corresponds to a feature in layer.
"""
assert not (geodataframe is not None and mask_arrays is not None), (
"Either a geodataframe or mask arrays must be provided, not both"
)
if mask_arrays is not None:
_mask_arrays: list[xr.DataArray] | None = ensure_list(mask_arrays)
else:
_mask_arrays = None
if isinstance(dataarray, xr.Dataset):
return_as: str = kwargs.pop("return_as", "xarray")
if return_as in ["xarray"]:
out_ds = xr.Dataset().assign_attrs(dataarray.attrs)
for var in dataarray.data_vars:
out_da = _reduce_dataarray_as_xarray(
dataarray[var], geodataframe=geodataframe, mask_arrays=_mask_arrays, **kwargs
)
out_ds[out_da.name] = out_da
return out_ds
elif "pandas" in return_as:
if geodataframe is not None:
out = geodataframe
for var in dataarray.data_vars:
out = _reduce_dataarray_as_pandas(dataarray[var], geodataframe=out, **kwargs)
else:
out = None
for var in dataarray.data_vars:
_out = _reduce_dataarray_as_pandas(dataarray[var], mask_arrays=_mask_arrays, **kwargs)
if out is None:
out = _out
else:
out = pd.merge(out, _out)
return out
else:
raise TypeError("Return as type not recognised or incompatible with inputs")
else:
return _reduce_dataarray_as_xarray(dataarray, geodataframe=geodataframe, mask_arrays=_mask_arrays, **kwargs)
def _reduce_dataarray_as_xarray(
dataarray: xr.DataArray,
geodataframe: gpd.GeoDataFrame | None = None,
mask_arrays: list[xr.DataArray] | None = None,
how: T.Callable | str = "mean",
weights: None | str | ndarray = None,
lat_key: str | None = None,
lon_key: str | None = None,
extra_reduce_dims: list | str = [],
mask_dim: str | None = None,
how_label: str | None = None,
squeeze: bool = True,
all_touched: bool = False,
mask_kwargs: dict[str, T.Any] = dict(),
return_geometry_as_coord: bool = False,
**reduce_kwargs,
) -> xr.DataArray:
"""Reduce an xarray.DataArray object over its geospatial dimensions using the specified 'how' method.
If a geodataframe is provided the DataArray is reduced over each feature in the geodataframe.
Geospatial coordinates are reduced to a dimension representing the list of features in the shape object.
Parameters
----------
dataarray :
Xarray data object (must have geospatial coordinates).
geodataframe :
Geopandas Dataframe containing the polygons for aggregations
mask_arrays :
precomputed mask array[s], if provided this will be used instead of creating a new mask.
They must be on the same spatial grid as the dataarray.
how :
method used to apply mask. Default='mean', which calls xp.nanmean
weights :
Provide weights for aggregation, also accepts recognised keys for weights, e.g.
'latitude'
lat_key :
key for latitude variable, default behaviour is to detect variable keys.
lon_key :
key for longitude variable, default behaviour is to detect variable keys.
extra_reduce_dims :
any additional dimensions to aggregate over when reducing over spatial dimensions
mask_dim :
dimension that will be created after the reduction of the spatial dimensions, default = `"index"`
return_as :
what format to return the data object, `"pandas"` or `"xarray"`. Work In Progress
how_label :
label to append to variable name in returned object, default is `how`
mask_kwargs :
Any kwargs to pass into the mask method
reduce_kwargs :
kwargs recognised by the how function
return_geometry_as_coord :
include the geometries as a coordinate in the returned xarray object. WARNING: geometries are not
serialisable objects, therefore this xarray will not be saveable as netCDF.
all_touched :
If True, all pixels touched by geometries will be considered in,
if False, only pixels whose center is within. Default is False.
Only valid for regular data.
squeeze :
If True, squeeze the output xarray object, default is True
Returns
-------
xarray.DataArray
A data array with dimensions [features] + [data.dims not in ['lat','lon']].
Each slice of layer corresponds to a feature in layer
"""
xp = array_namespace_from_object(dataarray)
extra_out_attrs = {}
how_str: None | str = None
if weights is None:
# convert how string to a method to apply
if isinstance(how, str):
how_str = deepcopy(how)
how = reduce_how = get_how_xp(how, xp=xp)
else:
reduce_how = how
assert callable(how), f"how must be a callable: {how}"
if how_str is None:
# get label from how method
how_str = how.__name__
else:
# Create any standard weights, e.g. latitude.
# TODO: handle kwargs better, currently only lat_key is accepted
if isinstance(weights, str):
_weights = standard_weights(dataarray, weights, lat_key=lat_key)
else:
_weights = weights
# We ensure the callable is a string
if callable(how):
how = weighted_how = how.__name__
else:
weighted_how = how
if how_str is None:
how_str = how
how_str = how_str or how_label
new_long_name_components = [
comp for comp in [how_str, dataarray.attrs.get("long_name", dataarray.name)] if comp is not None
]
new_long_name = " ".join(new_long_name_components)
extra_out_attrs.update({"long_name": new_long_name})
new_short_name_components = [f"{comp}" for comp in [dataarray.name, how_label] if comp is not None]
new_short_name = "_".join(new_short_name_components)
if isinstance(extra_reduce_dims, str):
extra_reduce_dims = [extra_reduce_dims]
spatial_info = get_spatial_info(dataarray, lat_key=lat_key, lon_key=lon_key)
# Get spatial info required by mask functions:
mask_kwargs = {**mask_kwargs, **{key: spatial_info[key] for key in ["lat_key", "lon_key", "regular"]}}
# All touched only valid for rasterize method
if spatial_info["regular"]:
mask_kwargs.setdefault("all_touched", all_touched)
else:
if all_touched:
logger.warning("all_touched only valid for regular data, ignoring")
spatial_dims = spatial_info.get("spatial_dims")
reduce_dims = spatial_dims + extra_reduce_dims
extra_out_attrs.update({"reduce_dims": reduce_dims})
reduce_kwargs.update({"dim": reduce_dims})
# If using a pre-computed mask arrays,
# then iterator is just dataarray*mask_array
if mask_arrays is not None:
masked_data_list = _array_mask_iterator(mask_arrays)
else:
# If no geodataframe, then no mask, so create a dummy mask:
if geodataframe is None:
masked_data_list = [xr.ones_like(dataarray)]
else:
masked_data_list = _shape_mask_iterator(geodataframe, dataarray, **mask_kwargs)
reduced_list = []
for masked_data in masked_data_list:
this = dataarray.where(masked_data > 0, other=xp.nan)
# If weighted, use xarray weighted arrays which
# correctly handle missing values etc.
if weights is not None:
this_weighted = this.weighted(_weights)
reduced_list.append(this_weighted.__getattribute__(weighted_how)(**reduce_kwargs))
else:
reduced = this.reduce(reduce_how, **reduce_kwargs).compute()
reduced = reduced.assign_attrs(dataarray.attrs)
reduced_list.append(reduced)
if squeeze:
reduced_list = [red_data.squeeze() for red_data in reduced_list]
# If no geodataframe, there is just one reduced array
if geodataframe is not None:
mask_dim_index = get_mask_dim_index(mask_dim, geodataframe)
out_xr = xr.concat(reduced_list, dim=mask_dim_index.name)
out_xr = out_xr.assign_coords({mask_dim_index.name: mask_dim_index})
elif mask_dim is None and len(reduced_list) == 1:
out_xr = reduced_list[0]
else:
_concat_dim_name = mask_dim or "index"
out_xr = xr.concat(reduced_list, dim=_concat_dim_name)
out_xr = out_xr.rename(new_short_name)
if geodataframe is not None:
if return_geometry_as_coord:
out_xr = out_xr.assign_coords(
**{"geometry": (mask_dim_index.name, [_g for _g in geodataframe["geometry"]])}
)
out_xr = out_xr.assign_attrs({**geodataframe.attrs, **extra_out_attrs})
return out_xr
def _reduce_dataarray_as_pandas(
dataarray: xr.DataArray, geodataframe: gpd.GeoDataFrame | None = None, compact: bool = False, **kwargs
) -> pd.DataFrame:
"""Reduce an xarray.DataArray object over its geospatial dimensions using the specified 'how' method.
If a geodataframe is provided the DataArray is reduced over each feature in the geodataframe.
Geospatial coordinates are reduced to a dimension representing the list of features in the shape object.
Parameters
----------
dataarray :
Xarray data object (must have geospatial coordinates).
geodataframe :
Geopandas Dataframe containing the polygons for aggregations
compact :
If True, return a compact pandas.DataFrame with the reduced data as a new column.
If False, return a fully expanded pandas.DataFrame.
kwargs :
kwargs accepted by the :function:_reduce_dataarray_as_xarray function
Returns
-------
pandas.DataFrame
A pandas.DataFrame similar to the geopandas dataframe, with the reduced data
added as a new column.
"""
out_xr = _reduce_dataarray_as_xarray(dataarray, geodataframe=geodataframe, **kwargs)
reduce_attrs = {f"{dataarray.name}": dataarray.attrs, f"{out_xr.name}": out_xr.attrs}
if geodataframe is None:
mask_dim = kwargs.get("mask_dim", "index")
if mask_dim not in out_xr.dims:
out_xr = xr.concat([out_xr], dim=mask_dim)
# If no geodataframe, then just convert xarray to dataframe
out = out_xr.to_dataframe()
# Add attributes to the dataframe
out.attrs.update({"reduce_attrs": reduce_attrs})
return out
# Otherwise, splice the geodataframe and reduced xarray
reduce_attrs = {
**geodataframe.attrs.get("reduce_attrs", {}),
**reduce_attrs,
}
# TODO: somehow remove repeat call of get_mask_dim_index (see _reduce_dataarray_as_xarray)
mask_dim_index = get_mask_dim_index(kwargs.get("mask_dim"), geodataframe)
mask_dim_name = mask_dim_index.name
out = geodataframe.set_index(mask_dim_index)
if mask_dim_name not in out_xr.dims:
out_xr = xr.concat([out_xr], dim=mask_dim_name)
if not compact: # Return as a fully expanded pandas.DataFrame
# Convert to DataFrame
out = out.join(out_xr.to_dataframe())
else:
# add the reduced data into a new column as a numpy array,
# store the dim information in the attributes
_out_dims = [str(dim) for dim in dataarray.coords if dim in out_xr.dims]
out_dims = {dim: dataarray[dim].data for dim in _out_dims}
reduce_attrs[f"{out_xr.name}"].update({"dims": out_dims})
reduced_list = [
out_xr.sel(**{mask_dim_name: mask_dim_value}).data for mask_dim_value in out_xr[mask_dim_name].data
]
out = out.assign(**{f"{out_xr.name}": reduced_list})
out.attrs.update({"reduce_attrs": reduce_attrs})
return out