Source code for imod.prepare.spatial

import pathlib
from typing import TYPE_CHECKING, Callable, Iterable, Union

import dask
import numba
import numpy as np
import pandas as pd
import scipy.ndimage
import xarray as xr
from numpy.typing import DTypeLike

import imod
from imod.prepare import common, pcg
from imod.util.imports import MissingOptionalModule
from imod.util.spatial import _polygonize

# since rasterio is a big dependency that is sometimes hard to install
# and not always required, we made this an optional dependency
try:
    import rasterio
    import rasterio.features
    import rasterio.warp
except ImportError:
    rasterio = MissingOptionalModule("rasterio")

if TYPE_CHECKING:
    import geopandas as gpd


def round_extent(extent, cellsize):
    """Increases the extent until all sides lie on a coordinate
    divisible by cellsize."""
    xmin, ymin, xmax, ymax = extent
    xmin = np.floor(xmin / cellsize) * cellsize
    ymin = np.floor(ymin / cellsize) * cellsize
    xmax = np.ceil(xmax / cellsize) * cellsize
    ymax = np.ceil(ymax / cellsize) * cellsize
    return xmin, ymin, xmax, ymax
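
# Illustrative example (not part of the original module; input values are made
# up): rounding an arbitrary extent outward onto a 25.0 cellsize grid.
#
#     round_extent((312.4, 8405.1, 5501.7, 9120.3), 25.0)
#     # -> (300.0, 8400.0, 5525.0, 9125.0)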


def round_z(z_extent, dz):
    """Increases the extent until all sides lie on a coordinate
    divisible by dz."""
    zmin, zmax = z_extent
    zmin = np.floor(zmin / dz) * dz
    zmax = np.ceil(zmax / dz) * dz
    return zmin, zmax


def _fill_np(data, invalid):
    """Basic nearest neighbour interpolation"""
    # see: https://stackoverflow.com/questions/5551286/filling-gaps-in-a-numpy-array
    ind = scipy.ndimage.distance_transform_edt(
        invalid, return_distances=False, return_indices=True
    )
    return data[tuple(ind)]
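
# Minimal sketch of what _fill_np does (values here are made up): invalid cells
# take the value of their nearest valid cell, as located by
# scipy.ndimage.distance_transform_edt.
#
#     data = np.array([1.0, np.nan, np.nan, 4.0])
#     _fill_np(data, np.isnan(data))  # -> array([1., 1., 4., 4.])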


def fill(da, invalid=None, by=None):
    """
    Replace the value of invalid ``da`` cells (indicated by ``invalid``)
    using basic nearest neighbour interpolation.

    Parameters
    ----------
    da: xr.DataArray with gaps
        array containing missing values
    invalid: xr.DataArray
        a binary array of same shape as ``da``.
        data values are replaced where invalid is True
        If None (default), uses: ``invalid = np.isnan(da)``
    by: str, optional
        dimension over which the array will be filled, one by one.
        See the examples.

    Returns
    -------
    xarray.DataArray
        with the same coordinates as the input.

    Examples
    --------
    A common use case is filling holes in a DataArray, filling it with the
    value of its nearest (valid) neighbor:

    >>> filled = imod.prepare.fill(da)

    In case of a tie (e.g. neighbors in x and y are both one cell removed),
    the neighbor in the last dimension is chosen (for rasters, that's
    generally x).

    A typical use case is filling a 3D array (layer, y, x), but only in the
    horizontal dimensions. The ``by`` keyword can be used to do this:

    >>> filled = imod.prepare.fill(da, by="layer")

    In this case, the array is filled one layer at a time.
    """
    out = xr.full_like(da, np.nan)
    if invalid is None:
        invalid = np.isnan(da)
    if by:
        for coordvalue in da[by]:
            d = {by: coordvalue}
            out.sel(d)[...] = _fill_np(da.sel(d).values, invalid.sel(d).values)
    else:
        out.values = _fill_np(da.values, invalid.values)
    return out


def laplace_interpolate(
    source, ibound=None, close=0.01, mxiter=5, iter1=50, relax=0.98
):
    """
    Fills gaps in ``source`` by interpolating from existing values using
    Laplace interpolation.

    Parameters
    ----------
    source : xr.DataArray of floats with dims (y, x)
        Data values to interpolate.
    ibound : xr.DataArray of bool with dims (y, x)
        Precomputed array which marks where to interpolate.
    close : float
        Closure criterion of the iterative solver. Should be one to two
        orders of magnitude smaller than the desired accuracy.
    mxiter : int
        Outer iterations of the iterative solver.
    iter1 : int
        Inner iterations of the iterative solver. Should not exceed 50.
    relax : float
        Iterative solver relaxation parameter. Should be between 0 and 1.

    Returns
    -------
    interpolated : xr.DataArray with dims (y, x)
        source, with interpolated values where ibound equals 1
    """
    solver = pcg.PreconditionedConjugateGradientSolver(
        close, close * 1.0e6, mxiter, iter1, relax
    )

    if not source.dims == ("y", "x"):
        raise ValueError('source dims must be ("y", "x")')

    # expand dims to make 3d
    source3d = source.expand_dims("layer")
    if ibound is None:
        iboundv = xr.full_like(source3d, 1, dtype=np.int32).values
    else:
        if not ibound.dims == ("y", "x"):
            raise ValueError('ibound dims must be ("y", "x")')
        if not ibound.shape == source.shape:
            raise ValueError("ibound and source must have the same shape")
        iboundv = ibound.expand_dims("layer").astype(np.int32).values

    has_data = source3d.notnull().values
    iboundv[has_data] = -1
    # Set start interpolated estimate to 0.0
    hnew = source3d.fillna(0.0).values.astype(np.float64)
    shape = iboundv.shape
    nlay, nrow, ncol = shape
    nodes = nlay * nrow * ncol

    # Allocate work arrays
    # Not really used now, but might come in handy to implement weights
    cc = np.ones(shape)
    cr = np.ones(shape)
    cv = np.ones(shape)
    rhs = np.zeros(shape)
    hcof = np.zeros(shape)
    # Solver work arrays
    res = np.zeros(nodes)
    cd = np.zeros(nodes)
    v = np.zeros(nodes)
    ss = np.zeros(nodes)
    p = np.zeros(nodes)

    # Picard iteration
    converged = False
    outer_iteration = 0
    while not converged and outer_iteration < mxiter:
        # Mutates hnew
        converged = solver.solve(
            hnew=hnew,
            cc=cc,
            cr=cr,
            cv=cv,
            ibound=iboundv,
            rhs=rhs,
            hcof=hcof,
            res=res,
            cd=cd,
            v=v,
            ss=ss,
            p=p,
        )
        outer_iteration += 1
    else:
        if not converged:
            raise RuntimeError("Failed to converge")

    hnew[iboundv == 0] = np.nan
    return source.copy(data=hnew[0])


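# Usage sketch (hypothetical data, not from the original module): fill the NaN
# gaps of a 2D DataArray with dims ("y", "x"), optionally restricting where
# interpolation occurs with ``ibound``.
#
#     interpolated = laplace_interpolate(source)                 # fill everywhere
#     interpolated = laplace_interpolate(source, ibound=ibound)  # fill where ibound equals 1
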
def rasterize(geodataframe, like, column=None, fill=np.nan, **kwargs):
    """
    Rasterize a geopandas GeoDataFrame onto the given xarray coordinates.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
    column : str, int, float
        column name of geodataframe to burn into raster
    like : xarray.DataArray
        Example DataArray. The rasterized result will match the shape and
        coordinates of this DataArray.
    fill : float, int
        Fill value for nodata areas. Optional, default value is np.nan.
    kwargs : additional keyword arguments for rasterio.features.rasterize.
        See: https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize

    Returns
    -------
    rasterized : xarray.DataArray
        Vector data rasterized. Matches shape and coordinates of ``like``.
    """
    if column is not None:
        shapes = list(zip(geodataframe.geometry, geodataframe[column]))
    else:
        shapes = list(geodataframe.geometry)

    # shapes must be an iterable
    try:
        iter(shapes)
        iterable_shapes: Iterable = shapes
    except TypeError:
        iterable_shapes: Iterable = (shapes,)  # type: ignore[no-redef]

    raster = rasterio.features.rasterize(
        iterable_shapes,
        out_shape=like.shape,
        fill=fill,
        transform=imod.util.spatial.transform(like),
        **kwargs,
    )

    return xr.DataArray(raster, like.coords, like.dims)


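# Usage sketch (hypothetical file, column, and template names): burn a
# GeoDataFrame attribute onto the grid of an existing DataArray. Requires
# geopandas to read the vector file.
#
#     gdf = gpd.read_file("zones.shp")
#     raster = rasterize(gdf, like=template, column="zone_id", fill=-1)
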
def polygonize(da: xr.DataArray) -> "gpd.GeoDataFrame":
    """
    Polygonize a 2D-DataArray into a GeoDataFrame of polygons.

    Parameters
    ----------
    da : xr.DataArray

    Returns
    -------
    polygonized : geopandas.GeoDataFrame
    """
    return _polygonize(da)


def _handle_dtype(dtype, nodata):
    # Largely taken from rasterio.dtypes
    # https://github.com/mapbox/rasterio/blob/master/rasterio/dtypes.py
    # Not supported:
    # GDT_CInt16 = 8, GDT_CInt32 = 9, GDT_CFloat32 = 10, GDT_CFloat64 = 11
    dtype_mapping = {
        "uint8": 1,  # GDT_Byte
        "uint16": 2,  # GDT_UInt16
        "int16": 3,  # GDT_Int16
        "uint32": 4,  # GDT_UInt32
        "int32": 5,  # GDT_Int32
        "float32": 6,  # GDT_Float32
        "float64": 7,  # GDT_Float64
        "uint64": 12,  # GDT_UInt64
        "int64": 13,  # GDT_Int64
    }
    dtype_ranges = {
        "uint8": (0, 255),
        "uint16": (0, 65535),
        "int16": (-32768, 32767),
        "uint32": (0, 4294967295),
        "int32": (-2147483648, 2147483647),
        "float32": (-3.4028235e38, 3.4028235e38),
        "float64": (-1.7976931348623157e308, 1.7976931348623157e308),
        "uint64": (0, 18446744073709551615),
        "int64": (-9223372036854775808, 9223372036854775807),
    }

    def format_invalid(str_dtype):
        str_dtypes = dtype_mapping.keys()
        return "Invalid dtype: {0}, must be one of: {1}".format(
            str_dtype, ", ".join(str_dtypes)
        )

    str_dtype = str(np.dtype(dtype))
    if str_dtype not in dtype_mapping.keys():
        raise ValueError(format_invalid(str_dtype))
    gdal_dtype = dtype_mapping[str_dtype]

    if nodata is None:
        if np.issubdtype(dtype, np.integer):
            # Default to lowest value in case of integers
            nodata = dtype_ranges[str_dtype][0]
        elif np.issubdtype(dtype, np.floating):
            # Default to NaN in case of floats
            nodata = np.nan
    else:
        lower, upper = dtype_ranges[str_dtype]
        if nodata < lower or nodata > upper:
            raise ValueError(f"Nodata value {nodata} is out of bounds for {str_dtype}")

    return gdal_dtype, nodata


def gdal_rasterize(
    path,
    column,
    like=None,
    nodata=None,
    dtype=None,
    spatial_reference=None,
    all_touched=False,
):
    """
    Use GDAL to rasterize a vector file into an xarray.DataArray.

    Can be significantly more efficient than rasterize. This doesn't load the
    vector data into a GeoDataFrame and loop over the individual shapely
    geometries like rasterio.rasterize does, but loops over the features
    within GDAL instead.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    like : xr.DataArray, optional
        example of raster
    nodata : int, float, optional
    dtype : numpy.dtype, optional
    spatial_reference : dict, optional
        Optional dict to avoid allocating the like DataArray. Used if ``like``
        is None. Dict has keys "bounds" and "cellsizes", with:

        * bounds = (xmin, xmax, ymin, ymax)
        * cellsizes = (dx, dy)
    all_touched : bool
        If True: all pixels touched by lines or polygons will be updated, not
        just those on the line render path, or whose center point is within
        the polygon. Default value is False.

    Returns
    -------
    rasterized : xr.DataArray
    """
    from osgeo import gdal, ogr

    if isinstance(path, pathlib.Path):
        p = path
        path = str(p)
    else:
        p = pathlib.Path(path)

    if not p.exists():
        raise FileNotFoundError(f"No such file: {path}")

    if dtype is None:
        if like is None:
            raise ValueError("If ``like`` is not provided, ``dtype`` has to be given")
        else:
            dtype = like.dtype
    gdal_dtype, nodata = _handle_dtype(dtype, nodata)

    # Exceptions will get raised on anything >= gdal.CE_Failure
    gdal.UseExceptions()

    # An attempt at decent errors
    class GdalErrorHandler:
        def __init__(self):
            self.err_level = gdal.CE_None
            self.err_no = 0
            self.err_msg = ""

        def handler(self, err_level, err_no, err_msg):
            self.err_level = err_level
            self.err_no = err_no
            self.err_msg = err_msg

    error = GdalErrorHandler()
    handler = error.handler
    gdal.PushErrorHandler(handler)

    # Get spatial data from template
    if like is not None:
        dx, xmin, _, dy, _, ymax = imod.util.spatial.spatial_reference(like)
        nrow, ncol = like.shape
    else:
        cellsizes = spatial_reference["cellsizes"]
        bounds = spatial_reference["bounds"]
        dx, dy = cellsizes
        if not isinstance(dx, (int, float)) or not isinstance(dy, (int, float)):
            raise ValueError("Cannot rasterize to a non-equidistant grid")
        coords = imod.util.spatial._xycoords(bounds, cellsizes)
        xmin, _, _, ymax = bounds
        nrow = coords["y"].size
        ncol = coords["x"].size
        dims = ("y", "x")

    # File will be closed when vector is dereferenced, after return
    vector = ogr.Open(path)
    vector_layer = vector.GetLayer()

    memory_driver = gdal.GetDriverByName("MEM")
    memory_raster = memory_driver.Create("", ncol, nrow, 1, gdal_dtype)
    memory_raster.SetGeoTransform([xmin, dx, 0, ymax, 0, dy])
    memory_band = memory_raster.GetRasterBand(1)
    memory_band.SetNoDataValue(nodata)
    memory_band.Fill(nodata)

    options = [f"ATTRIBUTE={column}", f"ALL_TOUCHED={str(all_touched).upper()}"]
    gdal.RasterizeLayer(memory_raster, [1], vector_layer, None, None, [1], options)
    if error.err_level >= gdal.CE_Warning:
        message = error.err_msg
        if message.startswith(
            "Failed to fetch spatial reference on layer"
        ) and message.endswith("assuming matching coordinate systems."):
            pass
        else:
            raise RuntimeError("GDAL error: " + error.err_msg)

    if like is not None:
        rasterized = like.copy(data=memory_raster.ReadAsArray())
    else:
        rasterized = xr.DataArray(memory_raster.ReadAsArray(), coords, dims)

    return rasterized


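# Usage sketch (hypothetical bounds, file, and column names): rasterize without
# a ``like`` DataArray by passing ``spatial_reference`` and ``dtype`` directly.
# Note that dy is negative, which matches how the helpers in this module
# (e.g. _celltable) build their spatial_reference dicts.
#
#     spatial_reference = {"bounds": (0.0, 1000.0, 0.0, 500.0), "cellsizes": (5.0, -5.0)}
#     rasterized = gdal_rasterize(
#         "zones.shp", "zone_id", dtype=np.int32, spatial_reference=spatial_reference
#     )
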
@numba.njit(cache=True)
def _cell_count(src, values, frequencies, nodata, *inds_weights):
    """
    numba compiled function to count the number of src cells occurring in the
    dst cells.

    Parameters
    ----------
    src : np.array
    values : np.array
        work array to store the unique values
    frequencies : np.array
        work array to store the tallied counts
    nodata : int, float
    inds_weights : tuple of np.arrays
        Contains indices of dst, indices of src, and weights.

    Returns
    -------
    tuple of np.arrays

        * row_indices
        * col_indices
        * values
        * frequencies
    """
    jj, blocks_iy, blocks_weights_y, kk, blocks_ix, blocks_weights_x = inds_weights

    # Use list for dynamic allocation, since we don't know number of rows in
    # advance.
    row_indices = []
    col_indices = []
    value_list = []
    count_list = []

    # j, k are indices of dst array
    # block_i contains indices of src array
    # block_w contains weights of src array
    for countj, j in enumerate(jj):
        block_iy = blocks_iy[countj]
        block_wy = blocks_weights_y[countj]
        for countk, k in enumerate(kk):
            block_ix = blocks_ix[countk]
            block_wx = blocks_weights_x[countk]
            # TODO: use weights in frequency count, and area sum?
            # Since src is equidistant, normed weights are easy to calculate.

            # Add the values and weights per cell in multi-dim block
            value_count = 0
            for iy, wy in zip(block_iy, block_wy):
                if iy < 0:
                    break
                for ix, wx in zip(block_ix, block_wx):
                    if ix < 0:
                        break
                    v = src[iy, ix]
                    if v == nodata:  # Skip nodata cells
                        continue
                    # Work on a single destination cell
                    # Count the number of polygon id's occurring in the cell
                    # a new row per id
                    found = False
                    for i in range(value_count):
                        if v == values[i]:
                            frequencies[i] += 1
                            found = True
                            break
                    if not found:
                        values[value_count] = v
                        frequencies[value_count] = 1
                        value_count += 1
                        # Add a new entry
                        row_indices.append(j)
                        col_indices.append(k)

            # Store for output
            value_list.extend(values[:value_count])
            count_list.extend(frequencies[:value_count])
            # reset storage
            values[:value_count] = 0
            frequencies[:value_count] = 0

    # Cast to numpy arrays
    row_i_arr = np.array(row_indices)
    col_i_arr = np.array(col_indices)
    value_arr = np.array(value_list)
    count_arr = np.array(count_list)

    return row_i_arr, col_i_arr, value_arr, count_arr


def _celltable(
    path: str | pathlib.Path,
    column: str,
    resolution: int,
    like: xr.DataArray,
    dtype: DTypeLike,
    rowstart: int = 0,
    colstart: int = 0,
) -> pd.DataFrame:
    """
    Returns a table of cell indices (row, column) with feature ID, and
    feature area within cell. Essentially returns a COO sparse matrix, but
    with duplicate values per cell, since more than one geometry may be
    present.

    The feature area within the cell is approximated by first rasterizing the
    feature, and then counting the number of occurring cells. This means the
    accuracy of the area depends on the resolution of the rasterization step.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    resolution : float
        cellsize at which the rasterization, and determination of area within
        cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
    like : xarray.DataArray
        Example DataArray of where the cells will be located. Used only for
        the coordinates.
    dtype : numpy.dtype
        datatype of data referred to with "column".

    Returns
    -------
    cell_table : pandas.DataFrame
    """
    # Avoid side-effects
    like = like.copy(deep=False)
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized = gdal_rasterize(
        path, column, nodata=nodata, dtype=dtype, spatial_reference=spatial_reference
    )

    # Make sure the coordinates are increasing.
    dims = ("y", "x")
    rasterized, _ = common._increasing_dims(rasterized, dims)
    like, flip_dst = common._increasing_dims(like, dims)

    dst_coords = [imod.prepare.common._coord(like, dim) for dim in ("y", "x")]
    src_coords = [imod.prepare.common._coord(rasterized, dim) for dim in ("y", "x")]
    # Determine weights for every regrid dimension, and alloc_len,
    # the maximum number of src cells that may end up in a single dst cell
    inds_weights = []
    alloc_len = 1
    for src_x, dst_x in zip(src_coords, dst_coords):
        size, i_w = imod.prepare.common._weights_1d(src_x, dst_x)
        for elem in i_w:
            inds_weights.append(elem)
        alloc_len *= size

    # Pre-allocate work arrays
    values = np.full(alloc_len, 0, dtype=dtype)
    frequencies = np.full(alloc_len, 0, dtype=dtype)

    rows, cols, values, counts = _cell_count(
        rasterized.values, values, frequencies, nodata, *inds_weights
    )

    if "y" in flip_dst:
        rows = (like["y"].size - 1) - rows
    if "x" in flip_dst:
        cols = (like["x"].size - 1) - cols

    df = pd.DataFrame()
    df["row_index"] = rows + rowstart
    df["col_index"] = cols + colstart
    df[column] = values
    df["area"] = counts * (dx * dx)

    return df


def _create_chunks(like, resolution, chunksize):
    """
    Cuts data into chunksize by chunksize.

    Parameters
    ----------
    like : xarray.DataArray
    resolution : float
    chunksize : int

    Returns
    -------
    chunks : list of xr.DataArray
    """
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    # Compute how many rows and columns are necessary for fine resolution
    nrow = int((ymax - ymin) / resolution)
    ncol = int((xmax - xmin) / resolution)
    # Find out where to cut
    x_starts = np.arange(0, ncol, chunksize) * resolution + xmin
    y_starts = np.arange(0, nrow, chunksize) * resolution + ymin
    # Searchsorted assumes the arrays are pre-sorted.
    x = np.sort(like.coords["x"].values)
    y = np.sort(like.coords["y"].values)
    # Get the matching indices of like.
    ix_starts = list(np.searchsorted(x, x_starts))
    iy_starts = list(np.searchsorted(y, y_starts))
    # Append None. In python's slice object, None denotes "slice including
    # first/last element"
    ix_ends = ix_starts[1:] + [None]
    iy_ends = iy_starts[1:] + [None]
    # Use xarray to grab the chunks. The chunks have x and y coordinates.
    # These will inform GDAL on which part to rasterize.
    # GDAL will only rasterize within the boundaries of the chunks, so there's
    # no need to clip the shapefile beforehand.
    chunks = []
    rowstarts = []
    colstarts = []
    for j0, j1 in zip(iy_starts, iy_ends):
        for i0, i1 in zip(ix_starts, ix_ends):
            chunks.append(like.isel(y=slice(j0, j1), x=slice(i0, i1)))
            rowstarts.append(j0)
            colstarts.append(i0)
    return chunks, rowstarts, colstarts


def celltable(
    path: str | pathlib.Path,
    column: str,
    resolution: int,
    like: xr.DataArray,
    dtype: DTypeLike = np.int32,
    chunksize: int = 10_000,
) -> pd.DataFrame:
    r"""
    Process area of features by rasterizing in a chunkwise manner to limit
    memory usage.

    Returns a table of cell indices (row, column) with for example feature
    ID, and feature area within cell. Essentially returns a COO sparse
    matrix, but with duplicate values per cell, since more than one geometry
    may be present.

    The feature area within the cell is approximated by first rasterizing the
    feature, and then counting the number of occurring cells. This means the
    accuracy of the area depends on the cellsize of the rasterization step.

    A celltable is returned, as a ``pandas.DataFrame``. It has the following
    columns:

    1. ``"row_index"``
    2. ``"col_index"``
    3. the value of the ``column`` argument
    4. ``"area"``

    ``"row_index"`` and ``"col_index"`` are the indices of the like array in
    which the polygon is located. The ``column`` value holds the rasterized
    value of the specified column. ``"area"`` contains the area of the
    polygon within the cell.

    The most convenient way of using this celltable is by specifying a
    feature ID as ``column``. After creating a celltable,
    ``pandas.DataFrame.merge()`` can be used to join additional data on this
    ID. Refer to the examples.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    resolution : float
        cellsize at which the rasterization, and determination of area within
        cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
    like : xarray.DataArray
        Example DataArray of where the cells will be located. Used only for
        the coordinates.
    dtype : DTypeLike, optional
        datatype of data referred to with "column", defaults to 32-bit
        integer.
    chunksize : int, optional
        The size of the chunksize. Used for both x and y dimension.

    Returns
    -------
    celltable : pandas.DataFrame

    Examples
    --------
    Assume we have a shapefile called ``waterways.shp`` and information on
    the model discretization is described by a ``like`` DataArray. The
    feature ID is provided by a column in the shapefile called "ID-code".
    Additionally, this shapefile also specifies bed hydraulic resistance
    (c0). For this specific discretization, we wish to calculate a
    conductance (area divided by hydraulic resistance). To do so, we:

    1. create a ``celltable``
    2. join the additional attributes (such as c0)
    3. compute the conductance per feature
    4. sum conductances per cell

    Import the required packages.

    >>> import imod
    >>> import geopandas as gpd

    Generate the celltable.

    >>> celltable = imod.prepare.celltable(
            path="waterways.shp",
            column="ID-code",
            resolution=0.5,
            like=like,
        )

    Load the shapefile with geopandas into a ``GeoDataFrame``.

    >>> gdf = gpd.read_file("waterways.shp")

    Select the relevant columns into a ``pandas.DataFrame`` and merge with
    the celltable.

    >>> df = gdf[["ID-code", "c0"]]
    >>> joined = celltable.merge(df, on="ID-code")

    We compute the conductance, and sum it per cell using ``pandas`` methods:

    >>> joined["conductance"] = joined["area"] / joined["c0"]
    >>> summed_conductance = joined.groupby(["row_index", "col_index"], as_index=False)[
            "conductance"
        ].sum()

    Finally, turn the result into a DataArray so it can be used as model
    input:

    >>> conductance = imod.prepare.rasterize_celltable(
            table=summed_conductance,
            column="conductance",
            like=like,
        )

    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    like_chunks, rowstarts, colstarts = _create_chunks(like, resolution, chunksize)
    collection = [
        dask.delayed(_celltable)(
            path, column, resolution, chunk, dtype, rowstart, colstart
        )
        for chunk, rowstart, colstart in zip(like_chunks, rowstarts, colstarts)
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)


@numba.njit
def _burn_cells(raster, rows, cols, values):
    """
    Burn values of sparse COO-matrix into raster.
    rows, cols, and values form a sparse matrix in coordinate format (COO)
    (also known as "ijv" or "triplet" format).

    Parameters
    ----------
    raster : np.array
        raster to burn values into.
    rows : np.array of integers
        row indices (i)
    cols : np.array of integers
        column indices (j)
    values : np.array of floats
        values to burn (v)
    """
    for i, j, v in zip(rows, cols, values):
        raster[i, j] = v
    return raster


def rasterize_celltable(table, column, like):
    """
    Rasterizes a table, such as produced by
    ``imod.prepare.spatial.celltable``. Before rasterization, multiple values
    should be grouped and aggregated per cell. Values will be overwritten
    otherwise.

    Parameters
    ----------
    like : xr.DataArray
    table : pandas.DataFrame
        with columns: "row_index", "col_index"
    column : str, int, float
        column name of values to rasterize

    Returns
    -------
    rasterized : xr.DataArray
    """
    rows = table["row_index"].values
    cols = table["col_index"].values
    area = table[column].values
    dst = like.copy()
    dst.values = _burn_cells(dst.values, rows, cols, area)
    return dst


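# Usage sketch (column names taken from the celltable docstring example):
# aggregate duplicate entries per cell before rasterizing, since overlapping
# values would otherwise overwrite each other.
#
#     summed = table.groupby(["row_index", "col_index"], as_index=False)["conductance"].sum()
#     conductance_grid = rasterize_celltable(summed, "conductance", like)
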
def _zonal_aggregate_raster(
    path: Union[str, pathlib.Path],
    column: str,
    resolution: float,
    raster: xr.DataArray,
    method: Union[str, Callable],
) -> pd.DataFrame:
    """
    * Rasterize the polygon at given `resolution`
    * Sample the raster at the rasterized polygon cells
    * Store both in a dataframe, groupby and aggregate according to `method`
    """
    data_dx, xmin, xmax, data_dy, ymin, ymax = imod.util.spatial.spatial_reference(
        raster
    )
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized = gdal_rasterize(
        path,
        column,
        nodata=nodata,
        dtype=np.int32,
        spatial_reference=spatial_reference,
        all_touched=True,
    )

    ravelled = rasterized.values.ravel()
    y = np.arange(ymax + 0.5 * dy, ymin, dy)
    x = np.arange(xmin + 0.5 * dx, xmax, dx)

    is_data = ravelled != nodata
    feature_id = ravelled[is_data]
    yy, xx = [a.ravel()[is_data] for a in np.meshgrid(y, x, indexing="ij")]

    dims = ("y", "x")
    rasterized, _ = common._increasing_dims(rasterized, dims)
    raster, _ = common._increasing_dims(raster, dims)

    y_ind = ((yy - ymin) / abs(data_dy)).astype(int)
    x_ind = ((xx - xmin) / abs(data_dx)).astype(int)
    sample = raster.values[y_ind, x_ind]

    df = pd.DataFrame({column: feature_id, "data": sample})
    # Remove entries where the raster has nodata.
    # This may result in areas significantly smaller than the polygon
    # geometry, but should come in handy for weighting later?
    df = df[df["data"].notnull()]
    result = df.groupby(column, as_index=False).agg(["count", method])
    # Reset index to set "column" as column again, make sure index is dropped
    result = result.reset_index(drop=True)
    # Compute the area from the counted number of cells
    result["data", "count"] *= resolution * resolution
    name = raster.name if raster.name else "aggregated"
    result.columns = [column, "area", name]
    return result


def _zonal_aggregate_polygons(
    path_a: Union[str, pathlib.Path],
    path_b: Union[str, pathlib.Path],
    column_a: str,
    column_b: str,
    resolution: float,
    like: xr.DataArray,
    method: Union[str, Callable],
) -> pd.DataFrame:
    """
    * Rasterize a, rasterize b for the same domain
    * Store both in a dataframe, groupby and aggregate according to `method`
    """
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized_a = gdal_rasterize(
        path_a,
        column_a,
        nodata=nodata,
        dtype=np.int32,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    rasterized_b = gdal_rasterize(
        path_b,
        column_b,
        nodata=np.nan,
        dtype=np.float64,
        spatial_reference=spatial_reference,
        all_touched=True,
    )

    is_data = ((rasterized_a != nodata) & (rasterized_b.notnull())).values
    a = rasterized_a.values[is_data].ravel()
    b = rasterized_b.values[is_data].ravel()

    df = pd.DataFrame({column_a: a, column_b: b})
    # Remove entries where the raster has nodata.
    # This may result in areas significantly smaller than the polygon
    # geometry, but should come in handy for weighting later?
    result = df.groupby(column_a, as_index=False).agg(["count", method])
    # Reset index to set "column_a" as column again, make sure index is dropped
    result = result.reset_index(drop=True)
    # Compute the area from the counted number of cells
    result[column_b, "count"] *= resolution * resolution
    result.columns = [column_a, "area", column_b]
    return result


def zonal_aggregate_raster(
    path: Union[pathlib.Path, str],
    column: str,
    raster: xr.DataArray,
    resolution: float,
    method: Union[str, Callable],
    chunksize: int = 10_000,
) -> pd.DataFrame:
    """
    Compute a zonal aggregate of raster data for polygon geometries, e.g. a
    mean, mode, or percentile.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile). Defines zones
        of aggregation.
    column : str
        column name of path, integer IDs defining zones.
    raster : xarray.DataArray
        Raster data from which to sample and aggregate data
    resolution : float
        cellsize at which the rasterization of polygons and sampling occurs
    method : Union[str, Callable]
        Aggregation method.
        Anything that is acceptable by a pandas groupby aggregate:
        https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
    chunksize : int, optional
        The size of the chunksize. Used for both x and y dimension.

    Returns
    -------
    zonal_aggregates : pandas.DataFrame

    Examples
    --------
    To compute the mean surface level at polygons of water bodies:

    >>> import imod
    >>> surface_level = imod.rasterio.open("surface_level.tif")
    >>> df = imod.prepare.spatial.zonal_aggregate_raster(
    >>>    path="water-bodies.shp",
    >>>    column="id",
    >>>    raster=surface_level,
    >>>    resolution=1.0,
    >>>    method="mean",
    >>> )

    For some functions, like the mode, a function should be passed instead:

    >>> import pandas as pd
    >>> df = imod.prepare.spatial.zonal_aggregate_raster(
    >>>    path="water-bodies.shp",
    >>>    column="id",
    >>>    raster=surface_level,
    >>>    resolution=1.0,
    >>>    method=pd.Series.mode,
    >>> )
    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(raster)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    without_chunks = (raster.chunks is None) or (
        all(length == 1 for length in map(len, raster.chunks))
    )
    if without_chunks:
        raster = raster.compute()

    raster_chunks, _, _ = _create_chunks(raster, resolution, chunksize)
    collection = [
        dask.delayed(_zonal_aggregate_raster)(path, column, resolution, chunk, method)
        for chunk in raster_chunks
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)


def zonal_aggregate_polygons(
    path_a: Union[pathlib.Path, str],
    path_b: Union[pathlib.Path, str],
    column_a: str,
    column_b: str,
    like: xr.DataArray,
    resolution: float,
    method: Union[str, Callable],
    chunksize: int = 10_000,
) -> pd.DataFrame:
    """
    Compute a zonal aggregate of polygon data for (other) polygon geometries,
    e.g. a mean, mode, or percentile.

    Parameters
    ----------
    path_a : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    path_b : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column_a : str
        column name of path_a. Defines zones of aggregation.
    column_b : str
        column name of path_b. Data to aggregate.
    like : xarray.DataArray with dims ("y", "x")
        Example DataArray of where the cells will be located. Used only for
        its x and y coordinates.
    resolution : float
        cellsize at which the rasterization, sampling, and area measurement
        occur.
    method : Union[str, Callable]
        Aggregation method.
        Anything that is acceptable by a pandas groupby aggregate:
        https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
    chunksize : int, optional
        The size of the chunksize. Used for both x and y dimension.

    Returns
    -------
    zonal_aggregates : pandas.DataFrame
    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    like_chunks, _, _ = _create_chunks(like, resolution, chunksize)
    collection = [
        dask.delayed(_zonal_aggregate_polygons)(
            path_a, path_b, column_a, column_b, resolution, chunk, method
        )
        for chunk in like_chunks
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)


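# Usage sketch (hypothetical shapefiles and column names): aggregate the
# "stage" attribute of one polygon layer over the integer "zone_id" zones of
# another, on the grid of a template DataArray.
#
#     df = zonal_aggregate_polygons(
#         path_a="zones.shp",
#         path_b="water_levels.shp",
#         column_a="zone_id",
#         column_b="stage",
#         like=template,
#         resolution=1.0,
#         method="mean",
#     )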