Source code for cf_xarray.geometry

from __future__ import annotations

from collections.abc import Sequence

import numpy as np
import pandas as pd
import xarray as xr


def reshape_unique_geometries(
    ds: xr.Dataset,
    geom_var: str = "geometry",
    new_dim: str = "features",
) -> xr.Dataset:
    """Reshape a dataset containing a geometry variable so that all unique features are
    identified along a new dimension.

    This function only makes sense if the dimension of the geometry variable has no coordinate,
    or if that coordinate has repeated values for each geometry.

    Parameters
    ----------
    ds : xr.Dataset
        A Dataset.
    geom_var : string
        Name of the variable in `ds` that contains the geometry objects of type shapely.geometry.
        The variable must be 1D.
    new_dim : string
        Name of the new dimension in the returned object.

    Returns
    -------
    Dataset
        All variables sharing the dimension of `ds[geom_var]` are reshaped so that `new_dim`
        as a length equal to the number of unique geometries.
    """
    if ds[geom_var].ndim > 1:
        raise ValueError(
            f"The geometry variable must be 1D. Got ds[{geom_var}] with dims {ds[geom_var].dims}."
        )

    # Shapely objects are not hashable, thus np.unique cannot be used directly.
    # This trick is stolen from geopandas.
    _, unique_indexes, inv_indexes = np.unique(
        [g.wkb for g in ds[geom_var].values], return_index=True, return_inverse=True
    )
    old_name = ds[geom_var].dims[0]

    if old_name in ds.coords:
        old_values = ds[old_name].values
    else:
        # A dummy coord, a kind of counter, independent for each unique geometries
        old_values = np.array(
            [(inv_indexes[:i] == ind).sum() for i, ind in enumerate(inv_indexes)]
        )

    multi_index = pd.MultiIndex.from_arrays(
        (inv_indexes, old_values), names=(new_dim, old_name)
    )
    temp_name = "__temp_multi_index__"
    out = ds.rename({old_name: temp_name})
    out[temp_name] = multi_index
    out = out.unstack(temp_name)

    # geom_var was reshaped also, reconstruct it from the unique values.
    unique_indexes = xr.DataArray(unique_indexes, dims=(new_dim,))
    out[geom_var] = ds[geom_var].isel({old_name: unique_indexes})
    if old_name not in ds.coords:
        # If there was no coord before, drop the dummy one we made.
        out = out.drop_vars(old_name)
    return out


[docs] def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None = None): """ Convert a DataArray with shapely geometry objects into a CF-compliant dataset. Parameters ---------- geometries : sequence of shapely geometries or xarray.DataArray A sequence of geometry objects or a Dataset with a "geometry" variable storing such geometries. All geometries must be of the same base type : Point, Line or Polygon, but multipart geometries are accepted. grid_mapping : str, optional A CF grid mapping name. When given, coordinates and attributes are named and set accordingly. Defaults to None, in which case the coordinates are simply names "crd_x" and "crd_y". .. warning:: Only the `longitude_latitude` grid mapping is currently implemented. Returns ------- xr.Dataset A dataset with shapely geometry objects translated into CF-compliant variables : - 'x', 'y' : the node coordinates - 'crd_x', 'crd_y' : the feature coordinates (might have different names if `grid_mapping` is available). - 'node_count' : The number of nodes per feature. Absent if all instances are Points. - 'geometry_container' : Empty variable with attributes describing the geometry type. - Other variables are not implemented as only Points are currently understood. References ---------- Please refer to the CF conventions document: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#geometries """ # Get all types to call the appropriate translation function. types = { geom.item().geom_type if isinstance(geom, xr.DataArray) else geom.geom_type for geom in geometries } if types.issubset({"Point", "MultiPoint"}): ds = points_to_cf(geometries) elif types.issubset({"LineString", "MultiLineString"}): ds = lines_to_cf(geometries) elif types.issubset({"Polygon", "MultiPolygon"}): ds = polygons_to_cf(geometries) else: raise ValueError( f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" ) # Special treatment of selected grid mappings if grid_mapping == "longitude_latitude": # Special case for longitude_latitude grid mapping ds = ds.rename(crd_x="lon", crd_y="lat") ds.lon.attrs.update(units="degrees_east", standard_name="longitude") ds.lat.attrs.update(units="degrees_north", standard_name="latitude") ds.geometry_container.attrs.update(coordinates="lon lat") ds.x.attrs.update(units="degrees_east", standard_name="longitude") ds.y.attrs.update(units="degrees_north", standard_name="latitude") elif grid_mapping is not None: raise NotImplementedError( f"Only grid mapping longitude_latitude is implemented. Got {grid_mapping}." ) return ds
[docs] def cf_to_shapely(ds: xr.Dataset): """ Convert geometries stored in a CF-compliant way to shapely objects stored in a single variable. Parameters ---------- ds : xr.Dataset Must contain a ``geometry_container`` variable with attributes giving the geometry specifications. Must contain all variables needed to reconstruct the geometries listed in these specifications. Returns ------- da: xr.DataArray A 1D DataArray of shapely objects. It has the same dimension as the ``node_count`` or the coordinates variables, or ``features`` if those were not present in ``ds``. References ---------- Please refer to the CF conventions document: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#geometries """ geom_type = ds.geometry_container.attrs["geometry_type"] if geom_type == "point": geometries = cf_to_points(ds) elif geom_type == "line": geometries = cf_to_lines(ds) elif geom_type == "polygon": geometries = cf_to_polygons(ds) else: raise ValueError( f"Valid CF geometry types are 'point', 'line' and 'polygon'. Got {geom_type}" ) return geometries.rename("geometry")
def points_to_cf(pts: xr.DataArray | Sequence): """Get a list of points (shapely.geometry.[Multi]Point) and return a CF-compliant geometry dataset. Parameters ---------- pts : sequence of shapely.geometry.Point or MultiPoint The sequence of [multi]points to translate to a CF dataset. Returns ------- xr.Dataset A Dataset with variables 'x', 'y', 'crd_x', 'crd_y', 'node_count' and 'geometry_container'. The coordinates of MultiPoint instances are their first point. """ from shapely.geometry import MultiPoint if isinstance(pts, xr.DataArray): dim = pts.dims[0] coord = pts[dim] if dim in pts.coords else None pts_ = pts.values.tolist() else: dim = "features" coord = None pts_ = pts x, y, node_count, crdX, crdY = [], [], [], [], [] for pt in pts_: if isinstance(pt, MultiPoint): xy = np.concatenate([p.coords for p in pt.geoms]) else: xy = np.atleast_2d(pt.coords) x.extend(xy[:, 0]) y.extend(xy[:, 1]) node_count.append(xy.shape[0]) crdX.append(xy[0, 0]) crdY.append(xy[0, 1]) ds = xr.Dataset( data_vars={ "node_count": xr.DataArray(node_count, dims=(dim,)), "geometry_container": xr.DataArray( attrs={ "geometry_type": "point", "node_count": "node_count", "node_coordinates": "x y", "coordinates": "crd_x crd_y", } ), }, coords={ "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), }, ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiPoints if (ds.node_count == 1).all(): ds = ds.drop_vars("node_count") del ds.geometry_container.attrs["node_count"] return ds def cf_to_points(ds: xr.Dataset): """Convert point geometries stored in a CF-compliant way to shapely points stored in a single variable. Parameters ---------- ds : xr.Dataset A dataset with CF-compliant point geometries. Must have a "geometry_container" variable with at least a 'node_coordinates' attribute. Must also have the two 1D variables listed by this attribute. Returns ------- geometry : xr.DataArray A 1D array of shapely.geometry.[Multi]Point objects. It has the same dimension as the ``node_count`` or the coordinates variables, or ``'features'`` if those were not present in ``ds``. """ from shapely.geometry import MultiPoint, Point # Shorthand for convenience geo = ds.geometry_container.attrs # The features dimension name, defaults to the one of 'node_count' or the dimension of the coordinates, if present. feat_dim = None if "coordinates" in geo and feat_dim is None: xcoord_name, _ = geo["coordinates"].split(" ") (feat_dim,) = ds[xcoord_name].dims x_name, y_name = ds.geometry_container.attrs["node_coordinates"].split(" ") xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1) node_count_name = ds.geometry_container.attrs.get("node_count") if node_count_name is None: # No node_count means all geometries are single points (node_count = 1) # And if we had no coordinates, then the dimension defaults to "features" feat_dim = feat_dim or "features" node_count = xr.DataArray([1] * xy.shape[0], dims=(feat_dim,)) if feat_dim in ds.coords: node_count = node_count.assign_coords({feat_dim: ds[feat_dim]}) else: node_count = ds[node_count_name] j = 0 # The index of the first node. geoms = np.empty(node_count.shape, dtype=object) # i is the feature index, n its number of nodes for i, n in enumerate(node_count.values): if n == 1: geoms[i] = Point(xy[j, :]) else: geoms[i] = MultiPoint(xy[j : j + n, :]) j += n return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) def lines_to_cf(lines: xr.DataArray | Sequence): """Convert an iterable of lines (shapely.geometry.[Multi]Line) into a CF-compliant geometry dataset. Parameters ---------- lines : sequence of shapely.geometry.Line or MultiLine The sequence of [multi]lines to translate to a CF dataset. Returns ------- xr.Dataset A Dataset with variables 'x', 'y', 'crd_x', 'crd_y', 'node_count' and 'geometry_container' and optionally 'part_node_count'. """ from shapely import to_ragged_array if isinstance(lines, xr.DataArray): dim = lines.dims[0] coord = lines[dim] if dim in lines.coords else None lines_ = lines.values else: dim = "index" coord = None lines_ = np.array(lines) _, arr, offsets = to_ragged_array(lines_) x = arr[:, 0] y = arr[:, 1] part_node_count = np.diff(offsets[0]) if len(offsets) == 1: indices = offsets[0] node_count = part_node_count else: indices = np.take(offsets[0], offsets[1]) node_count = np.diff(indices) geom_coords = arr.take(indices[:-1], 0) crdX = geom_coords[:, 0] crdY = geom_coords[:, 1] ds = xr.Dataset( data_vars={ "node_count": xr.DataArray(node_count, dims=(dim,)), "part_node_count": xr.DataArray(part_node_count, dims=("part",)), "geometry_container": xr.DataArray( attrs={ "geometry_type": "line", "node_count": "node_count", "part_node_count": "part_node_count", "node_coordinates": "x y", "coordinates": "crd_x crd_y", } ), }, coords={ "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), }, ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiLines if len(ds.part_node_count) == len(ds.node_count): ds = ds.drop_vars("part_node_count") del ds.geometry_container.attrs["part_node_count"] return ds def cf_to_lines(ds: xr.Dataset): """Convert line geometries stored in a CF-compliant way to shapely lines stored in a single variable. Parameters ---------- ds : xr.Dataset A dataset with CF-compliant line geometries. Must have a "geometry_container" variable with at least a 'node_coordinates' attribute. Must also have the two 1D variables listed by this attribute. Returns ------- geometry : xr.DataArray A 1D array of shapely.geometry.[Multi]Line objects. It has the same dimension as the ``part_node_count`` or the coordinates variables, or ``'features'`` if those were not present in ``ds``. """ from shapely import GeometryType, from_ragged_array # Shorthand for convenience geo = ds.geometry_container.attrs # The features dimension name, defaults to the one of 'node_count' # or the dimension of the coordinates, if present. feat_dim = None if "coordinates" in geo: xcoord_name, _ = geo["coordinates"].split(" ") (feat_dim,) = ds[xcoord_name].dims x_name, y_name = geo["node_coordinates"].split(" ") xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1) node_count_name = geo.get("node_count") part_node_count_name = geo.get("part_node_count", node_count_name) if node_count_name is None: raise ValueError("'node_count' must be provided for line geometries") else: node_count = ds[node_count_name] feat_dim = feat_dim or "index" if feat_dim in ds.coords: node_count = node_count.assign_coords({feat_dim: ds[feat_dim]}) # first get geometries for all the parts part_node_count = ds[part_node_count_name] offset1 = np.insert(np.cumsum(part_node_count.values), 0, 0) lines = from_ragged_array(GeometryType.LINESTRING, xy, offsets=(offset1,)) # get index of offset2 values that are edges for part_node_count offset2 = np.nonzero(np.isin(offset1, np.insert(np.cumsum(node_count), 0, 0)))[0] multilines = from_ragged_array( GeometryType.MULTILINESTRING, xy, offsets=(offset1, offset2) ) # get items from lines or multilines depending on number of parts geoms = np.where(np.diff(offset2) == 1, lines[offset2[:-1]], multilines) return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) def polygons_to_cf(polygons: xr.DataArray | Sequence): """Convert an iterable of polygons (shapely.geometry.[Multi]Polygon) into a CF-compliant geometry dataset. Parameters ---------- polygons : sequence of shapely.geometry.Polygon or MultiPolygon The sequence of [multi]polygons to translate to a CF dataset. Returns ------- xr.Dataset A Dataset with variables 'x', 'y', 'crd_x', 'crd_y', 'node_count' and 'geometry_container' and optionally 'part_node_count'. """ from shapely import to_ragged_array if isinstance(polygons, xr.DataArray): dim = polygons.dims[0] coord = polygons[dim] if dim in polygons.coords else None polygons_ = polygons.values else: dim = "index" coord = None polygons_ = np.array(polygons) _, arr, offsets = to_ragged_array(polygons_) x = arr[:, 0] y = arr[:, 1] part_node_count = np.diff(offsets[0]) if len(offsets) == 1: indices = offsets[0] node_count = part_node_count elif len(offsets) >= 2: indices = np.take(offsets[0], offsets[1]) interior_ring = np.isin(offsets[0], indices, invert=True)[:-1].astype(int) if len(offsets) == 3: indices = np.take(indices, offsets[2]) node_count = np.diff(indices) geom_coords = arr.take(indices[:-1], 0) crdX = geom_coords[:, 0] crdY = geom_coords[:, 1] ds = xr.Dataset( data_vars={ "node_count": xr.DataArray(node_count, dims=(dim,)), "interior_ring": xr.DataArray(interior_ring, dims=("part",)), "part_node_count": xr.DataArray(part_node_count, dims=("part",)), "geometry_container": xr.DataArray( attrs={ "geometry_type": "polygon", "node_count": "node_count", "part_node_count": "part_node_count", "interior_ring": "interior_ring", "node_coordinates": "x y", "coordinates": "crd_x crd_y", } ), }, coords={ "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), }, ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiPolygons and no holes if len(ds.part_node_count) == len(ds.node_count): ds = ds.drop_vars("part_node_count") del ds.geometry_container.attrs["part_node_count"] # Special case when we have no holes if (ds.interior_ring == 0).all(): ds = ds.drop_vars("interior_ring") del ds.geometry_container.attrs["interior_ring"] return ds def cf_to_polygons(ds: xr.Dataset): """Convert polygon geometries stored in a CF-compliant way to shapely polygons stored in a single variable. Parameters ---------- ds : xr.Dataset A dataset with CF-compliant polygon geometries. Must have a "geometry_container" variable with at least a 'node_coordinates' attribute. Must also have the two 1D variables listed by this attribute. Returns ------- geometry : xr.DataArray A 1D array of shapely.geometry.[Multi]Polygon objects. It has the same dimension as the ``part_node_count`` or the coordinates variables, or ``'features'`` if those were not present in ``ds``. """ from shapely import GeometryType, from_ragged_array # Shorthand for convenience geo = ds.geometry_container.attrs # The features dimension name, defaults to the one of 'part_node_count' # or the dimension of the coordinates, if present. feat_dim = None if "coordinates" in geo: xcoord_name, _ = geo["coordinates"].split(" ") (feat_dim,) = ds[xcoord_name].dims x_name, y_name = geo["node_coordinates"].split(" ") xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1) node_count_name = geo.get("node_count") part_node_count_name = geo.get("part_node_count", node_count_name) interior_ring_name = geo.get("interior_ring") if node_count_name is None: raise ValueError("'node_count' must be provided for polygon geometries") else: node_count = ds[node_count_name] feat_dim = feat_dim or "index" if feat_dim in ds.coords: node_count = node_count.assign_coords({feat_dim: ds[feat_dim]}) # first get geometries for all the rings part_node_count = ds[part_node_count_name] offset1 = np.insert(np.cumsum(part_node_count.values), 0, 0) if interior_ring_name is None: offset2 = np.array(list(range(len(offset1)))) else: interior_ring = ds[interior_ring_name] if not interior_ring[0] == 0: raise ValueError("coordinate array must start with an exterior ring") offset2 = np.append(np.where(interior_ring == 0)[0], [len(part_node_count)]) polygons = from_ragged_array(GeometryType.POLYGON, xy, offsets=(offset1, offset2)) # get index of offset2 values that are edges for node_count offset3 = np.nonzero( np.isin( offset2, np.nonzero(np.isin(offset1, np.insert(np.cumsum(node_count), 0, 0)))[0], ) )[0] multipolygons = from_ragged_array( GeometryType.MULTIPOLYGON, xy, offsets=(offset1, offset2, offset3) ) # get items from polygons or multipolygons depending on number of parts geoms = np.where(np.diff(offset3) == 1, polygons[offset3[:-1]], multipolygons) return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords)