Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions workflow/internal/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ $defs:
default: 1
minimum: 0

gdalSettings:
description: Optional GDAL runtime configuration.
type: object
additionalProperties: false
properties:
geojson_max_obj_size_mb:
description: >
Maximum accepted size in megabytes for a single GeoJSON object read by
GDAL. Set to 0 to disable GDAL's single-object limit.
type: integer
default: 1024
minimum: 0

scenario:
type: object
required:
Expand Down Expand Up @@ -222,6 +235,8 @@ properties:
$ref: "#/$defs/crsSettings"
voronoi_eez:
$ref: "#/$defs/partialVoronoiEEZSettings"
gdal:
$ref: "#/$defs/gdalSettings"
overture_release:
description: |
Overture data release to use. Two options are possible:
Expand Down
2 changes: 2 additions & 0 deletions workflow/internal/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ timeouts:
connect_seconds: 15
read_seconds: 30
initial_retry_seconds: 5
gdal:
geojson_max_obj_size_mb: 1024
overture_release: "latest"
6 changes: 6 additions & 0 deletions workflow/rules/_utils.smk
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ def get_voronoi_eez_config(scenario: str) -> dict:
return internal["voronoi_eez"] | user_overrides


def get_gdal_config() -> dict:
"""Get GDAL configuration with user overrides."""
gdal = internal["gdal"] | config.get("gdal", {})
return gdal


def get_eez_file(scenario: str, country: str) -> str:
"""Build an EEZ filename reusable by scenarios with matching EEZ requests."""
extra_eez = config["scenarios"][scenario]["countries"][country].get("extra_eez", [])
Expand Down
2 changes: 2 additions & 0 deletions workflow/rules/download.smk
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rule download_geoboundaries:
"../envs/shape.yaml"
params:
timeouts=internal["timeouts"],
geojson_max_obj_size_mb=get_gdal_config()["geojson_max_obj_size_mb"],
message:
"Downloading '{wildcards.country}_{wildcards.subtype}_{wildcards.release_type}' dataset from geoBoundaries."
script:
Expand All @@ -42,6 +43,7 @@ rule download_gadm:
"../envs/shape.yaml"
params:
timeouts=internal["timeouts"],
geojson_max_obj_size_mb=get_gdal_config()["geojson_max_obj_size_mb"],
message:
"Download '{wildcards.country}_{wildcards.subtype}' dataset from GADM."
script:
Expand Down
32 changes: 31 additions & 1 deletion workflow/scripts/_geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import geopandas as gpd
from pyproj import CRS
from rasterio import warp
from shapely import MultiPolygon, Polygon
from shapely import MultiPolygon, Polygon, make_valid
from shapely.geometry import GeometryCollection, mapping, shape
from shapely.geometry.base import BaseGeometry

Expand Down Expand Up @@ -48,6 +48,36 @@ def extract_polygonal_geometry(
return result


def make_geometry_valid(geom: BaseGeometry | None) -> Polygon | MultiPolygon | None:
"""Return a valid polygonal geometry, dropping empty or non-polygonal parts."""
result = None

if geom is not None and not geom.is_empty:
if isinstance(geom, (Polygon, MultiPolygon)) and geom.is_valid:
result = geom
else:
polygonal = extract_polygonal_geometry(make_valid(geom))
if (
polygonal is not None
and not polygonal.is_empty
and not polygonal.is_valid
):
polygonal = extract_polygonal_geometry(make_valid(polygonal))

if polygonal is not None and not polygonal.is_empty and polygonal.is_valid:
result = polygonal

return result


def make_geometries_valid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
"""Apply polygonal geometry repair to a GeoDataFrame and drop empty results."""
result = gdf.copy()
result["geometry"] = result["geometry"].map(make_geometry_valid)
result = result.loc[result["geometry"].notna()]
return result


def _rasterio_to_crs(gdf: gpd.GeoDataFrame, to_crs: CRS) -> gpd.GeoDataFrame:
"""CRS conversion using rasterio's more powerful toolset.

Expand Down
11 changes: 2 additions & 9 deletions workflow/scripts/_schemas.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Reusable schemas."""

import geopandas as gpd
from _geo import extract_polygonal_geometry
from _geo import make_geometries_valid
from pandera import pandas as pa
from pandera.typing.geopandas import GeoSeries
from pandera.typing.pandas import Series
Expand Down Expand Up @@ -37,14 +37,7 @@ class Config:
@pa.dataframe_parser
def fix_geometries(cls, gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: # type: ignore[misc]
"""Attempt to correct empty, malformed, or non-polygonal geometries."""
mask = gdf["geometry"].notna() & ~gdf["geometry"].is_empty
gdf = gdf.loc[mask].copy()

invalid = ~gdf.geometry.is_valid
gdf.loc[invalid, "geometry"] = gdf.loc[invalid, "geometry"].make_valid()

gdf["geometry"] = gdf["geometry"].apply(extract_polygonal_geometry)
return gdf.loc[gdf["geometry"].notna()]
return make_geometries_valid(gdf)

@pa.check("geometry", element_wise=True)
def check_geometries(cls, geom) -> bool:
Expand Down
9 changes: 9 additions & 0 deletions workflow/scripts/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path

import geopandas as gpd
import pyogrio
import requests
from matplotlib import pyplot as plt
from matplotlib.axes import Axes
Expand Down Expand Up @@ -75,3 +76,11 @@ def download_file(url: str, path: Path, timeouts: DownloadTimeouts) -> None:
raise RuntimeError(
f"Maximum retries {max_retries} reached without a valid response"
)


def read_geojson_file(path: Path, geojson_max_obj_size_mb: int) -> gpd.GeoDataFrame:
"""Read a GeoJSON file with the configured GDAL object-size limit."""
pyogrio.set_gdal_config_options(
{"OGR_GEOJSON_MAX_OBJ_SIZE": str(geojson_max_obj_size_mb)}
)
return gpd.read_file(path)
8 changes: 3 additions & 5 deletions workflow/scripts/build_combined_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def remove_overlaps(gdf: gpd.GeoDataFrame, crs: dict[str, CRS]) -> gpd.GeoDataFr
from the later geometry.
"""
projected = _geo.to_projected_crs(gdf, crs["projected"])
projected = _geo.make_geometries_valid(projected)

left_idx, right_idx = projected.sindex.query(
projected.geometry, predicate="intersects"
Expand Down Expand Up @@ -51,13 +52,10 @@ def remove_overlaps(gdf: gpd.GeoDataFrame, crs: dict[str, CRS]) -> gpd.GeoDataFr
continue

geom = geom.difference(shapely.unary_union(cutters))
if geom is not None and not geom.is_empty and not geom.is_valid:
geom = shapely.make_valid(geom)

geoms[right] = geom
geoms[right] = _geo.make_geometry_valid(geom)

projected["geometry"] = geoms
projected = projected.loc[projected.geometry.notna() & ~projected.geometry.is_empty]
projected = projected.loc[projected.geometry.notna()]
result = _geo.to_geographic_crs(projected, crs["geographic"])

return result
Expand Down
23 changes: 14 additions & 9 deletions workflow/scripts/build_country.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import geopandas as gpd
import pandas as pd
from pyproj import CRS
from shapely import make_valid, voronoi_polygons
from shapely import voronoi_polygons
from shapely.geometry import (
GeometryCollection,
LineString,
Expand Down Expand Up @@ -233,7 +233,7 @@ def _split_one_maritime(
cells = gpd.GeoDataFrame(geometry=[], crs=crs)
else:
# Run Voronoi
maritime_geometry = make_valid(maritime_row.geometry)
maritime_geometry = maritime_row.geometry
cells = gpd.GeoDataFrame(
geometry=list(
voronoi_polygons(
Expand All @@ -243,7 +243,7 @@ def _split_one_maritime(
),
crs=crs,
)
cells["geometry"] = cells.geometry.map(make_valid)
cells = _geo.make_geometries_valid(cells)

# Identify where each voronoi cell belongs
cells = gpd.sjoin(
Expand All @@ -259,15 +259,18 @@ def _split_one_maritime(

# Keep only cells on the maritime area and test for completeness
cells["geometry"] = cells.geometry.intersection(maritime_geometry)
cells = cells.loc[~cells.geometry.is_empty & (cells.geometry.area > 0)]
cells = _geo.make_geometries_valid(cells)
cells = cells.loc[cells.geometry.area > 0]
pieces = (
cells[["assigned_shape_id", "geometry"]]
.dissolve(by="assigned_shape_id", as_index=False)
.rename(columns={"assigned_shape_id": "_assigned_shape_id"})
)
# FIXME: check if rerunning `warp` is necessary to fix things near the antimeridian
pieces = _geo.to_projected_crs(pieces, crs)
uncovered_area = maritime_geometry.difference(pieces.geometry.union_all()).area
pieces = _geo.make_geometries_valid(pieces)
uncovered = _geo.make_geometry_valid(
maritime_geometry.difference(pieces.geometry.union_all())
)
uncovered_area = 0 if uncovered is None else uncovered.area
if uncovered_area > voronoi_config.uncovered_area_tolerance:
raise RuntimeError(
f"Voronoi split did not fully cover maritime shape {maritime_row.shape_id!r}. "
Expand Down Expand Up @@ -297,6 +300,7 @@ def split_maritime_by_shoreline_voronoi(
return shapes, cells

shapes_proj = _geo.to_projected_crs(shapes, crs["projected"])
shapes_proj = _geo.make_geometries_valid(shapes_proj)
land = shapes_proj.loc[shapes_proj["shape_class"].eq("land")].copy()
maritime = shapes_proj.loc[shapes_proj["shape_class"].eq("maritime")].copy()
if maritime.empty:
Expand Down Expand Up @@ -326,7 +330,6 @@ def split_maritime_by_shoreline_voronoi(
geometry="geometry",
crs=crs["projected"],
)
result.geometry = result.geometry.make_valid()
result = _geo.to_geographic_crs(result, crs["geographic"])
return result, cells

Expand All @@ -341,14 +344,16 @@ def build_country(
# Reproject
p_land = _geo.to_projected_crs(land, crs["projected"])
p_marine = _geo.to_projected_crs(maritime, crs["projected"])
p_land = _geo.make_geometries_valid(p_land)
p_marine = _geo.make_geometries_valid(p_marine)
# Remove contested zones
p_marine = p_marine[p_marine["contested"].eq(False)].drop(columns="contested")
# Give priority to EEZs
p_land.geometry = p_land.geometry.difference(p_marine.geometry.union_all())
p_land = _geo.make_geometries_valid(p_land)
combined = gpd.GeoDataFrame(
pd.concat([p_land, p_marine], ignore_index=True), crs=crs["projected"]
)
combined.geometry = combined.geometry.make_valid()
return _geo.to_geographic_crs(combined, crs["geographic"])


Expand Down
11 changes: 7 additions & 4 deletions workflow/scripts/download_gadm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import TYPE_CHECKING, Any

import geopandas as gpd
from _utils import DownloadTimeouts, download_file
from _utils import DownloadTimeouts, download_file, read_geojson_file

if TYPE_CHECKING:
snakemake: Any
Expand All @@ -23,7 +23,7 @@


def download_country_gadm(
country: str, subtype: str, timeouts: DownloadTimeouts
country: str, subtype: str, timeouts: DownloadTimeouts, geojson_max_obj_size_mb: int
) -> gpd.GeoDataFrame:
"""Attempts to download country GADM data in .json or zipped json."""
last_error: Exception | None = None
Expand All @@ -35,7 +35,7 @@ def download_country_gadm(
tmp_path = Path(tmp_dir) / f"download.json{zip_ext}"

download_file(url, tmp_path, timeouts)
gdf = gpd.read_file(tmp_path)
gdf = read_geojson_file(tmp_path, geojson_max_obj_size_mb)
if gdf.empty:
raise RuntimeError(f"Downloaded empty GADM file from {url!r}.")
return gdf.to_crs(GADM_CRS)
Expand All @@ -51,7 +51,10 @@ def main():
"""Main snakemake process."""
timeouts = DownloadTimeouts(**snakemake.params.timeouts)
country = download_country_gadm(
snakemake.wildcards.country, snakemake.wildcards.subtype, timeouts
snakemake.wildcards.country,
snakemake.wildcards.subtype,
timeouts,
snakemake.params.geojson_max_obj_size_mb,
)
country.to_parquet(snakemake.output.path)

Expand Down
11 changes: 8 additions & 3 deletions workflow/scripts/download_geoboundaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import TYPE_CHECKING, Any

import geopandas as gpd
from _utils import DownloadTimeouts, download_file
from _utils import DownloadTimeouts, download_file, read_geojson_file

if TYPE_CHECKING:
snakemake: Any
Expand All @@ -23,7 +23,11 @@


def download_country_geoboundaries(
country: str, subtype: str, release_type: str, timeouts: DownloadTimeouts
country: str,
subtype: str,
release_type: str,
timeouts: DownloadTimeouts,
geojson_max_obj_size_mb: int,
) -> gpd.GeoDataFrame:
"""Download country data from geoBoundaries.

Expand All @@ -46,7 +50,7 @@ def download_country_geoboundaries(
geojson_path = tmp_path / "download.geojson"
download_file(geojson_url, geojson_path, timeouts)

gdf = gpd.read_file(geojson_path)
gdf = read_geojson_file(geojson_path, geojson_max_obj_size_mb)
if gdf.empty:
raise RuntimeError(
f"Downloaded empty geoBoundaries file from {geojson_url!r}."
Expand All @@ -64,6 +68,7 @@ def main():
snakemake.wildcards.subtype,
snakemake.wildcards.release_type,
timeouts,
snakemake.params.geojson_max_obj_size_mb,
)
country.to_parquet(snakemake.output.path)

Expand Down
Loading