diff --git a/examples/databricks_pyspark.py b/examples/databricks_pyspark.py new file mode 100644 index 0000000..fa8967f --- /dev/null +++ b/examples/databricks_pyspark.py @@ -0,0 +1,67 @@ +""" +Databricks example: Load Open-Meteo .om files as a Spark DataFrame. + +This script demonstrates how to use the ``omfiles`` PySpark custom data source +to read spatial weather data from Open-Meteo's public S3 bucket and persist it +as a reusable Delta table. + +Prerequisites (install on the Databricks cluster): + %pip install omfiles[pyspark,grids] + +Requirements: + - Databricks Runtime 15.2 or later + - Python 3.10+ +""" + +# -- 0. Install (run once per cluster session) -------------------------------- +# %pip install omfiles[pyspark,grids] + +import datetime as dt + +# -- 1. Register the custom data source -------------------------------------- +from omfiles.pyspark import OmFileDataSource + +spark.dataSource.register(OmFileDataSource) # type: ignore[name-defined] # noqa: F821 + +# -- 2. Build the S3 URI for a recent spatial file ---------------------------- +MODEL_DOMAIN = "dwd_icon" +date_time = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=2) +S3_URI = ( + f"s3://openmeteo/data_spatial/{MODEL_DOMAIN}/{date_time.year}/" + f"{date_time.month:02}/{date_time.day:02}/0000Z/" + f"{date_time.strftime('%Y-%m-%d')}T0000.om" +) +print(f"Reading: {S3_URI}") + +# -- 3. Read selected weather variables into a Spark DataFrame ---------------- +df = ( + spark.read.format("om") # type: ignore[name-defined] # noqa: F821 + .option("path", S3_URI) + .option("variables", "temperature_2m,wind_speed_10m") + .option("s3_anon", "true") + .option("s3_block_size", "65536") + .option("include_coordinates", "true") + .load() +) + +# Show a sample of the data +df.show(20) +df.printSchema() +print(f"Total rows: {df.count()}") + +# -- 4. (Optional) Save as a Delta table for fast reuse ---------------------- +# df.write.format("delta").mode("overwrite").saveAsTable("weather.dwd_icon_spatial") + +# -- 5. Run queries directly ------------------------------------------------- +# Example: find the hottest locations +df.createOrReplaceTempView("weather") +hottest = spark.sql( # type: ignore[name-defined] # noqa: F821 + """ + SELECT latitude, longitude, temperature_2m + FROM weather + WHERE temperature_2m IS NOT NULL + ORDER BY temperature_2m DESC + LIMIT 10 + """ +) +hottest.show() diff --git a/pyproject.toml b/pyproject.toml index 7229ae4..d8d0675 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ codec = [ xarray = ["xarray>=2023.1.0"] fsspec = ["fsspec>=2023.1.0", "s3fs>=2023.1.0"] grids = ["pyproj>=3.1.0"] +pyspark = ["pyspark>=3.5.0", "fsspec>=2023.1.0", "s3fs>=2023.1.0"] dask = ["dask[array]>=2023.1.0"] all = [ "zarr>=2.18.2", @@ -46,6 +47,7 @@ all = [ "s3fs>=2023.1.0", "pyproj>=3.3.0", "dask[array]>=2023.1.0", + "pyspark>=3.5.0" ] [dependency-groups] diff --git a/python/omfiles/pyspark.py b/python/omfiles/pyspark.py new file mode 100644 index 0000000..2da3fca --- /dev/null +++ b/python/omfiles/pyspark.py @@ -0,0 +1,360 @@ +""" +PySpark custom data source for reading Open-Meteo .om files. + +This module provides a PySpark DataSource implementation that allows reading .om files +directly into Spark DataFrames, enabling efficient distributed processing in Databricks +and other Spark environments. + +Requires PySpark (Databricks Runtime 15.2+) and ``omfiles[fsspec,grids]``. 
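+
+Local paths are opened directly, while ``s3://`` URIs go through fsspec; a
+minimal local-file sketch (the path below is hypothetical)::
+
+    df = spark.read.format("om").option("path", "/dbfs/tmp/example.om").load()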
+
+Example usage::
+
+    from omfiles.pyspark import OmFileDataSource
+
+    spark.dataSource.register(OmFileDataSource)
+
+    # Read a spatial .om file from S3
+    df = (
+        spark.read.format("om")
+        .option("path", "s3://openmeteo/data_spatial/dwd_icon/2026/03/01/0000Z/2026-03-01T0000.om")
+        .option("variables", "temperature_2m,wind_speed_10m")
+        .option("s3_anon", "true")
+        .load()
+    )
+    df.show()
+
+    # Save as Delta table for fast reuse
+    df.write.format("delta").saveAsTable("weather.temperature")
+"""
+
+from __future__ import annotations
+
+from typing import Iterator, Sequence, Tuple
+
+from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
+from pyspark.sql.types import (
+    DoubleType,
+    FloatType,
+    IntegerType,
+    LongType,
+    StructField,
+    StructType,
+)
+
+
+def _numpy_dtype_to_spark(np_dtype) -> DoubleType | FloatType | IntegerType | LongType:
+    """Map a numpy dtype to the corresponding PySpark type."""
+    import numpy as np
+
+    name = np.dtype(np_dtype).name
+    mapping = {
+        "float32": FloatType(),
+        "float64": DoubleType(),
+        "int8": IntegerType(),
+        "int16": IntegerType(),
+        "int32": IntegerType(),
+        "int64": LongType(),
+        "uint8": IntegerType(),
+        "uint16": IntegerType(),
+        "uint32": IntegerType(),
+        "uint64": LongType(),
+    }
+    return mapping.get(name, DoubleType())
+
+
+class OmVariablePartition(InputPartition):
+    """One partition per variable in the .om file."""
+
+    def __init__(self, variable_name: str):
+        self.variable_name = variable_name
+
+
+class OmFileDataSource(DataSource):
+    """
+    PySpark DataSource for reading Open-Meteo ``.om`` files.
+
+    Options:
+        path (str): Path to the ``.om`` file. Can be a local path or an S3 URI
+            (e.g. ``s3://openmeteo/data_spatial/…``).
+        variables (str, optional): Comma-separated list of variable names to read.
+            When omitted, all array children of the root group are read.
+        s3_anon (str, optional): Set to ``"true"`` for anonymous S3 access (default ``"true"``).
+        s3_block_size (str, optional): S3 read block size in bytes (default ``"65536"``).
+        cache_storage (str, optional): Local directory for fsspec block-cache (default ``""`` = no caching).
+        include_coordinates (str, optional): ``"true"`` (default) to add ``latitude`` /
+            ``longitude`` columns derived from the grid's CRS.
+        row_chunk_size (str, optional): Reserved for future row-based partitioning;
+            currently ignored (one partition is created per variable).
+    """
+
+    @classmethod
+    def name(cls) -> str:
+        return "om"
+
+    def schema(self) -> StructType:
+        """
+        Infer the Spark schema by inspecting the .om file metadata.
+
+        For spatial files (root is a group) the schema is:
+        - ``latitude`` DOUBLE
+        - ``longitude`` DOUBLE
+        - one column per selected variable (FLOAT / DOUBLE / INT / …)
+
+        For flat array files (root is an array) the schema is:
+        - one ``dim<i>`` index column (LONG) per leading dimension
+        - one column ``value`` with the array's dtype
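+
+        For example, a spatial file holding a float32 ``temperature_2m``
+        variable, read with the default options, yields roughly::
+
+            root
+             |-- latitude: double (nullable = false)
+             |-- longitude: double (nullable = false)
+             |-- temperature_2m: float (nullable = true)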
+        """
+        reader = self._open_reader()
+        try:
+            if reader.is_group:
+                fields: list[StructField] = []
+                include_coords = self.options.get("include_coordinates", "true").lower() == "true"
+                if include_coords:
+                    fields.append(StructField("latitude", DoubleType(), nullable=False))
+                    fields.append(StructField("longitude", DoubleType(), nullable=False))
+
+                variable_names = self._resolve_variable_names(reader)
+                for var_name in variable_names:
+                    child = reader.get_child_by_name(var_name)
+                    spark_type = _numpy_dtype_to_spark(child.dtype)
+                    fields.append(StructField(var_name, spark_type, nullable=True))
+                return StructType(fields)
+            elif reader.is_array:
+                spark_type = _numpy_dtype_to_spark(reader.dtype)
+                dims = len(reader.shape)
+                fields = []
+                for i in range(dims - 1):
+                    fields.append(StructField(f"dim{i}", LongType(), nullable=False))
+                fields.append(StructField("value", spark_type, nullable=True))
+                return StructType(fields)
+            else:
+                raise ValueError("Root of .om file is a scalar — cannot be read as a table.")
+        finally:
+            reader.close()
+
+    def reader(self, schema: StructType) -> DataSourceReader:
+        return OmFileDataSourceReader(schema, self.options)
+
+    # ------------------------------------------------------------------
+    # Internal helpers (only used on the *driver* to infer schema)
+    # ------------------------------------------------------------------
+
+    def _open_reader(self):
+        """Open an OmFileReader from the configured path."""
+        from omfiles import OmFileReader
+
+        path: str = self.options.get("path", "")
+        if not path:
+            raise ValueError("The 'path' option is required.")
+
+        if path.startswith("s3://"):
+            return self._open_s3_reader(path)
+        else:
+            return OmFileReader(path)
+
+    def _open_s3_reader(self, path: str):
+        """Open an OmFileReader backed by fsspec / S3."""
+        import fsspec
+
+        from omfiles import OmFileReader
+
+        s3_anon = self.options.get("s3_anon", "true").lower() == "true"
+        block_size = int(self.options.get("s3_block_size", "65536"))
+        cache_storage = self.options.get("cache_storage", "")
+
+        if cache_storage:
+            uri = f"blockcache::{path}"
+            backend = fsspec.open(
+                uri,
+                mode="rb",
+                s3={"anon": s3_anon, "default_block_size": block_size},
+                blockcache={"cache_storage": cache_storage},
+            )
+        else:
+            backend = fsspec.open(
+                path,
+                mode="rb",
+                s3={"anon": s3_anon, "default_block_size": block_size},
+            )
+        return OmFileReader(backend)
+
+    def _resolve_variable_names(self, reader) -> list[str]:
+        """Return the list of variable names to read."""
+        variables_opt = self.options.get("variables", "")
+        if variables_opt:
+            return [v.strip() for v in variables_opt.split(",") if v.strip()]
+        # Auto-discover all array children
+        names: list[str] = []
+        for i in range(reader.num_children):
+            child = reader.get_child_by_index(i)
+            if child.is_array:
+                names.append(child.name)
+        return names
+
+
+class OmFileDataSourceReader(DataSourceReader):
+    """
+    Reads .om file data and yields rows to Spark.
+
+    Each variable is read as a separate partition to enable parallelism across
+    the Spark cluster. Within each partition the full (lat × lon) grid is read
+    for that single variable and yielded row-by-row.
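+
+    A rough sketch of the contract (Spark drives these calls through the
+    Python data source API; shown only for illustration)::
+
+        reader = OmFileDataSourceReader(schema, options)
+        for part in reader.partitions():    # one OmVariablePartition per variable
+            for row in reader.read(part):   # plain tuples matching the schema
+                ...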
+    """
+
+    def __init__(self, schema: StructType, options: dict):
+        self.schema = schema
+        self.options = options
+
+    def partitions(self) -> Sequence[InputPartition]:
+        """Create one partition per variable for spatial files, or one partition for flat arrays."""
+        reader = self._open_reader()
+        try:
+            if reader.is_group:
+                variable_names = self._resolve_variable_names(reader)
+                return [OmVariablePartition(name) for name in variable_names]
+            else:
+                # Flat array — single partition
+                return [OmVariablePartition("__array__")]
+        finally:
+            reader.close()
+
+    def read(self, partition: InputPartition) -> Iterator[Tuple]:
+        """Read data for a single variable partition and yield rows."""
+        reader = self._open_reader()
+        try:
+            if isinstance(partition, OmVariablePartition) and partition.variable_name == "__array__":
+                yield from self._read_flat_array(reader)
+            elif isinstance(partition, OmVariablePartition):
+                yield from self._read_spatial_variable(reader, partition.variable_name)
+            else:
+                raise ValueError(f"Unexpected partition type: {type(partition)}")
+        finally:
+            reader.close()
+
+    # ------------------------------------------------------------------
+    # Flat array reading
+    # ------------------------------------------------------------------
+
+    def _read_flat_array(self, reader) -> Iterator[Tuple]:
+        """Yield one row per element in a flat array (with leading dimension indices)."""
+        import numpy as np
+
+        data = reader[...]
+        it = np.nditer(data, flags=["multi_index"])
+        while not it.finished:
+            idx = it.multi_index
+            val = float(it[0]) if np.isfinite(it[0]) else None
+            yield (*idx[:-1], val) if len(idx) > 1 else (val,)
+            it.iternext()
+
+    # ------------------------------------------------------------------
+    # Spatial (hierarchical) variable reading
+    # ------------------------------------------------------------------
+
+    def _read_spatial_variable(self, reader, variable_name: str) -> Iterator[Tuple]:
+        """Read a single spatial variable and yield (lat, lon, value) rows."""
+        import numpy as np
+
+        include_coords = self.options.get("include_coordinates", "true").lower() == "true"
+        child = reader.get_child_by_name(variable_name)
+        data = child[...]  # shape: (ny, nx) or higher-dimensional
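+        # The whole variable is materialised in executor memory here before
+        # any rows are yielded.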
+
+        if include_coords:
+            try:
+                from omfiles.grids import OmGrid
+
+                # Try to read crs_wkt from file attributes
+                crs_wkt = self._get_crs_wkt(reader)
+                grid = OmGrid(crs_wkt, data.shape[:2])
+                lon2d, lat2d = grid.get_meshgrid()
+            except Exception:
+                # Fallback: use integer indices as coordinates
+                ny, nx = data.shape[:2]
+                lat2d = np.arange(ny, dtype=np.float64).reshape(ny, 1) * np.ones((1, nx))
+                lon2d = np.arange(nx, dtype=np.float64).reshape(1, nx) * np.ones((ny, 1))
+
+        ny, nx = data.shape[:2]
+        for y in range(ny):
+            for x in range(nx):
+                val = data[y, x]
+                # Convert numpy scalar to Python; NaN / Inf → None
+                if hasattr(val, "item"):
+                    val = val.item()
+                if isinstance(val, float) and (np.isnan(val) or np.isinf(val)):
+                    val = None
+                if include_coords:
+                    yield (float(lat2d[y, x]), float(lon2d[y, x]), val)
+                else:
+                    yield (val,)
+
+    def _get_crs_wkt(self, reader) -> str:
+        """Try to retrieve crs_wkt from the .om file attributes."""
+        try:
+            crs_reader = reader.get_child_by_name("crs_wkt")
+            if crs_reader.is_scalar:
+                return crs_reader.read_scalar()
+        except Exception:
+            pass
+        raise ValueError("Could not find crs_wkt in file attributes")
+
+    # ------------------------------------------------------------------
+    # Helpers (duplicated from DataSource — these must be serializable
+    # and run on executors, not on the driver)
+    # ------------------------------------------------------------------
+
+    def _open_reader(self):
+        """Open an OmFileReader from the configured path."""
+        from omfiles import OmFileReader
+
+        path: str = self.options.get("path", "")
+        if not path:
+            raise ValueError("The 'path' option is required.")
+
+        if path.startswith("s3://"):
+            return self._open_s3_reader(path)
+        else:
+            return OmFileReader(path)
+
+    def _open_s3_reader(self, path: str):
+        """Open an OmFileReader backed by fsspec / S3."""
+        import fsspec
+
+        from omfiles import OmFileReader
+
+        s3_anon = self.options.get("s3_anon", "true").lower() == "true"
+        block_size = int(self.options.get("s3_block_size", "65536"))
+        cache_storage = self.options.get("cache_storage", "")
+
+        if cache_storage:
+            uri = f"blockcache::{path}"
+            backend = fsspec.open(
+                uri,
+                mode="rb",
+                s3={"anon": s3_anon, "default_block_size": block_size},
+                blockcache={"cache_storage": cache_storage},
+            )
+        else:
+            backend = fsspec.open(
+                path,
+                mode="rb",
+                s3={"anon": s3_anon, "default_block_size": block_size},
+            )
+        return OmFileReader(backend)
+
+    def _resolve_variable_names(self, reader) -> list[str]:
+        """Return the list of variable names to read."""
+        variables_opt = self.options.get("variables", "")
+        if variables_opt:
+            return [v.strip() for v in variables_opt.split(",") if v.strip()]
+        names: list[str] = []
+        for i in range(reader.num_children):
+            child = reader.get_child_by_index(i)
+            if child.is_array:
+                names.append(child.name)
+        return names
diff --git a/tests/test_pyspark.py b/tests/test_pyspark.py
new file mode 100644
index 0000000..4b89a44
--- /dev/null
+++ b/tests/test_pyspark.py
@@ -0,0 +1,226 @@
+"""Tests for the PySpark custom data source integration.
+
+These tests exercise the core logic of the OmFileDataSource without requiring
+a running Spark cluster: PySpark must be importable (otherwise the module is
+skipped), but the classes are instantiated directly via ``__new__`` so the
+read-path logic runs locally without a Spark session.
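+
+Bypassing ``DataSource.__init__`` (which normally receives its options from
+Spark) looks like this throughout the tests::
+
+    ds = OmFileDataSource.__new__(OmFileDataSource)
+    ds.options = {"path": "/tmp/example.om"}  # hypothetical path
+    schema = ds.schema()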
+"""
+
+import numpy as np
+import pytest
+from omfiles import OmFileWriter
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _create_flat_om(path: str, data: np.ndarray) -> None:
+    """Write *data* as a flat (non-hierarchical) .om file."""
+    writer = OmFileWriter(path)
+    var = writer.write_array(data, chunks=[min(5, s) for s in data.shape], scale_factor=1.0, add_offset=0.0)
+    writer.close(var)
+
+
+def _create_hierarchical_om(path: str) -> dict[str, np.ndarray]:
+    """Write a hierarchical .om file with two variables under a root group."""
+    temp = np.arange(25, dtype=np.float32).reshape(5, 5)
+    wind = np.arange(25, dtype=np.float32).reshape(5, 5) * 0.5
+
+    writer = OmFileWriter(path)
+    v_temp = writer.write_array(temp, chunks=[5, 5], scale_factor=1.0, add_offset=0.0, name="temperature_2m")
+    v_wind = writer.write_array(wind, chunks=[5, 5], scale_factor=1.0, add_offset=0.0, name="wind_speed_10m")
+    root = writer.write_group("root", children=[v_temp, v_wind])
+    writer.close(root)
+    return {"temperature_2m": temp, "wind_speed_10m": wind}
+
+
+# ---------------------------------------------------------------------------
+# Skip if pyspark is not installed
+# ---------------------------------------------------------------------------
+
+pyspark = pytest.importorskip("pyspark", reason="PySpark not installed — skipping PySpark datasource tests")
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestOmFileDataSource:
+    """Unit tests for OmFileDataSource schema inference."""
+
+    def test_schema_inference_flat_array(self, tmp_path):
+        """Schema for a flat array should have dim columns + value."""
+        from omfiles.pyspark import OmFileDataSource
+
+        path = str(tmp_path / "flat.om")
+        data = np.arange(20, dtype=np.float32).reshape(4, 5)
+        _create_flat_om(path, data)
+
+        ds = OmFileDataSource.__new__(OmFileDataSource)
+        ds.options = {"path": path}
+
+        schema = ds.schema()
+        field_names = [f.name for f in schema.fields]
+        assert "dim0" in field_names
+        assert "value" in field_names
+
+    def test_schema_inference_hierarchical(self, tmp_path):
+        """Schema for a hierarchical file should contain the variable columns."""
+        from omfiles.pyspark import OmFileDataSource
+
+        path = str(tmp_path / "hier.om")
+        _create_hierarchical_om(path)
+
+        ds = OmFileDataSource.__new__(OmFileDataSource)
+        ds.options = {"path": path, "include_coordinates": "false"}
+
+        schema = ds.schema()
+        field_names = [f.name for f in schema.fields]
+        assert "temperature_2m" in field_names
+        assert "wind_speed_10m" in field_names
+
+    def test_schema_inference_hierarchical_with_coords(self, tmp_path):
+        """With include_coordinates=true (default), lat/lon columns should appear."""
+        from omfiles.pyspark import OmFileDataSource
+
+        path = str(tmp_path / "hier_coords.om")
+        _create_hierarchical_om(path)
+
+        ds = OmFileDataSource.__new__(OmFileDataSource)
+        ds.options = {"path": path}
+
+        schema = ds.schema()
+        field_names = [f.name for f in schema.fields]
+        assert "latitude" in field_names
+        assert "longitude" in field_names
+        assert "temperature_2m" in field_names
+
+    def test_schema_inference_selected_variables(self, tmp_path):
+        """Only selected variables should appear in the schema."""
+        from omfiles.pyspark import 
OmFileDataSource + + path = str(tmp_path / "selected.om") + _create_hierarchical_om(path) + + ds = OmFileDataSource.__new__(OmFileDataSource) + ds.options = {"path": path, "variables": "temperature_2m", "include_coordinates": "false"} + + schema = ds.schema() + field_names = [f.name for f in schema.fields] + assert "temperature_2m" in field_names + assert "wind_speed_10m" not in field_names + + +class TestOmFileDataSourceReader: + """Tests for the DataSourceReader read path.""" + + def test_read_flat_array(self, tmp_path): + """Reading a flat array should yield one row per element.""" + from omfiles.pyspark import OmFileDataSourceReader, OmVariablePartition + + path = str(tmp_path / "flat.om") + data = np.arange(20, dtype=np.float32).reshape(4, 5) + _create_flat_om(path, data) + + reader = OmFileDataSourceReader.__new__(OmFileDataSourceReader) + reader.options = {"path": path} + reader.schema = None # not used in the read path directly + + partition = OmVariablePartition("__array__") + rows = list(reader.read(partition)) + assert len(rows) == 20 + # First row should be (dim0_index, value) + assert rows[0] == (0, 0.0) + # Last row + assert rows[-1] == (3, 19.0) + + def test_read_spatial_variable_no_coords(self, tmp_path): + """Reading a spatial variable without coordinates yields (value,) tuples.""" + from omfiles.pyspark import OmFileDataSourceReader, OmVariablePartition + + path = str(tmp_path / "hier.om") + expected = _create_hierarchical_om(path) + + reader = OmFileDataSourceReader.__new__(OmFileDataSourceReader) + reader.options = {"path": path, "include_coordinates": "false"} + reader.schema = None + + partition = OmVariablePartition("temperature_2m") + rows = list(reader.read(partition)) + assert len(rows) == 25 # 5x5 grid + # Values should match the written data (row-major) + values = [r[0] for r in rows] + np.testing.assert_array_almost_equal(values, expected["temperature_2m"].flatten(), decimal=1) + + def test_read_spatial_variable_with_coords(self, tmp_path): + """Reading with coordinates should yield (lat, lon, value) tuples.""" + from omfiles.pyspark import OmFileDataSourceReader, OmVariablePartition + + path = str(tmp_path / "hier_coords.om") + _create_hierarchical_om(path) + + reader = OmFileDataSourceReader.__new__(OmFileDataSourceReader) + reader.options = {"path": path, "include_coordinates": "true"} + reader.schema = None + + partition = OmVariablePartition("temperature_2m") + rows = list(reader.read(partition)) + assert len(rows) == 25 + # Each row should be a 3-tuple (lat, lon, value) + assert len(rows[0]) == 3 + + def test_partitions_hierarchical(self, tmp_path): + """Partitions should be one per variable for hierarchical files.""" + from omfiles.pyspark import OmFileDataSourceReader, OmVariablePartition + + path = str(tmp_path / "parts.om") + _create_hierarchical_om(path) + + reader = OmFileDataSourceReader.__new__(OmFileDataSourceReader) + reader.options = {"path": path, "variables": "temperature_2m,wind_speed_10m"} + reader.schema = None + + partitions = reader.partitions() + assert len(partitions) == 2 + names = {p.variable_name for p in partitions} + assert names == {"temperature_2m", "wind_speed_10m"} + + def test_partitions_flat(self, tmp_path): + """Flat array files should have a single __array__ partition.""" + from omfiles.pyspark import OmFileDataSourceReader, OmVariablePartition + + path = str(tmp_path / "flat.om") + _create_flat_om(path, np.arange(10, dtype=np.float32).reshape(2, 5)) + + reader = OmFileDataSourceReader.__new__(OmFileDataSourceReader) + 
reader.options = {"path": path} + reader.schema = None + + partitions = reader.partitions() + assert len(partitions) == 1 + assert partitions[0].variable_name == "__array__" + + def test_nan_becomes_none(self, tmp_path): + """NaN values in float arrays should become None in yielded rows.""" + from omfiles.pyspark import OmFileDataSourceReader, OmVariablePartition + + path = str(tmp_path / "nan.om") + data = np.array([[1.0, float("nan")], [3.0, 4.0]], dtype=np.float32) + writer = OmFileWriter(path) + v = writer.write_array(data, chunks=[2, 2], scale_factor=1.0, add_offset=0.0, name="temp") + root = writer.write_group("root", children=[v]) + writer.close(root) + + reader = OmFileDataSourceReader.__new__(OmFileDataSourceReader) + reader.options = {"path": path, "variables": "temp", "include_coordinates": "false"} + reader.schema = None + + partition = OmVariablePartition("temp") + rows = list(reader.read(partition)) + values = [r[0] for r in rows] + assert values[0] == pytest.approx(1.0, abs=0.01) + assert values[1] is None # NaN → None diff --git a/uv.lock b/uv.lock index 43f38ab..645dc57 100644 --- a/uv.lock +++ b/uv.lock @@ -16,7 +16,7 @@ resolution-markers = [ [[package]] name = "aiobotocore" -version = "3.4.0" +version = "3.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -28,9 +28,9 @@ dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.11'" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b8/50/a48ed11b15f926ce3dbb33e7fb0f25af17dbb99bcb7ae3b30c763723eca7/aiobotocore-3.4.0.tar.gz", hash = "sha256:a918b5cb903f81feba7e26835aed4b5e6bb2d0149d7f42bb2dd7d8089e3d9000", size = 122360, upload-time = "2026-04-07T06:12:24.884Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e6/89/9533b377e9412013cc43a539d81bc5f8feeb4b6830643821ad612f78b09b/aiobotocore-3.5.0.tar.gz", hash = "sha256:d45d1c4659ad0e48b694a5aa4ff18829100386f7de96c8d146ec7757a6f12918", size = 123061, upload-time = "2026-04-21T07:25:26.993Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/df/d8/ce9386e6d76ea79e61dee15e62aa48cff6be69e89246b0ac4a11857cb02c/aiobotocore-3.4.0-py3-none-any.whl", hash = "sha256:26290eb6830ea92d8a6f5f90b56e9f5cedd6d126074d5db63b195e281d982465", size = 88018, upload-time = "2026-04-07T06:12:22.684Z" }, + { url = "https://files.pythonhosted.org/packages/2d/05/6eeeadef45c24630af0ceae4d038b883e9a394786300529286ba8cc1e62d/aiobotocore-3.5.0-py3-none-any.whl", hash = "sha256:49ce35bb8b96b85d3251c2cbbb2ed7a028dc0cb0d0d0801f9ccca1ccd0d41ded", size = 88281, upload-time = "2026-04-21T07:25:25.258Z" }, ] [[package]] @@ -237,16 +237,16 @@ wheels = [ [[package]] name = "botocore" -version = "1.42.84" +version = "1.42.91" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b4/b7/1c03423843fb0d1795b686511c00ee63fed1234c2400f469aeedfd42212f/botocore-1.42.84.tar.gz", hash = "sha256:234064604c80d9272a5e9f6b3566d260bcaa053a5e05246db90d7eca1c2cf44b", size = 15148615, upload-time = "2026-04-06T19:38:56.673Z" } +sdist = { url = "https://files.pythonhosted.org/packages/21/bc/a4b7c46471c2e789ad8c4c7acfd7f302fdb481d93ff870f441249b924ae6/botocore-1.42.91.tar.gz", hash = "sha256:d252e27bc454afdbf5ed3dc617aa423f2c855c081e98b7963093399483ecc698", size = 15213010, upload-time = "2026-04-17T19:30:50.793Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e3/37/0c0c90361c8a1b9e6c75222ca24ae12996a298c0e18822a72ab229c37207/botocore-1.42.84-py3-none-any.whl", hash = "sha256:15f3fe07dfa6545e46a60c4b049fe2bdf63803c595ae4a4eec90e8f8172764f3", size = 14827061, upload-time = "2026-04-06T19:38:53.613Z" }, + { url = "https://files.pythonhosted.org/packages/b1/fc/24cc0a47c824f13933e210e9ad034b4fba22f7185b8d904c0fbf5a3b2be8/botocore-1.42.91-py3-none-any.whl", hash = "sha256:7a28c3cc6bfab5724ad18899d52402b776a0de7d87fa20c3c5270bcaaf199ce8", size = 14897344, upload-time = "2026-04-17T19:30:44.245Z" }, ] [[package]] @@ -627,11 +627,11 @@ wheels = [ [[package]] name = "idna" -version = "3.11" +version = "3.12" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +sdist = { url = "https://files.pythonhosted.org/packages/22/12/2948fbe5513d062169bd91f7d7b1cd97bc8894f32946b71fa39f6e63ca0c/idna-3.12.tar.gz", hash = "sha256:724e9952cc9e2bd7550ea784adb098d837ab5267ef67a1ab9cf7846bdbdd8254", size = 194350, upload-time = "2026-04-21T13:32:48.916Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, + { url = "https://files.pythonhosted.org/packages/53/b2/acc33950394b3becb2b664741a0c0889c7ef9f9ffbfa8d47eddb53a50abd/idna-3.12-py3-none-any.whl", hash = "sha256:60ffaa1858fac94c9c124728c24fcde8160f3fb4a7f79aa8cdd33a9d1af60a67", size = 68634, upload-time = "2026-04-21T13:32:47.403Z" }, ] [[package]] @@ -1236,6 +1236,7 @@ all = [ { name = "numcodecs", version = "0.16.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pyspark" }, { name = "s3fs" }, { name = "xarray", version = "2025.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "xarray", version = "2026.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -1259,6 +1260,11 @@ grids = [ { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] +pyspark = [ + { name = "fsspec" }, + { name = "pyspark" }, + { name = "s3fs" }, +] xarray = [ { name = "xarray", version = "2025.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "xarray", version = "2026.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -1301,19 +1307,23 @@ requires-dist = [ { name = "dask", extras = ["array"], marker = "extra == 'dask'", specifier = ">=2023.1.0" }, { name = "fsspec", marker = "extra == 'all'", specifier = ">=2023.10.0" }, { name = "fsspec", 
marker = "extra == 'fsspec'", specifier = ">=2023.1.0" }, + { name = "fsspec", marker = "extra == 'pyspark'", specifier = ">=2023.1.0" }, { name = "numcodecs", marker = "extra == 'all'", specifier = ">=0.12.1" }, { name = "numcodecs", marker = "extra == 'codec'", specifier = ">=0.12.1" }, { name = "numpy", specifier = ">=1.21.0" }, { name = "pyproj", marker = "extra == 'all'", specifier = ">=3.3.0" }, { name = "pyproj", marker = "extra == 'grids'", specifier = ">=3.1.0" }, + { name = "pyspark", marker = "extra == 'all'", specifier = ">=3.5.0" }, + { name = "pyspark", marker = "extra == 'pyspark'", specifier = ">=3.5.0" }, { name = "s3fs", marker = "extra == 'all'", specifier = ">=2023.1.0" }, { name = "s3fs", marker = "extra == 'fsspec'", specifier = ">=2023.1.0" }, + { name = "s3fs", marker = "extra == 'pyspark'", specifier = ">=2023.1.0" }, { name = "xarray", marker = "extra == 'all'", specifier = ">=2023.1.0" }, { name = "xarray", marker = "extra == 'xarray'", specifier = ">=2023.1.0" }, { name = "zarr", marker = "extra == 'all'", specifier = ">=2.18.2" }, { name = "zarr", marker = "extra == 'codec'", specifier = ">=2.18.2" }, ] -provides-extras = ["all", "codec", "dask", "fsspec", "grids", "xarray"] +provides-extras = ["all", "codec", "dask", "fsspec", "grids", "pyspark", "xarray"] [package.metadata.requires-dev] dev = [ @@ -1650,6 +1660,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, ] +[[package]] +name = "py4j" +version = "0.10.9.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/31/0b210511177070c8d5d3059556194352e5753602fa64b85b7ab81ec1a009/py4j-0.10.9.9.tar.gz", hash = "sha256:f694cad19efa5bd1dee4f3e5270eb406613c974394035e5bfc4ec1aba870b879", size = 761089, upload-time = "2025-01-15T03:53:18.624Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/db/ea0203e495be491c85af87b66e37acfd3bf756fd985f87e46fc5e3bf022c/py4j-0.10.9.9-py2.py3-none-any.whl", hash = "sha256:c7c26e4158defb37b0bb124933163641a2ff6e3a3913f7811b0ddbe07ed61533", size = 203008, upload-time = "2025-01-15T03:53:15.648Z" }, +] + [[package]] name = "pygments" version = "2.20.0" @@ -1781,6 +1800,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/73/a7141a1a0559bf1a7aa42a11c879ceb19f02f5c6c371c6d57fd86cefd4d1/pyproj-3.7.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d9d25bae416a24397e0d85739f84d323b55f6511e45a522dd7d7eae70d10c7e4", size = 6391844, upload-time = "2025-08-14T12:05:40.745Z" }, ] +[[package]] +name = "pyspark" +version = "4.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "py4j" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/bf/58ee13add151469c25825b7125bbf62c3bdcec05eec4d458fcb5c5516066/pyspark-4.1.1.tar.gz", hash = "sha256:77f78984aa84fbe865c717dd37b49913b4e5c97d76ef6824f932f1aefa6621ec", size = 455359625, upload-time = "2026-01-09T09:38:38.28Z" } + [[package]] name = "pytest" version = "9.0.3"