Skip to content

Commit ed8374b

Browse files
Accommodate Pandas 3 breaking changes (#25)
* Accommodate Pandas 3 breaking changes * Format using ruff * Escape TRY300 / RET505 loop * Add missing packages to pyproject.toml files * Use only one is_missing function * Update Changelog * Fix ruff issues --------- Co-authored-by: Lukas Bindreiter <lukas.bindreiter@tilebox.com>
1 parent 2fad2e6 commit ed8374b

17 files changed

Lines changed: 600 additions & 378 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ repos:
99
hooks:
1010
- id: sync-with-uv
1111
- repo: https://github.com/charliermarsh/ruff-pre-commit
12-
rev: v0.14.11
12+
rev: v0.14.14
1313
hooks:
1414
- id: ruff-check
1515
args: [--fix, --exit-non-zero-on-fix]

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.47.0] - 2026-01-28
11+
12+
### Added
13+
14+
- `tilebox-datasets` and `tilebox-workflows`: Added support for pandas v3.
15+
1016
### Changed
1117

1218
- `tilebox-datasets`: The `create_dataset` method of the `Client` has been removed. Use `create_or_update_dataset` instead.
@@ -304,7 +310,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
304310
- Released under the [MIT](https://opensource.org/license/mit) license.
305311
- Released packages: `tilebox-datasets`, `tilebox-workflows`, `tilebox-storage`, `tilebox-grpc`
306312

307-
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.46.0...HEAD
313+
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.47.0...HEAD
314+
[0.47.0]: https://github.com/tilebox/tilebox-python/compare/v0.46.0...v0.47.0
308315
[0.46.0]: https://github.com/tilebox/tilebox-python/compare/v0.45.0...v0.46.0
309316
[0.45.0]: https://github.com/tilebox/tilebox-python/compare/v0.44.0...v0.45.0
310317
[0.44.0]: https://github.com/tilebox/tilebox-python/compare/v0.43.0...v0.44.0

matrix.toml

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Matrix test configuration for testing pandas compatibility across Python versions
2+
# Run with: pymatrix --config matrix.toml
3+
#
4+
# Split into scenarios per package due to pytest conftest collision when running
5+
# multiple packages together (each has tests/conftest.py).
6+
7+
[[scenarios]]
8+
name = "datasets-pandas2"
9+
python = ["3.10", "3.11", "3.12", "3.13"]
10+
working-dir = "tilebox-datasets"
11+
test-command = "pytest"
12+
test-args = ["-v"]
13+
14+
[scenarios.packages]
15+
pandas = ["2.2.3"]
16+
17+
[[scenarios]]
18+
name = "datasets-pandas3"
19+
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
20+
working-dir = "tilebox-datasets"
21+
test-command = "pytest"
22+
test-args = ["-v"]
23+
24+
[scenarios.packages]
25+
pandas = ["3.0.0"]
26+
27+
[[scenarios]]
28+
name = "storage-pandas2"
29+
python = ["3.10", "3.11", "3.12", "3.13"]
30+
working-dir = "tilebox-storage"
31+
test-command = "pytest"
32+
test-args = ["-v"]
33+
34+
[scenarios.packages]
35+
pandas = ["2.2.3"]
36+
37+
[[scenarios]]
38+
name = "storage-pandas3"
39+
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
40+
working-dir = "tilebox-storage"
41+
test-command = "pytest"
42+
test-args = ["-v"]
43+
44+
[scenarios.packages]
45+
pandas = ["3.0.0"]
46+
47+
[[scenarios]]
48+
name = "grpc-pandas2"
49+
python = ["3.10", "3.11", "3.12", "3.13"]
50+
working-dir = "tilebox-grpc"
51+
test-command = "pytest"
52+
test-args = ["-v"]
53+
54+
[scenarios.packages]
55+
pandas = ["2.2.3"]
56+
57+
[[scenarios]]
58+
name = "grpc-pandas3"
59+
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
60+
working-dir = "tilebox-grpc"
61+
test-command = "pytest"
62+
test-args = ["-v"]
63+
64+
[scenarios.packages]
65+
pandas = ["3.0.0"]
66+
67+
[[scenarios]]
68+
name = "workflows-pandas2"
69+
python = ["3.10", "3.11", "3.12", "3.13"]
70+
working-dir = "tilebox-workflows"
71+
test-command = "pytest"
72+
# Ignore FutureWarning: google-cloud-storage raises deprecation warning on Python 3.10
73+
test-args = ["-v", "-W", "ignore::FutureWarning"]
74+
75+
[scenarios.packages]
76+
pandas = ["2.2.3"]
77+
78+
[[scenarios]]
79+
name = "workflows-pandas3"
80+
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
81+
working-dir = "tilebox-workflows"
82+
test-command = "pytest"
83+
test-args = ["-v"]
84+
85+
[scenarios.packages]
86+
pandas = ["3.0.0"]

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ dev = [
2323
# DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0)
2424
"pyarrow>=17.0.0",
2525
# some dev tooling
26-
"ruff>=0.11.10",
26+
"ruff>=0.14.10",
2727
"types-protobuf>=6.30",
2828
"junitparser>=3.2.0",
29-
"ty>=0.0.11",
29+
"ty>=0.0.14",
3030
"prek>=0.2.27",
3131
]
3232

tilebox-datasets/pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ dev = [
4242
"pytest>=8.3.2",
4343
]
4444

45-
4645
[project.urls]
4746
Homepage = "https://tilebox.com"
4847
Documentation = "https://docs.tilebox.com/datasets/introduction"

tilebox-datasets/tests/protobuf_conversion/test_protobuf_xarray.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from uuid import UUID
22

3+
import pandas as pd
34
import pytest
45
from hypothesis import given, settings
56
from hypothesis.strategies import lists
@@ -152,21 +153,21 @@ def test_convert_datapoints(datapoints: list[ExampleDatapoint]) -> None: # noqa
152153
for uuid in dataset.some_id.to_numpy():
153154
assert isinstance(uuid, str)
154155

155-
# strings should be stored as object arrays, with None as the fill value if missing
156+
# strings should be stored as object arrays, with missing values (None or NaN) as fill
156157
if "some_string" in dataset:
157158
for string in dataset.some_string.to_numpy():
158-
assert string is None or isinstance(string, str)
159+
assert pd.isna(string) or isinstance(string, str)
159160
if "some_repeated_string" in dataset:
160161
for string in dataset.some_repeated_string.to_numpy().ravel():
161-
assert string is None or isinstance(string, str)
162+
assert pd.isna(string) or isinstance(string, str)
162163

163-
# bytes should be stored as object arrays, with None as the fill value if missing
164+
# bytes should be stored as object arrays, with missing values (None or NaN) as fill
164165
if "some_bytes" in dataset:
165166
for bytes_ in dataset.some_bytes.to_numpy():
166-
assert bytes_ is None or isinstance(bytes_, bytes)
167+
assert pd.isna(bytes_) or isinstance(bytes_, bytes)
167168
if "some_repeated_bytes" in dataset:
168169
for bytes_ in dataset.some_repeated_bytes.to_numpy().ravel():
169-
assert bytes_ is None or isinstance(bytes_, bytes)
170+
assert pd.isna(bytes_) or isinstance(bytes_, bytes)
170171

171172

172173
@given(lists(example_datapoints(missing_fields=True), min_size=1, max_size=10))

tilebox-datasets/tilebox/datasets/progress.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
from types import TracebackType
44
from typing import Any
55

6+
try:
7+
from typing import Self # ty: ignore[unresolved-import]
8+
except ImportError: # Self is only available in Python 3.11+
9+
from typing_extensions import Self
10+
611
from tqdm.auto import tqdm
712

813
from tilebox.datasets.query.time_interval import TimeInterval
@@ -42,7 +47,7 @@ def __init__(
4247
self._actual_start_time = actual_start_time
4348
self._total_data_points = 0
4449

45-
def __enter__(self) -> "TimeIntervalProgressBar":
50+
def __enter__(self) -> Self:
4651
self._progress_bar = tqdm(
4752
bar_format="{l_bar}{bar}[{elapsed}<{remaining}{postfix}]",
4853
total=self._calc_progress_seconds(self._interval.end),

tilebox-datasets/tilebox/datasets/protobuf_conversion/field_types.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from uuid import UUID
55

66
import numpy as np
7+
import pandas as pd
78
from google.protobuf.descriptor import FieldDescriptor
89
from google.protobuf.duration_pb2 import Duration
910
from google.protobuf.message import Message
@@ -17,6 +18,8 @@
1718
from tilebox.datasets.datasets.v1.well_known_types_pb2 import Geometry, LatLon, LatLonAlt, Quaternion, Vec3
1819

1920
ScalarProtoFieldValue = Message | float | str | bool | bytes
21+
22+
2023
ProtoFieldValue = ScalarProtoFieldValue | Sequence[ScalarProtoFieldValue] | None
2124

2225
_FILL_VALUES_BY_DTYPE: dict[type[np.dtype[Any]], Any] = {
@@ -107,7 +110,7 @@ def from_proto(self, value: ProtoFieldValue) -> int:
107110
return value.seconds * 10**9 + value.nanos
108111

109112
def to_proto(self, value: DatetimeScalar) -> Timestamp | None:
110-
if value is None or (isinstance(value, np.datetime64) and np.isnat(value)):
113+
if is_missing(value) or (isinstance(value, np.datetime64) and np.isnat(value)):
111114
return None
112115
# we use pandas to_datetime function to handle a variety of input types that can be coerced to datetimes
113116
seconds, nanos = divmod(to_datetime(value, utc=True).value, 10**9)
@@ -124,10 +127,10 @@ def from_proto(self, value: ProtoFieldValue) -> int:
124127
return value.seconds * 10**9 + value.nanos
125128

126129
def to_proto(self, value: str | float | timedelta | np.timedelta64) -> Duration | None:
127-
if value is None or (isinstance(value, np.timedelta64) and np.isnat(value)):
130+
if is_missing(value) or (isinstance(value, np.timedelta64) and np.isnat(value)):
128131
return None
129132
# we use pandas to_timedelta function to handle a variety of input types that can be coerced to timedeltas
130-
seconds, nanos = divmod(to_timedelta(value).value, 10**9) # type: ignore[arg-type]
133+
seconds, nanos = divmod(to_timedelta(value).value, 10**9)
131134
return Duration(seconds=seconds, nanos=nanos)
132135

133136

@@ -141,7 +144,7 @@ def from_proto(self, value: ProtoFieldValue) -> str:
141144
return str(UUID(bytes=value.uuid))
142145

143146
def to_proto(self, value: str | UUID) -> UUIDMessage | None:
144-
if not value: # None or empty string
147+
if is_missing(value) or value == "": # missing or empty string
145148
return None
146149

147150
if isinstance(value, str):
@@ -160,7 +163,7 @@ def from_proto(self, value: ProtoFieldValue) -> Any:
160163
return from_wkb(value.wkb)
161164

162165
def to_proto(self, value: Any) -> Geometry | None:
163-
if value is None:
166+
if is_missing(value):
164167
return None
165168
return Geometry(wkb=value.wkb)
166169

@@ -175,7 +178,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float, float]:
175178
return value.x, value.y, value.z
176179

177180
def to_proto(self, value: tuple[float, float, float]) -> Vec3 | None:
178-
if value is None or np.all(np.isnan(value)):
181+
if is_missing(value) or np.all(np.isnan(value)):
179182
return None
180183
return Vec3(x=value[0], y=value[1], z=value[2])
181184

@@ -190,7 +193,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float, float, float
190193
return value.q1, value.q2, value.q3, value.q4
191194

192195
def to_proto(self, value: tuple[float, float, float, float]) -> Quaternion | None:
193-
if value is None or np.all(np.isnan(value)):
196+
if is_missing(value) or np.all(np.isnan(value)):
194197
return None
195198
return Quaternion(q1=value[0], q2=value[1], q3=value[2], q4=value[3])
196199

@@ -205,7 +208,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float]:
205208
return value.latitude, value.longitude
206209

207210
def to_proto(self, value: tuple[float, float]) -> LatLon | None:
208-
if value is None or np.all(np.isnan(value)):
211+
if is_missing(value) or np.all(np.isnan(value)):
209212
return None
210213
return LatLon(latitude=value[0], longitude=value[1])
211214

@@ -221,7 +224,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float, float]:
221224
return value.latitude, value.longitude, value.altitude
222225

223226
def to_proto(self, value: tuple[float, float, float]) -> LatLonAlt | None:
224-
if value is None or np.all(np.isnan(value)):
227+
if is_missing(value) or np.all(np.isnan(value)):
225228
return None
226229
return LatLonAlt(latitude=value[0], longitude=value[1], altitude=value[2])
227230

@@ -301,3 +304,19 @@ def _camel_to_uppercase(name: str) -> str:
301304
'PROCESSING_LEVEL'
302305
"""
303306
return "".join(["_" + c.lower() if c.isupper() else c for c in name]).lstrip("_").upper()
307+
308+
309+
def is_missing(value: Any) -> bool:
    """Check if a value represents a missing/null value.

    Handles None, np.nan, pd.NA, NaT, and other pandas missing value sentinels.
    This is needed for pandas 3.0+ compatibility where object-dtype columns use
    np.nan instead of None for missing values.

    Only scalar values can be missing. Containers (lists, arrays, Series, ...) are
    never considered missing, regardless of their length or content: even an array
    consisting only of NaNs is a present value.

    Args:
        value: The value to check.

    Returns:
        bool: True if the value is a scalar missing value sentinel, False otherwise.
    """
    result = pd.isna(value)
    # pd.isna returns a single bool for scalars, and an element-wise mask for containers. Decide based on the
    # dimensionality of the result rather than on bool() raising a ValueError: bool() only raises for masks with
    # two or more elements, so a single-element container such as [np.nan] would otherwise be reported as missing.
    if np.ndim(result) != 0:
        return False
    return bool(result)

tilebox-datasets/tilebox/datasets/protobuf_conversion/to_protobuf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
ProtobufFieldType,
1313
ProtoFieldValue,
1414
infer_field_type,
15+
is_missing,
1516
)
1617

1718
IngestionData = Mapping[str, Collection[Any]] | Iterable[tuple[str, Collection[Any]]] | pd.DataFrame | xr.Dataset
@@ -120,7 +121,7 @@ def convert_values_to_proto(
120121
values: np.ndarray | pd.Series, field_type: ProtobufFieldType, filter_none: bool = False
121122
) -> list[ProtoFieldValue]:
122123
if filter_none:
123-
return [field_type.to_proto(value) for value in values if value is not None]
124+
return [field_type.to_proto(value) for value in values if not is_missing(value)]
124125
return [field_type.to_proto(value) for value in values]
125126

126127

tilebox-datasets/tilebox/datasets/query/id_interval.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,22 +50,16 @@ def parse(cls, arg: IDIntervalLike, start_exclusive: bool = False, end_inclusive
5050
Returns:
5151
IDInterval: The parsed ID interval
5252
"""
53+
if isinstance(arg, IDInterval):
54+
return arg
5355

54-
match arg:
55-
case IDInterval(_, _, _, _):
56-
return arg
57-
case (UUID(), UUID()):
58-
start: UUID = arg[0]
59-
end: UUID = arg[1]
56+
if isinstance(arg, tuple) and len(arg) == 2:
57+
start, end = arg
58+
if isinstance(start, UUID) and isinstance(end, UUID):
6059
return IDInterval(
61-
start_id=start,
62-
end_id=end,
63-
start_exclusive=start_exclusive,
64-
end_inclusive=end_inclusive,
60+
start_id=start, end_id=end, start_exclusive=start_exclusive, end_inclusive=end_inclusive
6561
)
66-
case (str(), str()):
67-
start: str = arg[0]
68-
end: str = arg[1]
62+
if isinstance(start, str) and isinstance(end, str):
6963
return IDInterval(
7064
start_id=UUID(start),
7165
end_id=UUID(end),

0 commit comments

Comments
 (0)