From fdfd150ba01134e9544c7aaa3b6ce5060d8b6e61 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 24 Mar 2025 03:55:26 -0700 Subject: [PATCH 01/21] Implement onnx serialisation --- legateboost/models/base_model.py | 10 +++ legateboost/models/krr.py | 109 +++++++++++++++++++++++++++++++ legateboost/models/linear.py | 39 ++++++++++- legateboost/test/test_onnx.py | 35 ++++++++++ 4 files changed, 190 insertions(+), 3 deletions(-) create mode 100644 legateboost/test/test_onnx.py diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index 2a5df986..a1e88011 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -126,3 +126,13 @@ def __mul__(self, scalar: Any) -> "BaseModel": def __hash__(self) -> int: return hash(str(self)) + + def to_onnx(self) -> Any: + """Convert the model to an ONNX model. + + Returns + ------- + Any + The ONNX model. + """ + raise NotImplementedError diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 16d7f445..6e6bd0ca 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -242,3 +242,112 @@ def __mul__(self, scalar: Any) -> "KRR": new = copy.deepcopy(self) self.betas_ *= scalar return new + + def to_onnx(self) -> Any: + from onnx import numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + assert self.X_train.dtype == self.betas_.dtype + + def make_constant_node(value, name): + return make_node( + "Constant", + inputs=[], + value=numpy_helper.from_array(value, name=name), + outputs=[name], + ) + + nodes = [] + + # model constants + betas = numpy_helper.from_array(self.betas_.__array__(), name="betas") + X_train = numpy_helper.from_array(self.X_train.__array__(), name="X_train") + + # pred inputs + X = make_tensor_value_info( + "X", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.X_train.shape[1]], + ) + 
pred = make_tensor_value_info( + "pred", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.betas_.shape[1]], + ) + + # exanded l2 distance + # distance = np.sum(X**2, axis=1)[:, np.newaxis] - 2 * np.dot(X, self.X_train.T) + # + np.sum(self.X_train**2, axis=1) + make_tensor_value_info( + "XX", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] + ) + make_tensor_value_info( + "YY", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [self.X_train.shape[0], 1], + ) + make_tensor_value_info( + "XY_reshaped", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [1, self.X_train.shape[0]], + ) + make_tensor_value_info( + "XY", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.X_train.shape[0]], + ) + nodes.append(make_constant_node(np.array([1]), "axis1")) + nodes.append(make_node("ReduceSumSquare", ["X", "axis1"], ["XX"])) + nodes.append(make_node("Gemm", ["X", "X_train"], ["XY"], alpha=-2.0, transB=1)) + nodes.append(make_node("ReduceSumSquare", ["X_train", "axis1"], ["YY"])) + nodes.append(make_constant_node(np.array([1, -1]), "reshape")) + nodes.append(make_node("Reshape", ["YY", "reshape"], ["YY_reshaped"])) + nodes.append(make_node("Add", ["XX", "XY"], ["add0"])) + make_tensor_value_info( + "l2", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.X_train.shape[0]], + ) + nodes.append(make_node("Add", ["YY_reshaped", "add0"], ["l2"])) + nodes.append(make_constant_node(np.array([0.0], self.betas_.dtype), "zero")) + make_tensor_value_info( + "l2_clipped", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.X_train.shape[0]], + ) + nodes.append(make_node("Max", ["l2", "zero"], ["l2_clipped"])) + + # RBF kernel + # K = np.exp(-distance / (2 * self.sigma**2)) + make_tensor_value_info( + "rbf0", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.X_train.shape[0]], + ) + nodes.append( + make_constant_node( + np.array([-2.0 * self.sigma**2], self.betas_.dtype), "denominator" + ) + ) + nodes.append(make_node("Div", ["l2_clipped", 
"denominator"], ["rbf0"])) + make_tensor_value_info( + "K", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, self.X_train.shape[0]], + ) + nodes.append(make_node("Exp", ["rbf0"], ["K"])) + + # prediction + # pred = np.dot(K, self.betas_) + nodes.append(make_node("MatMul", ["K", "betas"], ["pred"])) + graph = make_graph(nodes, "krr", [X], [pred], [betas, X_train]) + onnx_model = make_model(graph) + check_model(onnx_model) + return onnx_model diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index aad65c83..98a0b776 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -58,7 +58,7 @@ def __init__( self.l2_regularization = alpha def _fit_solve(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> None: - self.betas_ = cn.zeros((X.shape[1] + 1, g.shape[1])) + self.betas_ = cn.zeros((X.shape[1] + 1, g.shape[1]), dtype=X.dtype) num_outputs = g.shape[1] for k in range(num_outputs): W = cn.sqrt(h[:, k]) @@ -135,12 +135,13 @@ def batch_predict(models: Sequence[BaseModel], X: cn.ndarray) -> cn.ndarray: # summing together the coeffiecients of each model then predicting # saves a lot of work betas = cn.sum([model.betas_ for model in models], axis=0) - return betas[0] + X.dot(betas[1:].astype(X.dtype)) + betas = betas.astype(X.dtype) + return betas[0] + X.dot(betas[1:]) def __str__(self) -> str: return ( "Bias: " - + str(self.betas_[1]) + + str(self.betas_[0]) + "\nCoefficients: " + str(self.betas_[1:]) + "\n" @@ -150,3 +151,35 @@ def __mul__(self, scalar: Any) -> "Linear": new = copy.deepcopy(self) new.betas_ *= scalar return new + + def to_onnx(self) -> Any: + from onnx import numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + # model constants + betas = numpy_helper.from_array(self.betas_[1:].__array__(), name="betas") + intercept = numpy_helper.from_array( + self.betas_[0].__array__(), 
name="intercept" + ) + + # pred inputs + X = make_tensor_value_info( + "X", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, None] + ) + pred = make_tensor_value_info( + "pred", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] + ) + + node1 = make_node("MatMul", ["X", "betas"], ["XBeta"]) + node2 = make_node("Add", ["XBeta", "intercept"], ["pred"]) + graph = make_graph([node1, node2], "lr", [X], [pred], [betas, intercept]) + onnx_model = make_model(graph) + check_model(onnx_model) + return onnx_model diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py new file mode 100644 index 00000000..4aae2223 --- /dev/null +++ b/legateboost/test/test_onnx.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest +from onnx.reference import ReferenceEvaluator + +import cupynumeric as cn +import legateboost as lb + + +@pytest.mark.parametrize( + "Model", [M for M in lb.models.BaseModel.__subclasses__() if hasattr(M, "to_onnx")] +) +@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_onnx(Model, n_outputs, dtype): + rs = np.random.RandomState(0) + X = rs.random((1000, 10)).astype(dtype) + g = rs.normal(size=(X.shape[0], n_outputs)) + h = rs.random(g.shape) + 0.1 + model = ( + Model() + .set_random_state(np.random.RandomState(2)) + .fit(cn.array(X), cn.array(g), cn.array(h)) + ) + + def pred_onnx(onnx, X): + sess = ReferenceEvaluator(onnx) + pred = np.empty(X.shape[0], dtype=dtype) + feeds = {"X": X, "pred": pred} + return sess.run(None, feeds) + + assert np.allclose( + model.predict(X), + pred_onnx(model.to_onnx(), X)[0], + atol=1e-3 if dtype == np.float32 else 1e-6, + ) From be665582283802956d518db7a34f305f7c911cf3 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 26 Mar 2025 03:00:32 -0700 Subject: [PATCH 02/21] Implement tree models --- legateboost/models/tree.py | 143 +++++++++++++++++++++++++++++++--- legateboost/test/test_onnx.py | 2 +- 2 files changed, 134 insertions(+), 11 
deletions(-) diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 5dbde0fa..9a13a163 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -3,6 +3,8 @@ from enum import IntEnum from typing import Any, Callable, List, Sequence, Union, cast +import numpy as np + import cupynumeric as cn from legate.core import TaskTarget, get_legate_runtime, types @@ -90,6 +92,15 @@ def __init__( def num_nodes(self) -> int: return int(cn.sum(self.hessian > 0.0)) + def is_leaf(self, id: int) -> Any: + return self.feature[id] == -1 + + def left_child(self, id) -> int: + return id * 2 + 1 + + def right_child(self, id) -> int: + return id * 2 + 2 + def fit( self, X: cn.ndarray, @@ -108,7 +119,7 @@ def fit( h_ = get_store(h).promote(1, X.shape[1]) task.add_scalar_arg(self.max_depth, types.int32) - max_nodes = 2 ** (self.max_depth + 1) + max_nodes = 2 ** (self.max_depth + 1) - 1 task.add_scalar_arg(max_nodes, types.int32) task.add_scalar_arg(self.split_samples, types.int32) task.add_scalar_arg(self.random_state.randint(0, 2**31), types.int32) @@ -268,15 +279,6 @@ def batch_predict(models: Sequence[BaseModel], X: cn.ndarray) -> cn.ndarray: return cn.array(pred, copy=False) - def is_leaf(self, id: int) -> Any: - return self.feature[id] == -1 - - def left_child(self, id: int) -> int: - return id * 2 + 1 - - def right_child(self, id: int) -> int: - return id * 2 + 2 - def __str__(self) -> str: def format_vector(v: cn.ndarray) -> str: if cn.isscalar(v): @@ -313,3 +315,124 @@ def __mul__(self, scalar: Any) -> "Tree": new = copy.deepcopy(self) new.leaf_value *= scalar return new + + def to_onnx(self) -> Any: + import onnx + from onnx import numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_tensor, + make_tensor_value_info, + ) + + onnx_nodes = [] + + # We map the legate-boost tree representation to the TreeEnsemble ONNX operator + # the features array, splits array, and leaf weights can be 
passed unchanged + # ONNX then requires some extra arrays to represent the tree structure + # - nodes_truenodeidx is the index of the left child for a given node + # - nodes_falsenodeidx is the index of the right child for a given node + # - nodes_modes indicates that nodes use a <= comparison operator + # - nodes_trueleafs indicates that the left child is a leaf node + # - nodes_falseleafs indicates that the right child is a leaf node + # - leaf_targetids indicates which output the leaf node corresponds to + # ONNX does not support vector leaf so we will repeat the tree n_outputs + # times, each time with a different constant for leaf_targetids + # This is not ideal but I don't see a better way + + tree_max_nodes = self.feature.size + all_nodes_idx = np.arange(tree_max_nodes) + nodes_featureids = self.feature.__array__() + nodes_splits = numpy_helper.from_array(self.split_value.__array__()) + nodes_truenodeids = self.left_child(all_nodes_idx) + # get the left child of each node and check if it is a leaf + # if the node is already leaf then its child can go off the end of the array + # use np.minimum to avoid this + nodes_trueleafs = self.is_leaf( + np.minimum(tree_max_nodes - 1, self.left_child(all_nodes_idx)) + ).astype(int) + nodes_falsenodeids = self.right_child(all_nodes_idx) + nodes_falseleafs = self.is_leaf( + np.minimum(tree_max_nodes - 1, self.right_child(all_nodes_idx)) + ).astype(int) + + for output_idx in range(0, self.leaf_value.shape[1]): + leaf_targetids = np.full(self.feature.size, output_idx, dtype=np.int64) + leaf_weights = numpy_helper.from_array( + self.leaf_value[:, output_idx].__array__() + ) + + onnx_nodes.append( + onnx.helper.make_node( + "TreeEnsemble", + ["X"], + ["pred" + str(output_idx)], + domain="ai.onnx.ml", + n_targets=self.leaf_value.shape[1], + membership_values=None, + nodes_missing_value_tracks_true=None, + nodes_hitrates=None, + aggregate_function=1, + post_transform=0, + tree_roots=[0], + nodes_modes=make_tensor( + 
"nodes_modes", + onnx.TensorProto.UINT8, + self.feature.shape, + np.zeros_like(self.feature, dtype=np.uint8), + ), + nodes_featureids=nodes_featureids, + nodes_splits=nodes_splits, + nodes_truenodeids=nodes_truenodeids, + nodes_trueleafs=nodes_trueleafs, + nodes_falsenodeids=nodes_falsenodeids, + nodes_falseleafs=nodes_falseleafs, + leaf_targetids=leaf_targetids, + leaf_weights=leaf_weights, + ) + ) + + if output_idx == 0: + accumulated_pred = make_tensor_value_info( + "accumulated_pred0", onnx.TensorProto.DOUBLE, [None, None] + ) + onnx_nodes.append( + onnx.helper.make_node( + "Identity", + ["pred" + str(output_idx)], + ["accumulated_pred0"], + ) + ) + else: + accumulated_pred = make_tensor_value_info( + "accumulated_pred" + str(output_idx), + onnx.TensorProto.DOUBLE, + [None, None], + ) + onnx_nodes.append( + onnx.helper.make_node( + "Add", + [ + "accumulated_pred" + str(output_idx - 1), + "pred" + str(output_idx), + ], + ["accumulated_pred" + str(output_idx)], + ) + ) + + # pred inputs + X = make_tensor_value_info("X", onnx.TensorProto.DOUBLE, [None, None]) + graph = make_graph( + onnx_nodes, "legateboost.models.Tree", [X], [accumulated_pred] + ) + model = make_model( + graph, + opset_imports=[ + onnx.helper.make_opsetid("ai.onnx.ml", 5), + onnx.helper.make_opsetid("", 14), + ], + ) + check_model(model) + return model diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 4aae2223..8710eaf4 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -29,7 +29,7 @@ def pred_onnx(onnx, X): return sess.run(None, feeds) assert np.allclose( - model.predict(X), + model.predict(cn.array(X)), pred_onnx(model.to_onnx(), X)[0], atol=1e-3 if dtype == np.float32 else 1e-6, ) From 8e20945498ad8854b74cb0a4b12794de8e351f18 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 26 Mar 2025 03:46:13 -0700 Subject: [PATCH 03/21] Implement neural network onnx op --- legateboost/models/krr.py | 4 +- legateboost/models/linear.py | 4 
+- legateboost/models/nn.py | 84 ++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 6e6bd0ca..3f35ec5b 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -347,7 +347,9 @@ def make_constant_node(value, name): # prediction # pred = np.dot(K, self.betas_) nodes.append(make_node("MatMul", ["K", "betas"], ["pred"])) - graph = make_graph(nodes, "krr", [X], [pred], [betas, X_train]) + graph = make_graph( + nodes, "legateboost.model.KRR", [X], [pred], [betas, X_train] + ) onnx_model = make_model(graph) check_model(onnx_model) return onnx_model diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index 98a0b776..ec34594e 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -179,7 +179,9 @@ def to_onnx(self) -> Any: node1 = make_node("MatMul", ["X", "betas"], ["XBeta"]) node2 = make_node("Add", ["XBeta", "intercept"], ["pred"]) - graph = make_graph([node1, node2], "lr", [X], [pred], [betas, intercept]) + graph = make_graph( + [node1, node2], "legateboost.model.Linear", [X], [pred], [betas, intercept] + ) onnx_model = make_model(graph) check_model(onnx_model) return onnx_model diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index eb499b0d..356f0264 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -181,3 +181,87 @@ def __mul__(self, scalar: Any) -> "NN": new.coefficients_[-1] *= scalar new.biases_[-1] *= scalar return new + + def to_onnx(self) -> Any: + from onnx import numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + # model constants + biases = [ + numpy_helper.from_array(b[0].__array__(), name=f"bias{i}") + for i, b in enumerate(self.biases_) + ] + coefficients = [ + numpy_helper.from_array(c.__array__(), 
name=f"coefficients{i}") + for i, c in enumerate(self.coefficients_) + ] + + # pred inputs + X = make_tensor_value_info( + "X", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, self.coefficients_[0].shape[0]], + ) + + nodes = [] + + make_tensor_value_info( + "activations0", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, None], + ) + nodes.append(make_node("MatMul", ["X", "coefficients0"], ["activations0"])) + activations_with_bias = make_tensor_value_info( + "activations0withbias", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, None], + ) + nodes.append( + make_node("Add", ["activations0", "bias0"], ["activations0withbias"]) + ) + + for i in range(1, len(coefficients)): + make_tensor_value_info( + f"tanh{i}", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, None], + ) + nodes.append(make_node("Tanh", [f"activations{i-1}withbias"], [f"tanh{i}"])) + make_tensor_value_info( + f"activations{i}", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, None], + ) + nodes.append( + make_node( + "MatMul", [f"tanh{i}", f"coefficients{i}"], [f"activations{i}"] + ) + ) + activations_with_bias = make_tensor_value_info( + f"activations{i}withbias", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, None], + ) + nodes.append( + make_node( + "Add", [f"activations{i}", f"bias{i}"], [f"activations{i}withbias"] + ) + ) + + graph = make_graph( + nodes, + "legateboost.model.NN", + [X], + [activations_with_bias], + biases + coefficients, + ) + onnx_model = make_model(graph) + check_model(onnx_model) + return onnx_model From 821e7f48c31c4ec6e2669b785bfa602e1a58c6de Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 26 Mar 2025 04:01:42 -0700 Subject: [PATCH 04/21] mypy --- conda/environments/all_cuda-122.yaml | 2 ++ dependencies.yaml | 2 ++ legateboost/models/krr.py | 4 +++- legateboost/models/tree.py | 6 +++--- pyproject.toml | 2 ++ 5 files changed, 12 insertions(+), 4 deletions(-) 
diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index f84146d2..68eb3824 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -28,6 +28,8 @@ dependencies: - ninja>=1.11.1.1 - notebook>=7 - numpy +- onnx>=1.10 +- onnxmltools>=1.10 - openblas - pydata-sphinx-theme>=0.16 - pytest>=7,<8 diff --git a/dependencies.yaml b/dependencies.yaml index 78b33351..9cd93fba 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -176,3 +176,5 @@ dependencies: - pytest>=7,<8 - seaborn>=0.13 - xgboost>=2.0 + - onnx>=1.10 + - onnxmltools>=1.10 diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 3f35ec5b..31af8d5a 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -256,7 +256,7 @@ def to_onnx(self) -> Any: assert self.X_train.dtype == self.betas_.dtype - def make_constant_node(value, name): + def make_constant_node(value: cn.array, name: str) -> Any: return make_node( "Constant", inputs=[], @@ -331,6 +331,8 @@ def make_constant_node(value, name): np_dtype_to_tensor_dtype(self.betas_.dtype), [None, self.X_train.shape[0]], ) + if self.sigma is None: + raise ValueError("sigma is None. 
Has fit been called?") nodes.append( make_constant_node( np.array([-2.0 * self.sigma**2], self.betas_.dtype), "denominator" diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 9a13a163..38cfda93 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -92,13 +92,13 @@ def __init__( def num_nodes(self) -> int: return int(cn.sum(self.hessian > 0.0)) - def is_leaf(self, id: int) -> Any: + def is_leaf(self, id: cn.array) -> cn.array: return self.feature[id] == -1 - def left_child(self, id) -> int: + def left_child(self, id: cn.array) -> cn.array: return id * 2 + 1 - def right_child(self, id) -> int: + def right_child(self, id: cn.array) -> cn.array: return id * 2 + 2 def fit( diff --git a/pyproject.toml b/pyproject.toml index 9b68d931..a69041c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,8 @@ test = [ "mypy>=1.13", "nbconvert>=7.16", "notebook>=7", + "onnx>=1.10", + "onnxmltools>=1.10", "pytest>=7,<8", "seaborn>=0.13", "xgboost>=2.0", From b9be42ee492388c4b80371e51a8fcee7177a88d6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 31 Mar 2025 07:31:55 -0700 Subject: [PATCH 05/21] Add interface to estimator --- legateboost/legateboost.py | 96 +++++++++++++++++++++++++++++++++++ legateboost/models/krr.py | 91 ++++++++++++++------------------- legateboost/models/linear.py | 45 ++++++++++++---- legateboost/models/nn.py | 76 +++++++++++++++------------ legateboost/models/tree.py | 55 +++++++++++++------- legateboost/test/test_onnx.py | 65 +++++++++++++++++++----- 6 files changed, 302 insertions(+), 126 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 80c8beb2..2c095a53 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -540,6 +540,102 @@ def dump_models(self) -> str: text += str(m) return text + def _make_onnx_init(self, X_dtype): + # turn self.model_init_ into an ONNX model + from onnx import numpy_helper + from onnx.checker import check_model + 
from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + # model constants + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(X_dtype), [None, self.n_features_in_] + ) + nodes = [] + nodes.append(make_node("Shape", ["X_in"], ["n_rows"], end=1)) + one = numpy_helper.from_array(np.array([1], dtype=np.int64), name="one") + nodes.append(make_node("Concat", ["n_rows", "one"], ["tile_repeat"], axis=0)) + init = numpy_helper.from_array( + np.atleast_2d(self.model_init_.__array__().astype(X_dtype)), name="init" + ) + prediction_out = make_tensor_value_info( + "predictions_out", + np_dtype_to_tensor_dtype(X_dtype), + [None, self.model_init_.shape[0]], + ) + nodes.append(make_node("Tile", ["init", "tile_repeat"], ["predictions_out"])) + X_out = make_tensor_value_info( + "X_out", + np_dtype_to_tensor_dtype(X_dtype), + [None, self.model_init_.shape[0]], + ) + nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + graph = make_graph( + nodes, + "legateboost estimator init", + [X_in], + [X_out, prediction_out], + [init, one], + ) + onnx_model = make_model( + graph, + opset_imports=[ + make_opsetid("", 21), + ], + ) + check_model(onnx_model) + + return onnx_model + + def to_onnx(self, X_dtype, predict_function="predict"): + """Converts the model to an ONNX model. + + Parameters + ---------- + X_dtype : numpy.dtype + The expected data type of the input data. ONNX models hard + code the data type of the input data and will crash if this is + not set correctly. + Can be np.float32 or np.float64. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict', + 'predict_proba', or 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. 
+ """ + from onnx.compose import merge_models + + model = self._make_onnx_init(X_dtype) + if self.models_ is not None and len(self.models_) > 0: + model = merge_models( + model, + self.models_[0].to_onnx(X_dtype), + io_map=[("X_out", "X_in"), ("predictions_out", "predictions_in")], + prefix2="model_0_", + ) + + for i in range(1, len(self.models_)): + model = merge_models( + model, + self.models_[i].to_onnx(X_dtype), + io_map=[ + ("model_{}_X_out".format(i - 1), "X_in"), + ("model_{}_predictions_out".format(i - 1), "predictions_in"), + ], + prefix2="model_{}_".format(i), + ) + + return model + def global_attributions( self, X: cn.array, diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 31af8d5a..cbea5a22 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -243,13 +243,14 @@ def __mul__(self, scalar: Any) -> "KRR": self.betas_ *= scalar return new - def to_onnx(self) -> Any: + def to_onnx(self, X_dtype) -> Any: from onnx import numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, make_model, make_node, + make_opsetid, make_tensor_value_info, np_dtype_to_tensor_dtype, ) @@ -271,66 +272,34 @@ def make_constant_node(value: cn.array, name: str) -> Any: X_train = numpy_helper.from_array(self.X_train.__array__(), name="X_train") # pred inputs - X = make_tensor_value_info( - "X", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[1]], + n_features = self.X_train.shape[1] + n_outputs = self.betas_.shape[1] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] ) - pred = make_tensor_value_info( - "pred", + predictions_in = make_tensor_value_info( + "predictions_in", np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.betas_.shape[1]], + [None, n_outputs], ) - # exanded l2 distance # distance = np.sum(X**2, axis=1)[:, np.newaxis] - 2 * np.dot(X, self.X_train.T) # + np.sum(self.X_train**2, axis=1) - 
make_tensor_value_info( - "XX", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] - ) - make_tensor_value_info( - "YY", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [self.X_train.shape[0], 1], - ) - make_tensor_value_info( - "XY_reshaped", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [1, self.X_train.shape[0]], - ) - make_tensor_value_info( - "XY", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) nodes.append(make_constant_node(np.array([1]), "axis1")) - nodes.append(make_node("ReduceSumSquare", ["X", "axis1"], ["XX"])) - nodes.append(make_node("Gemm", ["X", "X_train"], ["XY"], alpha=-2.0, transB=1)) + nodes.append(make_node("ReduceSumSquare", ["X_in", "axis1"], ["XX"])) + nodes.append( + make_node("Gemm", ["X_in", "X_train"], ["XY"], alpha=-2.0, transB=1) + ) nodes.append(make_node("ReduceSumSquare", ["X_train", "axis1"], ["YY"])) nodes.append(make_constant_node(np.array([1, -1]), "reshape")) nodes.append(make_node("Reshape", ["YY", "reshape"], ["YY_reshaped"])) nodes.append(make_node("Add", ["XX", "XY"], ["add0"])) - make_tensor_value_info( - "l2", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) nodes.append(make_node("Add", ["YY_reshaped", "add0"], ["l2"])) nodes.append(make_constant_node(np.array([0.0], self.betas_.dtype), "zero")) - make_tensor_value_info( - "l2_clipped", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) nodes.append(make_node("Max", ["l2", "zero"], ["l2_clipped"])) # RBF kernel # K = np.exp(-distance / (2 * self.sigma**2)) - make_tensor_value_info( - "rbf0", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) if self.sigma is None: raise ValueError("sigma is None. 
Has fit been called?") nodes.append( @@ -339,19 +308,37 @@ def make_constant_node(value: cn.array, name: str) -> Any: ) ) nodes.append(make_node("Div", ["l2_clipped", "denominator"], ["rbf0"])) - make_tensor_value_info( - "K", - np_dtype_to_tensor_dtype(self.betas_.dtype), - [None, self.X_train.shape[0]], - ) nodes.append(make_node("Exp", ["rbf0"], ["K"])) # prediction # pred = np.dot(K, self.betas_) - nodes.append(make_node("MatMul", ["K", "betas"], ["pred"])) + nodes.append(make_node("MatMul", ["K", "betas"], ["dot"])) + + # outputs + predictions_out = make_tensor_value_info( + "predictions_out", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, n_outputs], + ) + X_out = make_tensor_value_info( + "X_out", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] + ) + + nodes.append(make_node("Add", ["dot", "predictions_in"], ["predictions_out"])) + nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + graph = make_graph( - nodes, "legateboost.model.KRR", [X], [pred], [betas, X_train] + nodes, + "legateboost.model.KRR", + [X_in, predictions_in], + [X_out, predictions_out], + [betas, X_train], + ) + onnx_model = make_model( + graph, + opset_imports=[ + make_opsetid("", 21), + ], ) - onnx_model = make_model(graph) check_model(onnx_model) return onnx_model diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index ec34594e..90e1ee4c 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -152,13 +152,14 @@ def __mul__(self, scalar: Any) -> "Linear": new.betas_ *= scalar return new - def to_onnx(self) -> Any: + def to_onnx(self, X_dtype) -> Any: from onnx import numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, make_model, make_node, + make_opsetid, make_tensor_value_info, np_dtype_to_tensor_dtype, ) @@ -170,18 +171,44 @@ def to_onnx(self) -> Any: ) # pred inputs - X = make_tensor_value_info( - "X", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, None] + n_features 
= self.betas_.shape[0] - 1 + n_outputs = self.betas_.shape[1] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] ) - pred = make_tensor_value_info( - "pred", np_dtype_to_tensor_dtype(self.betas_.dtype), [None] + predictions_in = make_tensor_value_info( + "predictions_in", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, n_outputs], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + np_dtype_to_tensor_dtype(self.betas_.dtype), + [None, n_outputs], ) - node1 = make_node("MatMul", ["X", "betas"], ["XBeta"]) - node2 = make_node("Add", ["XBeta", "intercept"], ["pred"]) + nodes = [] + nodes.append(make_node("MatMul", ["X_in", "betas"], ["XBeta"])) + nodes.append(make_node("Add", ["XBeta", "intercept"], ["result"])) + nodes.append( + make_node("Add", ["result", "predictions_in"], ["predictions_out"]) + ) + X_out = make_tensor_value_info( + "X_out", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] + ) + nodes.append(make_node("Identity", ["X_in"], ["X_out"])) graph = make_graph( - [node1, node2], "legateboost.model.Linear", [X], [pred], [betas, intercept] + nodes, + "legateboost.model.Linear", + [X_in, predictions_in], + [X_out, predictions_out], + [betas, intercept], + ) + onnx_model = make_model( + graph, + opset_imports=[ + make_opsetid("", 21), + ], ) - onnx_model = make_model(graph) check_model(onnx_model) return onnx_model diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index 356f0264..d77e4b24 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -182,13 +182,14 @@ def __mul__(self, scalar: Any) -> "NN": new.biases_[-1] *= scalar return new - def to_onnx(self) -> Any: + def to_onnx(self, X_dtype) -> Any: from onnx import numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, make_model, make_node, + make_opsetid, make_tensor_value_info, np_dtype_to_tensor_dtype, ) @@ -204,64 +205,73 @@ def to_onnx(self) 
-> Any: ] # pred inputs - X = make_tensor_value_info( - "X", + n_outputs = self.coefficients_[-1].shape[1] + n_features = self.coefficients_[0].shape[0] + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), [None, self.coefficients_[0].shape[0]], ) - - nodes = [] - - make_tensor_value_info( - "activations0", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) - nodes.append(make_node("MatMul", ["X", "coefficients0"], ["activations0"])) - activations_with_bias = make_tensor_value_info( - "activations0withbias", + predictions_in = make_tensor_value_info( + "predictions_in", np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], + [None, n_outputs], ) + nodes = [] + + nodes.append(make_node("MatMul", ["X_in", "coefficients0"], ["activations0"])) nodes.append( make_node("Add", ["activations0", "bias0"], ["activations0withbias"]) ) for i in range(1, len(coefficients)): - make_tensor_value_info( - f"tanh{i}", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append(make_node("Tanh", [f"activations{i-1}withbias"], [f"tanh{i}"])) - make_tensor_value_info( - f"activations{i}", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append( make_node( "MatMul", [f"tanh{i}", f"coefficients{i}"], [f"activations{i}"] ) ) - activations_with_bias = make_tensor_value_info( - f"activations{i}withbias", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), - [None, None], - ) nodes.append( make_node( "Add", [f"activations{i}", f"bias{i}"], [f"activations{i}withbias"] ) ) + # outputs + X_out = make_tensor_value_info( + "X_out", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, n_features], + ) + nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + predictions_out = make_tensor_value_info( + "predictions_out", + np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + [None, n_outputs], + ) + nodes.append( + 
make_node( + "Add", + [ + "activations{}withbias".format(len(self.coefficients_) - 1), + "predictions_in", + ], + ["predictions_out"], + ) + ) + graph = make_graph( nodes, "legateboost.model.NN", - [X], - [activations_with_bias], + [X_in, predictions_in], + [X_out, predictions_out], biases + coefficients, ) - onnx_model = make_model(graph) + onnx_model = make_model( + graph, + opset_imports=[ + make_opsetid("", 21), + ], + ) check_model(onnx_model) return onnx_model diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 38cfda93..c9756992 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -316,15 +316,17 @@ def __mul__(self, scalar: Any) -> "Tree": new.leaf_value *= scalar return new - def to_onnx(self) -> Any: + def to_onnx(self, X_dtype) -> Any: import onnx from onnx import numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, make_model, + make_node, make_tensor, make_tensor_value_info, + np_dtype_to_tensor_dtype, ) onnx_nodes = [] @@ -345,7 +347,9 @@ def to_onnx(self) -> Any: tree_max_nodes = self.feature.size all_nodes_idx = np.arange(tree_max_nodes) nodes_featureids = self.feature.__array__() - nodes_splits = numpy_helper.from_array(self.split_value.__array__()) + nodes_splits = numpy_helper.from_array( + self.split_value.__array__().astype(X_dtype) + ) nodes_truenodeids = self.left_child(all_nodes_idx) # get the left child of each node and check if it is a leaf # if the node is already leaf then its child can go off the end of the array @@ -357,17 +361,17 @@ def to_onnx(self) -> Any: nodes_falseleafs = self.is_leaf( np.minimum(tree_max_nodes - 1, self.right_child(all_nodes_idx)) ).astype(int) - - for output_idx in range(0, self.leaf_value.shape[1]): + num_outputs = self.leaf_value.shape[1] + for output_idx in range(0, num_outputs): leaf_targetids = np.full(self.feature.size, output_idx, dtype=np.int64) leaf_weights = numpy_helper.from_array( - self.leaf_value[:, 
output_idx].__array__() + self.leaf_value[:, output_idx].__array__().astype(X_dtype) ) onnx_nodes.append( - onnx.helper.make_node( + make_node( "TreeEnsemble", - ["X"], + ["X_in"], ["pred" + str(output_idx)], domain="ai.onnx.ml", n_targets=self.leaf_value.shape[1], @@ -395,9 +399,6 @@ def to_onnx(self) -> Any: ) if output_idx == 0: - accumulated_pred = make_tensor_value_info( - "accumulated_pred0", onnx.TensorProto.DOUBLE, [None, None] - ) onnx_nodes.append( onnx.helper.make_node( "Identity", @@ -406,11 +407,6 @@ def to_onnx(self) -> Any: ) ) else: - accumulated_pred = make_tensor_value_info( - "accumulated_pred" + str(output_idx), - onnx.TensorProto.DOUBLE, - [None, None], - ) onnx_nodes.append( onnx.helper.make_node( "Add", @@ -422,16 +418,37 @@ def to_onnx(self) -> Any: ) ) - # pred inputs - X = make_tensor_value_info("X", onnx.TensorProto.DOUBLE, [None, None]) + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(X_dtype), [None, None] + ) + X_out = make_tensor_value_info( + "X_out", np_dtype_to_tensor_dtype(X_dtype), [None, None] + ) + predictions_in = make_tensor_value_info( + "predictions_in", np_dtype_to_tensor_dtype(X_dtype), [None, num_outputs] + ) + predictions_out = make_tensor_value_info( + "predictions_out", np_dtype_to_tensor_dtype(X_dtype), [None, num_outputs] + ) + onnx_nodes.append(make_node("Identity", ["X_in"], ["X_out"])) + onnx_nodes.append( + make_node( + "Add", + ["predictions_in", "accumulated_pred" + str(num_outputs - 1)], + ["predictions_out"], + ) + ) graph = make_graph( - onnx_nodes, "legateboost.models.Tree", [X], [accumulated_pred] + onnx_nodes, + "legateboost.models.Tree", + [X_in, predictions_in], + [X_out, predictions_out], ) model = make_model( graph, opset_imports=[ onnx.helper.make_opsetid("ai.onnx.ml", 5), - onnx.helper.make_opsetid("", 14), + onnx.helper.make_opsetid("", 21), ], ) check_model(model) diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 8710eaf4..f52ea1b5 100644 --- 
a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -1,17 +1,30 @@ import numpy as np +import onnxruntime as ort import pytest -from onnx.reference import ReferenceEvaluator import cupynumeric as cn import legateboost as lb -@pytest.mark.parametrize( - "Model", [M for M in lb.models.BaseModel.__subclasses__() if hasattr(M, "to_onnx")] -) +def pred_onnx_estimator(onnx, X, n_outputs): + sess = ort.InferenceSession(onnx.SerializeToString()) + feeds = {"X_in": X} + return sess.run(None, feeds)[1] + + +def pred_onnx_model(onnx, X, n_outputs): + sess = ort.InferenceSession(onnx.SerializeToString()) + feeds = { + "X_in": X, + "predictions_in": np.zeros((X.shape[0], n_outputs), dtype=X.dtype), + } + return sess.run(None, feeds)[1] + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) @pytest.mark.parametrize("n_outputs", [1, 5]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_onnx(Model, n_outputs, dtype): +def test_models(Model, n_outputs, dtype): rs = np.random.RandomState(0) X = rs.random((1000, 10)).astype(dtype) g = rs.normal(size=(X.shape[0], n_outputs)) @@ -22,14 +35,40 @@ def test_onnx(Model, n_outputs, dtype): .fit(cn.array(X), cn.array(g), cn.array(h)) ) - def pred_onnx(onnx, X): - sess = ReferenceEvaluator(onnx) - pred = np.empty(X.shape[0], dtype=dtype) - feeds = {"X": X, "pred": pred} - return sess.run(None, feeds) + onnx_pred = pred_onnx_model(model.to_onnx(X.dtype), X, n_outputs) + lb_pred = model.predict(cn.array(X)) + assert onnx_pred.shape == lb_pred.shape + assert np.allclose(onnx_pred, lb_pred, atol=1e-3 if dtype == np.float32 else 1e-6) + + +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_init(n_outputs): + # ONNX correctly outputs model init + X = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) + y = np.full((3, n_outputs), 5.0, dtype=np.float32) + estimator = lb.LBRegressor(n_estimators=0, random_state=0).fit(X, y) + assert np.all(estimator.model_init_ == 5.0) + 
assert np.all(estimator.predict(X) == 5.0) + assert np.all( + pred_onnx_estimator(estimator.to_onnx(X.dtype), X.__array__(), 1) == 5.0 + ) + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_estimator(Model, n_outputs, dtype): + rs = np.random.RandomState(0) + X = rs.random((1000, 10)).astype(dtype) + y = rs.random((1000, n_outputs)).astype(dtype) + model = lb.LBRegressor( + n_estimators=10, + base_models=(Model(),), + random_state=0, + ).fit(X, y) assert np.allclose( - model.predict(cn.array(X)), - pred_onnx(model.to_onnx(), X)[0], - atol=1e-3 if dtype == np.float32 else 1e-6, + model.predict(X), + pred_onnx_estimator(model.to_onnx(X.dtype), X.__array__(), 1).squeeze(), + atol=1e-3, ) From 5e3a0b69acd6802ea72349a59616492621528f74 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 31 Mar 2025 07:32:47 -0700 Subject: [PATCH 06/21] Add interface to estimator --- conda/environments/all_cuda-122.yaml | 1 + dependencies.yaml | 1 + pyproject.toml | 1 + 3 files changed, 3 insertions(+) diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index 68eb3824..cf9bd702 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -30,6 +30,7 @@ dependencies: - numpy - onnx>=1.10 - onnxmltools>=1.10 +- onnxruntime>=1.21 - openblas - pydata-sphinx-theme>=0.16 - pytest>=7,<8 diff --git a/dependencies.yaml b/dependencies.yaml index 9cd93fba..d7501dbd 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -178,3 +178,4 @@ dependencies: - xgboost>=2.0 - onnx>=1.10 - onnxmltools>=1.10 + - onnxruntime>=1.21 diff --git a/pyproject.toml b/pyproject.toml index a69041c6..f6ab85c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ test = [ "notebook>=7", "onnx>=1.10", "onnxmltools>=1.10", + "onnxruntime>=1.21", "pytest>=7,<8", "seaborn>=0.13", 
"xgboost>=2.0", From 7bf50187015654e49bb15a71fa69449a48053990 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 2 Apr 2025 01:54:45 -0700 Subject: [PATCH 07/21] Increase test coverage --- legateboost/legateboost.py | 52 +++++++----- legateboost/models/tree.py | 8 ++ legateboost/objectives.py | 13 ++- legateboost/test/test_onnx.py | 104 ++++++++++++++++------- legateboost/test/test_with_hypothesis.py | 51 ++++++++--- 5 files changed, 160 insertions(+), 68 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 2c095a53..016bb77e 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -16,7 +16,7 @@ from .input_validation import _lb_check_X, _lb_check_X_y, check_sample_weight from .metrics import BaseMetric, metrics from .models import BaseModel, Tree -from .objectives import BaseObjective, objectives +from .objectives import OBJECTIVES_MAP, BaseObjective from .shapley import global_shapley_attributions, local_shapley_attributions from .utils import AddableMixin, AddMember, PickleCupynumericMixin @@ -422,7 +422,7 @@ def fit( # setup objective if isinstance(self.objective, str): - self._objective_instance = objectives[self.objective]() + self._objective_instance = OBJECTIVES_MAP[self.objective]() elif isinstance(self.objective, BaseObjective): self._objective_instance = self.objective else: @@ -528,6 +528,26 @@ def _predict(self, X: cn.ndarray) -> cn.ndarray: pred += Type.batch_predict(models, X) return pred + def predict_raw(self, X: cn.ndarray) -> cn.ndarray: + """Predict pre-transformed values for samples in X. E.g. before applying a + sigmoid function. + + Parameters + ---------- + + X : + The input samples. + + Returns + ------- + + y : + The predicted raw values for each sample in X. + """ + X = _lb_check_X(X) + validate_data(self, X, reset=False, skip_check_array=True) + return self._predict(X) + def dump_models(self) -> str: """Dumps the models in the current instance to a string. 
@@ -573,7 +593,7 @@ def _make_onnx_init(self, X_dtype): X_out = make_tensor_value_info( "X_out", np_dtype_to_tensor_dtype(X_dtype), - [None, self.model_init_.shape[0]], + [None, None], ) nodes.append(make_node("Identity", ["X_in"], ["X_out"])) graph = make_graph( @@ -612,6 +632,7 @@ def to_onnx(self, X_dtype, predict_function="predict"): Any The ONNX model. """ + from onnx.checker import check_model from onnx.compose import merge_models model = self._make_onnx_init(X_dtype) @@ -634,6 +655,11 @@ def to_onnx(self, X_dtype, predict_function="predict"): prefix2="model_{}_".format(i), ) + # remove the X_out output, we only need the predictions + # add a transform operator + model.graph.output.remove(model.graph.output[0]) + + check_model(model) return model def global_attributions( @@ -1127,26 +1153,6 @@ def fit( ) return self - def predict_raw(self, X: cn.ndarray) -> cn.ndarray: - """Predict pre-transformed values for samples in X. E.g. before applying a - sigmoid function. - - Parameters - ---------- - - X : - The input samples. - - Returns - ------- - - y : - The predicted raw values for each sample in X. - """ - X = _lb_check_X(X) - validate_data(self, X, reset=False, skip_check_array=True) - return super()._predict(X) - def predict_proba(self, X: cn.ndarray) -> cn.ndarray: """Predict class probabilities for samples in X. 
diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index c9756992..39e32877 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -361,6 +361,14 @@ def to_onnx(self, X_dtype) -> Any: nodes_falseleafs = self.is_leaf( np.minimum(tree_max_nodes - 1, self.right_child(all_nodes_idx)) ).astype(int) + if self.is_leaf(0): + # we have a decision stump + # according to the onnx operator we must set + # true/false at root to the leaf at 0 + nodes_falsenodeids[0] = 0 + nodes_truenodeids[0] = 0 + nodes_trueleafs[0] = 0 + nodes_falseleafs[0] = 0 num_outputs = self.leaf_value.shape[1] for output_idx in range(0, num_outputs): leaf_targetids = np.full(self.feature.size, output_idx, dtype=np.int64) diff --git a/legateboost/objectives.py b/legateboost/objectives.py index ea389660..e48b7054 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -628,7 +628,7 @@ def initialise_prediction( return self.one_step_newton(y, w, boost_from_average, init) -objectives = { +OBJECTIVES_MAP = { "squared_error": SquaredErrorObjective, "normal": NormalObjective, "log_loss": LogLossObjective, @@ -638,3 +638,14 @@ def initialise_prediction( "gamma_deviance": GammaDevianceObjective, "gamma": GammaObjective, } + +REGRESSION_OBJECTIVES = ["squared_error", "normal", "gamma_deviance", "gamma"] + +CLASSIFICATION_OBJECTIVES = [ + "log_loss", + "multi_label", + "exp", + "quantile", + "gamma_deviance", + "gamma", +] diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index f52ea1b5..8a33b743 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -6,39 +6,48 @@ import legateboost as lb -def pred_onnx_estimator(onnx, X, n_outputs): - sess = ort.InferenceSession(onnx.SerializeToString()) - feeds = {"X_in": X} - return sess.run(None, feeds)[1] - - -def pred_onnx_model(onnx, X, n_outputs): - sess = ort.InferenceSession(onnx.SerializeToString()) +def compare_onnx_predictions(estimator, X): + sess = 
ort.InferenceSession(estimator.to_onnx(X.dtype).SerializeToString()) feeds = { "X_in": X, - "predictions_in": np.zeros((X.shape[0], n_outputs), dtype=X.dtype), } - return sess.run(None, feeds)[1] + if isinstance(estimator, lb.models.BaseModel): + pred = estimator.predict(cn.array(X)) + feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1]), dtype=X.dtype) + onnx_pred = sess.run(None, feeds)[1] + else: + pred = estimator.predict_raw(cn.array(X)) + onnx_pred = sess.run(None, feeds)[0] + onnx_pred = onnx_pred.squeeze() + pred = pred.squeeze() + assert pred.shape == onnx_pred.shape + assert np.allclose( + onnx_pred, pred, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(pred - onnx_pred) -@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) -@pytest.mark.parametrize("n_outputs", [1, 5]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_models(Model, n_outputs, dtype): + +@pytest.fixture +def model_dataset(dtype, n_outputs): rs = np.random.RandomState(0) X = rs.random((1000, 10)).astype(dtype) g = rs.normal(size=(X.shape[0], n_outputs)) h = rs.random(g.shape) + 0.1 + return X, g, h + + +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_models(Model, model_dataset): + X, g, h = model_dataset model = ( Model() .set_random_state(np.random.RandomState(2)) .fit(cn.array(X), cn.array(g), cn.array(h)) ) - onnx_pred = pred_onnx_model(model.to_onnx(X.dtype), X, n_outputs) - lb_pred = model.predict(cn.array(X)) - assert onnx_pred.shape == lb_pred.shape - assert np.allclose(onnx_pred, lb_pred, atol=1e-3 if dtype == np.float32 else 1e-6) + compare_onnx_predictions(model, X) @pytest.mark.parametrize("n_outputs", [1, 5]) @@ -48,27 +57,58 @@ def test_init(n_outputs): y = np.full((3, n_outputs), 5.0, dtype=np.float32) estimator = lb.LBRegressor(n_estimators=0, 
random_state=0).fit(X, y) assert np.all(estimator.model_init_ == 5.0) - assert np.all(estimator.predict(X) == 5.0) - assert np.all( - pred_onnx_estimator(estimator.to_onnx(X.dtype), X.__array__(), 1) == 5.0 + compare_onnx_predictions(estimator, X) + + +@pytest.fixture +def regression_dataset(dtype, n_outputs): + from sklearn.datasets import make_regression + + X, y = make_regression( + n_samples=1000, + n_features=10, + n_informative=5, + n_targets=n_outputs, + random_state=0, ) + # make labels strictly positive for certain objectives + return X.astype(dtype), np.abs(y.astype(dtype)) @pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) -@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("objective", lb.objectives.REGRESSION_OBJECTIVES) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_estimator(Model, n_outputs, dtype): - rs = np.random.RandomState(0) - X = rs.random((1000, 10)).astype(dtype) - y = rs.random((1000, n_outputs)).astype(dtype) +@pytest.mark.parametrize("n_outputs", [1, 5]) +def test_regressor(Model, objective, regression_dataset): + X, y = regression_dataset + if objective in [ + "quantile", + "gamma_deviance", + "gamma", + ] and (y.ndim > 1 and y.shape[1] > 1): + pytest.skip("skipping quantile, gamma and gamma_deviance for multiple outputs") model = lb.LBRegressor( - n_estimators=10, + n_estimators=2, + objective=objective, base_models=(Model(),), random_state=0, ).fit(X, y) - assert np.allclose( - model.predict(X), - pred_onnx_estimator(model.to_onnx(X.dtype), X.__array__(), 1).squeeze(), - atol=1e-3, - ) + compare_onnx_predictions(model, X) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("max_depth", list(range(0, 12, 3))) +def test_tree(regression_dataset, max_depth): + # test tree depths more exhaustively + # some edge cases e.g. 
max_depth=0 + X, y = regression_dataset + model = lb.LBRegressor( + init=None, + n_estimators=2, + base_models=(lb.models.Tree(max_depth=max_depth),), + random_state=0, + ).fit(X, y) + + compare_onnx_predictions(model, X) diff --git a/legateboost/test/test_with_hypothesis.py b/legateboost/test/test_with_hypothesis.py index 22275f67..c9312079 100644 --- a/legateboost/test/test_with_hypothesis.py +++ b/legateboost/test/test_with_hypothesis.py @@ -1,4 +1,5 @@ import numpy as np +import onnxruntime as ort from hypothesis import HealthCheck, Verbosity, assume, given, settings, strategies as st from sklearn.preprocessing import StandardScaler @@ -25,15 +26,15 @@ @st.composite def tree_strategy(draw): if get_legate_runtime().machine.count(TaskTarget.GPU) > 0: - max_depth = draw(st.integers(1, 8)) + max_depth = draw(st.integers(0, 8)) else: - max_depth = draw(st.integers(1, 6)) - alpha = draw(st.floats(0.0, 1.0)) + max_depth = draw(st.integers(0, 6)) + l2_regularization = draw(st.floats(0.0, 1.0)) split_samples = draw(st.integers(1, 500)) feature_fraction = draw(st.sampled_from([0.5, 1.0])) return lb.models.Tree( max_depth=max_depth, - alpha=alpha, + l2_regularization=l2_regularization, split_samples=split_samples, feature_fraction=feature_fraction, ) @@ -41,20 +42,22 @@ def tree_strategy(draw): @st.composite def nn_strategy(draw): - alpha = draw(st.floats(0.0, 1.0)) + l2_regularization = draw(st.floats(0.0, 1.0)) hidden_layer_sizes = draw(st.sampled_from([(), (100,), (100, 100), (10, 10, 10)])) # max iter needs to be sufficiently large, otherwise the models can make the loss # worse (from a bad initialization) max_iter = 200 return lb.models.NN( - alpha=alpha, hidden_layer_sizes=hidden_layer_sizes, max_iter=max_iter + l2_regularization=l2_regularization, + hidden_layer_sizes=hidden_layer_sizes, + max_iter=max_iter, ) @st.composite def linear_strategy(draw): - alpha = draw(st.floats(0.0, 1.0)) - return lb.models.Linear(alpha=alpha) + l2_regularization = draw(st.floats(0.0, 
1.0)) + return lb.models.Linear(l2_regularization=l2_regularization) @st.composite @@ -63,9 +66,11 @@ def krr_strategy(draw): sigma = draw(st.floats(0.1, 1.0)) else: sigma = None - alpha = draw(st.floats(0.0, 1.0)) + l2_regularization = draw(st.floats(0.0, 1.0)) components = draw(st.integers(2, 10)) - return lb.models.KRR(n_components=components, alpha=alpha, sigma=sigma) + return lb.models.KRR( + n_components=components, l2_regularization=l2_regularization, sigma=sigma + ) @st.composite @@ -161,11 +166,20 @@ def test_regressor(model_params, regression_params, regression_dataset): model = lb.LBRegressor(**model_params, **regression_params, verbose=True).fit( X, y, sample_weight=w, eval_result=eval_result ) - model.predict(X) loss = next(iter(eval_result["train"].values())) assert non_increasing(loss, tol=1e-1) sanity_check_models(model) + # check onnx + # for now reshape legate-boost predict to 2-D + # eventually onnx should match the output shape exactly + predict_raw = model.predict_raw(X) + onnx_predict_raw = pred_onnx(model.to_onnx(X.dtype), X) + onnx_predict_raw = onnx_predict_raw.reshape(predict_raw.shape) + assert np.allclose( + predict_raw, onnx_predict_raw, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(predict_raw - onnx_predict_raw) + classification_param_strategy = st.fixed_dictionaries( { @@ -240,12 +254,18 @@ def classification_dataset_strategy(draw): return X, y, w, name +def pred_onnx(onnx, X): + sess = ort.InferenceSession(onnx.SerializeToString()) + return sess.run(None, {"X_in": X})[0] + + @given( general_model_param_strategy, classification_param_strategy, classification_dataset_strategy(), ) @cn.errstate(divide="raise", invalid="raise") +@settings(print_blob=True) def test_classifier( model_params: dict, classification_params: dict, classification_dataset: tuple ) -> None: @@ -256,8 +276,15 @@ def test_classifier( ) model.predict(X) model.predict_proba(X) - model.predict_raw(X) + predict_raw = model.predict_raw(X) loss = 
next(iter(eval_result["train"].values())) # multiclass models with higher learning rates don't always converge if len(model.classes_) == 2: assert non_increasing(loss, 1e-1) + + # check onnx + onnx_predict_raw = pred_onnx(model.to_onnx(X.dtype), X) + onnx_predict_raw = onnx_predict_raw.reshape(predict_raw.shape) + assert np.allclose( + predict_raw, onnx_predict_raw, atol=1e-3 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(predict_raw - onnx_predict_raw) From 4c9501773858573ee28b88bcde6983ae7ab03314 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 4 Apr 2025 00:59:46 -0700 Subject: [PATCH 08/21] Use older TreeEnsemble, predictions as double --- conda/environments/all_cuda-122.yaml | 2 +- dependencies.yaml | 2 +- legateboost/legateboost.py | 6 +- legateboost/models/krr.py | 36 +++--- legateboost/models/linear.py | 11 +- legateboost/models/nn.py | 16 ++- legateboost/models/tree.py | 178 ++++++++++++--------------- legateboost/test/test_onnx.py | 20 ++- pyproject.toml | 2 +- 9 files changed, 137 insertions(+), 136 deletions(-) diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index cf9bd702..6d4e98af 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -30,7 +30,7 @@ dependencies: - numpy - onnx>=1.10 - onnxmltools>=1.10 -- onnxruntime>=1.21 +- onnxruntime - openblas - pydata-sphinx-theme>=0.16 - pytest>=7,<8 diff --git a/dependencies.yaml b/dependencies.yaml index d7501dbd..3700d52e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -178,4 +178,4 @@ dependencies: - xgboost>=2.0 - onnx>=1.10 - onnxmltools>=1.10 - - onnxruntime>=1.21 + - onnxruntime diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 016bb77e..9c40d297 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -562,7 +562,7 @@ def dump_models(self) -> str: def _make_onnx_init(self, X_dtype): # turn self.model_init_ into an ONNX model - from onnx import 
numpy_helper + from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, @@ -582,11 +582,11 @@ def _make_onnx_init(self, X_dtype): one = numpy_helper.from_array(np.array([1], dtype=np.int64), name="one") nodes.append(make_node("Concat", ["n_rows", "one"], ["tile_repeat"], axis=0)) init = numpy_helper.from_array( - np.atleast_2d(self.model_init_.__array__().astype(X_dtype)), name="init" + np.atleast_2d(self.model_init_.__array__()), name="init" ) prediction_out = make_tensor_value_info( "predictions_out", - np_dtype_to_tensor_dtype(X_dtype), + TensorProto.DOUBLE, [None, self.model_init_.shape[0]], ) nodes.append(make_node("Tile", ["init", "tile_repeat"], ["predictions_out"])) diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index cbea5a22..bd0b19f2 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -244,7 +244,7 @@ def __mul__(self, scalar: Any) -> "KRR": return new def to_onnx(self, X_dtype) -> Any: - from onnx import numpy_helper + from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, @@ -257,14 +257,6 @@ def to_onnx(self, X_dtype) -> Any: assert self.X_train.dtype == self.betas_.dtype - def make_constant_node(value: cn.array, name: str) -> Any: - return make_node( - "Constant", - inputs=[], - value=numpy_helper.from_array(value, name=name), - outputs=[name], - ) - nodes = [] # model constants @@ -279,33 +271,34 @@ def make_constant_node(value: cn.array, name: str) -> Any: ) predictions_in = make_tensor_value_info( "predictions_in", - np_dtype_to_tensor_dtype(self.betas_.dtype), + TensorProto.DOUBLE, [None, n_outputs], ) # exanded l2 distance # distance = np.sum(X**2, axis=1)[:, np.newaxis] - 2 * np.dot(X, self.X_train.T) # + np.sum(self.X_train**2, axis=1) - nodes.append(make_constant_node(np.array([1]), "axis1")) + axis1 = numpy_helper.from_array(np.array([1]), name="axis1") 
nodes.append(make_node("ReduceSumSquare", ["X_in", "axis1"], ["XX"])) nodes.append( make_node("Gemm", ["X_in", "X_train"], ["XY"], alpha=-2.0, transB=1) ) nodes.append(make_node("ReduceSumSquare", ["X_train", "axis1"], ["YY"])) - nodes.append(make_constant_node(np.array([1, -1]), "reshape")) + reshape = numpy_helper.from_array( + np.array([1, -1], dtype=np.int64), name="reshape" + ) nodes.append(make_node("Reshape", ["YY", "reshape"], ["YY_reshaped"])) nodes.append(make_node("Add", ["XX", "XY"], ["add0"])) nodes.append(make_node("Add", ["YY_reshaped", "add0"], ["l2"])) - nodes.append(make_constant_node(np.array([0.0], self.betas_.dtype), "zero")) + zero = numpy_helper.from_array(np.array([0.0], self.X_train.dtype), name="zero") nodes.append(make_node("Max", ["l2", "zero"], ["l2_clipped"])) # RBF kernel # K = np.exp(-distance / (2 * self.sigma**2)) if self.sigma is None: raise ValueError("sigma is None. Has fit been called?") - nodes.append( - make_constant_node( - np.array([-2.0 * self.sigma**2], self.betas_.dtype), "denominator" - ) + + denominator = numpy_helper.from_array( + np.array([-2.0 * self.sigma**2], self.X_train.dtype), name="denominator" ) nodes.append(make_node("Div", ["l2_clipped", "denominator"], ["rbf0"])) nodes.append(make_node("Exp", ["rbf0"], ["K"])) @@ -317,14 +310,17 @@ def make_constant_node(value: cn.array, name: str) -> Any: # outputs predictions_out = make_tensor_value_info( "predictions_out", - np_dtype_to_tensor_dtype(self.betas_.dtype), + TensorProto.DOUBLE, [None, n_outputs], ) X_out = make_tensor_value_info( "X_out", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] ) - nodes.append(make_node("Add", ["dot", "predictions_in"], ["predictions_out"])) + nodes.append(make_node("Cast", ["dot"], ["dot_double"], to=TensorProto.DOUBLE)) + nodes.append( + make_node("Add", ["dot_double", "predictions_in"], ["predictions_out"]) + ) nodes.append(make_node("Identity", ["X_in"], ["X_out"])) graph = make_graph( @@ -332,7 +328,7 @@ def 
make_constant_node(value: cn.array, name: str) -> Any: "legateboost.model.KRR", [X_in, predictions_in], [X_out, predictions_out], - [betas, X_train], + [betas, X_train, axis1, reshape, zero, denominator], ) onnx_model = make_model( graph, diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index 90e1ee4c..31ee7a28 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -153,7 +153,7 @@ def __mul__(self, scalar: Any) -> "Linear": return new def to_onnx(self, X_dtype) -> Any: - from onnx import numpy_helper + from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, @@ -178,12 +178,12 @@ def to_onnx(self, X_dtype) -> Any: ) predictions_in = make_tensor_value_info( "predictions_in", - np_dtype_to_tensor_dtype(self.betas_.dtype), + TensorProto.DOUBLE, [None, n_outputs], ) predictions_out = make_tensor_value_info( "predictions_out", - np_dtype_to_tensor_dtype(self.betas_.dtype), + TensorProto.DOUBLE, [None, n_outputs], ) @@ -191,7 +191,10 @@ def to_onnx(self, X_dtype) -> Any: nodes.append(make_node("MatMul", ["X_in", "betas"], ["XBeta"])) nodes.append(make_node("Add", ["XBeta", "intercept"], ["result"])) nodes.append( - make_node("Add", ["result", "predictions_in"], ["predictions_out"]) + make_node("Cast", ["result"], ["result_double"], to=TensorProto.DOUBLE) + ) + nodes.append( + make_node("Add", ["result_double", "predictions_in"], ["predictions_out"]) ) X_out = make_tensor_value_info( "X_out", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index d77e4b24..36733c70 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -183,7 +183,7 @@ def __mul__(self, scalar: Any) -> "NN": return new def to_onnx(self, X_dtype) -> Any: - from onnx import numpy_helper + from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( 
make_graph, @@ -214,7 +214,7 @@ def to_onnx(self, X_dtype) -> Any: ) predictions_in = make_tensor_value_info( "predictions_in", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + TensorProto.DOUBLE, [None, n_outputs], ) nodes = [] @@ -246,14 +246,22 @@ def to_onnx(self, X_dtype) -> Any: nodes.append(make_node("Identity", ["X_in"], ["X_out"])) predictions_out = make_tensor_value_info( "predictions_out", - np_dtype_to_tensor_dtype(self.coefficients_[0].dtype), + TensorProto.DOUBLE, [None, n_outputs], ) + nodes.append( + make_node( + "Cast", + ["activations{}withbias".format(len(self.coefficients_) - 1)], + ["casted"], + to=TensorProto.DOUBLE, + ) + ) nodes.append( make_node( "Add", [ - "activations{}withbias".format(len(self.coefficients_) - 1), + "casted", "predictions_in", ], ["predictions_out"], diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 39e32877..ed612ab1 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -318,144 +318,120 @@ def __mul__(self, scalar: Any) -> "Tree": def to_onnx(self, X_dtype) -> Any: import onnx - from onnx import numpy_helper + from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( make_graph, make_model, make_node, - make_tensor, make_tensor_value_info, np_dtype_to_tensor_dtype, ) onnx_nodes = [] - # We map the legate-boost tree representation to the TreeEnsemble ONNX operator - # the features array, splits array, and leaf weights can be passed unchanged - # ONNX then requires some extra arrays to represent the tree structure - # - nodes_truenodeidx is the index of the left child for a given node - # - nodes_falsenodeidx is the index of the right child for a given node - # - nodes_modes indicates that nodes use a <= comparison operator - # - nodes_trueleafs indicates that the left child is a leaf node - # - nodes_falseleafs indicates that the right child is a leaf node - # - leaf_targetids indicates which output the leaf node 
corresponds to - # ONNX does not support vector leaf so we will repeat the tree n_outputs - # times, each time with a different constant for leaf_targetids - # This is not ideal but I don't see a better way - + num_outputs = self.leaf_value.shape[1] tree_max_nodes = self.feature.size all_nodes_idx = np.arange(tree_max_nodes) nodes_featureids = self.feature.__array__() - nodes_splits = numpy_helper.from_array( - self.split_value.__array__().astype(X_dtype) - ) nodes_truenodeids = self.left_child(all_nodes_idx) - # get the left child of each node and check if it is a leaf - # if the node is already leaf then its child can go off the end of the array - # use np.minimum to avoid this - nodes_trueleafs = self.is_leaf( - np.minimum(tree_max_nodes - 1, self.left_child(all_nodes_idx)) - ).astype(int) nodes_falsenodeids = self.right_child(all_nodes_idx) - nodes_falseleafs = self.is_leaf( - np.minimum(tree_max_nodes - 1, self.right_child(all_nodes_idx)) - ).astype(int) - if self.is_leaf(0): - # we have a decision stump - # according to the onnx operator we must set - # true/false at root to the leaf at 0 - nodes_falsenodeids[0] = 0 - nodes_truenodeids[0] = 0 - nodes_trueleafs[0] = 0 - nodes_falseleafs[0] = 0 - num_outputs = self.leaf_value.shape[1] - for output_idx in range(0, num_outputs): - leaf_targetids = np.full(self.feature.size, output_idx, dtype=np.int64) - leaf_weights = numpy_helper.from_array( - self.leaf_value[:, output_idx].__array__().astype(X_dtype) + node_modes = np.full(tree_max_nodes, "BRANCH_LEQ") + node_modes[self.is_leaf(all_nodes_idx)] = "LEAF" + leaf_targetids = np.full(tree_max_nodes, 0, dtype=np.int64) + # predict the leaf node index + # use it to later index into the 2d array of leaf weights + # as ONNX does not support 2d leaf weights + target_weights = all_nodes_idx.astype(np.float32) + kwargs = {} + # TreeEnsembleRegressor asks us to pass these as tensors when X_dtype is double + if X_dtype == np.float32: + kwargs["nodes_values"] = 
self.split_value.__array__() + kwargs["target_weights"] = target_weights + else: + kwargs["nodes_values_as_tensor"] = numpy_helper.from_array( + self.split_value.__array__(), name="nodes_values" ) - - onnx_nodes.append( - make_node( - "TreeEnsemble", - ["X_in"], - ["pred" + str(output_idx)], - domain="ai.onnx.ml", - n_targets=self.leaf_value.shape[1], - membership_values=None, - nodes_missing_value_tracks_true=None, - nodes_hitrates=None, - aggregate_function=1, - post_transform=0, - tree_roots=[0], - nodes_modes=make_tensor( - "nodes_modes", - onnx.TensorProto.UINT8, - self.feature.shape, - np.zeros_like(self.feature, dtype=np.uint8), - ), - nodes_featureids=nodes_featureids, - nodes_splits=nodes_splits, - nodes_truenodeids=nodes_truenodeids, - nodes_trueleafs=nodes_trueleafs, - nodes_falsenodeids=nodes_falsenodeids, - nodes_falseleafs=nodes_falseleafs, - leaf_targetids=leaf_targetids, - leaf_weights=leaf_weights, - ) + kwargs["target_weights_as_tensor"] = numpy_helper.from_array( + target_weights.astype(np.float64), name="target_weights" ) - if output_idx == 0: - onnx_nodes.append( - onnx.helper.make_node( - "Identity", - ["pred" + str(output_idx)], - ["accumulated_pred0"], - ) - ) - else: - onnx_nodes.append( - onnx.helper.make_node( - "Add", - [ - "accumulated_pred" + str(output_idx - 1), - "pred" + str(output_idx), - ], - ["accumulated_pred" + str(output_idx)], - ) - ) + # TreeEnsembleRegressor is deprecated, but its successor TreeEnsemble + # is at the time of writing not available from onnxruntime on conda-forge + # This can be updated at some point without too much trouble + onnx_nodes.append( + make_node( + "TreeEnsembleRegressor", + ["X_in"], + ["predicted_leaf_index"], + domain="ai.onnx.ml", + n_targets=1, + membership_values=None, + nodes_missing_value_tracks_true=None, + nodes_hitrates=None, + nodes_modes=node_modes, + nodes_featureids=nodes_featureids, + nodes_truenodeids=nodes_truenodeids, + nodes_falsenodeids=nodes_falsenodeids, + 
nodes_nodeids=all_nodes_idx, + nodes_treeids=np.zeros(tree_max_nodes, dtype=np.int64), + target_ids=leaf_targetids, + target_nodeids=all_nodes_idx, + target_treeids=np.zeros(tree_max_nodes, dtype=np.int64), + **kwargs, + ) + ) - X_in = make_tensor_value_info( - "X_in", np_dtype_to_tensor_dtype(X_dtype), [None, None] + leaf_weights = numpy_helper.from_array( + self.leaf_value.__array__(), name="leaf_weights" ) - X_out = make_tensor_value_info( - "X_out", np_dtype_to_tensor_dtype(X_dtype), [None, None] + predictions_out = make_tensor_value_info( + "predictions_out", TensorProto.DOUBLE, [None, num_outputs] ) - predictions_in = make_tensor_value_info( - "predictions_in", np_dtype_to_tensor_dtype(X_dtype), [None, num_outputs] + # make indices 1-d + onnx_nodes.append( + make_node( + "Squeeze", ["predicted_leaf_index"], ["predicted_leaf_index_squeezed"] + ) ) - predictions_out = make_tensor_value_info( - "predictions_out", np_dtype_to_tensor_dtype(X_dtype), [None, num_outputs] + onnx_nodes.append( + make_node( + "Cast", + ["predicted_leaf_index_squeezed"], + ["predicted_leaf_index_int"], + to=TensorProto.INT32, + ) ) - onnx_nodes.append(make_node("Identity", ["X_in"], ["X_out"])) onnx_nodes.append( make_node( - "Add", - ["predictions_in", "accumulated_pred" + str(num_outputs - 1)], - ["predictions_out"], + "Gather", ["leaf_weights", "predicted_leaf_index_int"], ["gathered"] ) ) + predictions_in = make_tensor_value_info( + "predictions_in", TensorProto.DOUBLE, [None, num_outputs] + ) + onnx_nodes.append( + make_node("Add", ["predictions_in", "gathered"], ["predictions_out"]) + ) + + X_in = make_tensor_value_info( + "X_in", np_dtype_to_tensor_dtype(X_dtype), [None, None] + ) + X_out = make_tensor_value_info( + "X_out", np_dtype_to_tensor_dtype(X_dtype), [None, None] + ) + onnx_nodes.append(make_node("Identity", ["X_in"], ["X_out"])) graph = make_graph( onnx_nodes, "legateboost.models.Tree", [X_in, predictions_in], [X_out, predictions_out], + [leaf_weights], ) model = 
make_model( graph, opset_imports=[ - onnx.helper.make_opsetid("ai.onnx.ml", 5), + onnx.helper.make_opsetid("ai.onnx.ml", 3), onnx.helper.make_opsetid("", 21), ], ) diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 8a33b743..62ad44e4 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -13,13 +13,14 @@ def compare_onnx_predictions(estimator, X): } if isinstance(estimator, lb.models.BaseModel): pred = estimator.predict(cn.array(X)) - feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1]), dtype=X.dtype) + feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1])) onnx_pred = sess.run(None, feeds)[1] else: pred = estimator.predict_raw(cn.array(X)) onnx_pred = sess.run(None, feeds)[0] onnx_pred = onnx_pred.squeeze() + assert onnx_pred.dtype == np.float64 pred = pred.squeeze() assert pred.shape == onnx_pred.shape assert np.allclose( @@ -112,3 +113,20 @@ def test_tree(regression_dataset, max_depth): ).fit(X, y) compare_onnx_predictions(model, X) + + +@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize("n_outputs", [1]) +def test_small_tree(regression_dataset, dtype, n_outputs): + max_depth = 0 + # test tree depths more exhaustively + # some edge cases e.g. 
max_depth=0 + X, y = regression_dataset + model = lb.LBRegressor( + init=None, + n_estimators=2, + base_models=(lb.models.Tree(max_depth=max_depth),), + random_state=0, + ).fit(X, y) + + compare_onnx_predictions(model, X) diff --git a/pyproject.toml b/pyproject.toml index f6ab85c1..e8269341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ test = [ "notebook>=7", "onnx>=1.10", "onnxmltools>=1.10", - "onnxruntime>=1.21", + "onnxruntime", "pytest>=7,<8", "seaborn>=0.13", "xgboost>=2.0", From 438958c94d82632c9325755f68a96e8272f681cf Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 4 Apr 2025 01:41:03 -0700 Subject: [PATCH 09/21] Update docs --- legateboost/legateboost.py | 23 +++++++-- legateboost/models/base_model.py | 8 ++- legateboost/objectives.py | 3 -- legateboost/test/test_onnx.py | 88 ++++++++++++++++++++++---------- 4 files changed, 87 insertions(+), 35 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 9c40d297..49c7077c 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -614,7 +614,11 @@ def _make_onnx_init(self, X_dtype): return onnx_model def to_onnx(self, X_dtype, predict_function="predict"): - """Converts the model to an ONNX model. + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. Parameters ---------- @@ -631,6 +635,19 @@ def to_onnx(self, X_dtype, predict_function="predict"): ------- Any The ONNX model. 
+ + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.random(X.shape[0]) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X.dtype).SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) + >>> """ from onnx.checker import check_model from onnx.compose import merge_models @@ -823,10 +840,10 @@ class LBRegressor(RegressorMixin, LBBase): Examples -------- >>> import cupynumeric as cn - >>> import legateboost as lbst + >>> import legateboost as lb >>> X = cn.random.random((1000, 10)) >>> y = cn.random.random(X.shape[0]) - >>> model = lbst.LBRegressor(n_estimators=5).fit(X, y) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) >>> """ diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index a1e88011..07757ab6 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -127,9 +127,15 @@ def __mul__(self, scalar: Any) -> "BaseModel": def __hash__(self) -> int: return hash(str(self)) - def to_onnx(self) -> Any: + def to_onnx(self, X_dtype) -> Any: """Convert the model to an ONNX model. + The implemented ONNX model should accept the following two inputs: + - "X_in" : 2D tensor of shape (n_samples, n_features) and type `X_dtype`. + - "predictions in" : 2D tensor of shape (n_samples, n_outputs) and type double. + The model should output: + - "predictions out" : 2D tensor of shape (n_samples, n_outputs) and type double. 
+ Returns ------- Any diff --git a/legateboost/objectives.py b/legateboost/objectives.py index e48b7054..abe4715e 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -645,7 +645,4 @@ def initialise_prediction( "log_loss", "multi_label", "exp", - "quantile", - "gamma_deviance", - "gamma", ] diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 62ad44e4..7b48fad6 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -6,19 +6,14 @@ import legateboost as lb -def compare_onnx_predictions(estimator, X): - sess = ort.InferenceSession(estimator.to_onnx(X.dtype).SerializeToString()) +def compare_model_predictions(model, X): + sess = ort.InferenceSession(model.to_onnx(X.dtype).SerializeToString()) feeds = { "X_in": X, } - if isinstance(estimator, lb.models.BaseModel): - pred = estimator.predict(cn.array(X)) - feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1])) - onnx_pred = sess.run(None, feeds)[1] - else: - pred = estimator.predict_raw(cn.array(X)) - onnx_pred = sess.run(None, feeds)[0] - + pred = model.predict(cn.array(X)) + feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1])) + onnx_pred = sess.run(None, feeds)[1] onnx_pred = onnx_pred.squeeze() assert onnx_pred.dtype == np.float64 pred = pred.squeeze() @@ -28,6 +23,25 @@ def compare_onnx_predictions(estimator, X): ), np.linalg.norm(pred - onnx_pred) +def compare_estimator_predictions(estimator, X, predict_function): + sess = ort.InferenceSession( + estimator.to_onnx(X.dtype, predict_function).SerializeToString() + ) + feeds = { + "X_in": X, + } + pred = estimator.predict_raw(cn.array(X)) + onnx_pred = sess.run(None, feeds)[0] + + onnx_pred = onnx_pred.squeeze() + assert onnx_pred.dtype == np.float64 + pred = pred.squeeze() + assert pred.shape == onnx_pred.shape + assert np.allclose( + onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 + ), np.linalg.norm(pred - onnx_pred) + + @pytest.fixture def 
model_dataset(dtype, n_outputs): rs = np.random.RandomState(0) @@ -48,7 +62,7 @@ def test_models(Model, model_dataset): .fit(cn.array(X), cn.array(g), cn.array(h)) ) - compare_onnx_predictions(model, X) + compare_model_predictions(model, X) @pytest.mark.parametrize("n_outputs", [1, 5]) @@ -58,7 +72,7 @@ def test_init(n_outputs): y = np.full((3, n_outputs), 5.0, dtype=np.float32) estimator = lb.LBRegressor(n_estimators=0, random_state=0).fit(X, y) assert np.all(estimator.model_init_ == 5.0) - compare_onnx_predictions(estimator, X) + compare_estimator_predictions(estimator, X, "predict_raw") @pytest.fixture @@ -95,30 +109,48 @@ def test_regressor(Model, objective, regression_dataset): random_state=0, ).fit(X, y) - compare_onnx_predictions(model, X) + compare_estimator_predictions(model, X, "predict_raw") + + +@pytest.fixture +def classification_dataset(dtype, n_outputs): + from sklearn.datasets import make_classification + + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=5, + n_classes=n_outputs, + random_state=0, + ) + return X.astype(dtype), np.abs(y.astype(dtype)) +@pytest.mark.parametrize("Model", [M for M in lb.models.BaseModel.__subclasses__()]) +@pytest.mark.parametrize("objective", lb.objectives.CLASSIFICATION_OBJECTIVES) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("n_outputs", [1, 5]) -@pytest.mark.parametrize("max_depth", list(range(0, 12, 3))) -def test_tree(regression_dataset, max_depth): - # test tree depths more exhaustively - # some edge cases e.g. 
max_depth=0 - X, y = regression_dataset - model = lb.LBRegressor( - init=None, +@pytest.mark.parametrize("n_outputs", [2, 5]) +def test_classifier(Model, objective, classification_dataset): + X, y = classification_dataset + if objective == "multi_label": + # encode labels as one-hot + encoded = np.zeros((y.shape[0], int(y.max() + 1))) + encoded[np.arange(y.shape[0]), y.astype(int)] = 1 + y = encoded + model = lb.LBClassifier( n_estimators=2, - base_models=(lb.models.Tree(max_depth=max_depth),), + objective=objective, + base_models=(Model(),), random_state=0, ).fit(X, y) - compare_onnx_predictions(model, X) + compare_estimator_predictions(model, X, "predict_raw") -@pytest.mark.parametrize("dtype", [np.float32]) -@pytest.mark.parametrize("n_outputs", [1]) -def test_small_tree(regression_dataset, dtype, n_outputs): - max_depth = 0 +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("n_outputs", [1, 5]) +@pytest.mark.parametrize("max_depth", list(range(0, 12, 3))) +def test_tree(regression_dataset, max_depth): # test tree depths more exhaustively # some edge cases e.g. 
max_depth=0 X, y = regression_dataset @@ -129,4 +161,4 @@ def test_small_tree(regression_dataset, dtype, n_outputs): random_state=0, ).fit(X, y) - compare_onnx_predictions(model, X) + compare_estimator_predictions(model, X, "predict_raw") From 1c74de5967dc49961c204960028b80439143ed07 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 10 Apr 2025 01:26:56 -0700 Subject: [PATCH 10/21] Implement normal onnx operator --- legateboost/legateboost.py | 81 ++++++++++++++++-- legateboost/models/base_model.py | 9 +- legateboost/models/krr.py | 2 +- legateboost/models/linear.py | 2 +- legateboost/models/nn.py | 2 +- legateboost/models/tree.py | 10 +-- legateboost/objectives.py | 138 ++++++++++++++++++++++++++++++- legateboost/test/test_onnx.py | 10 +-- 8 files changed, 230 insertions(+), 24 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 49c7077c..48f6a228 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -613,7 +613,7 @@ def _make_onnx_init(self, X_dtype): return onnx_model - def to_onnx(self, X_dtype, predict_function="predict"): + def to_onnx(self, X: cn.ndarray, predict_function="predict"): """Converts the estimator to an ONNX model which is expected to produce equivalent predictions to `predict_function` up to reasonable floating point tolerance. The ONNX model is hard coded to the X input data type, @@ -622,11 +622,9 @@ def to_onnx(self, X_dtype, predict_function="predict"): Parameters ---------- - X_dtype : numpy.dtype - The expected data type of the input data. ONNX models hard - code the data type of the input data and will crash if this is - not set correctly. - Can be np.float32 or np.float64. + X: + Example input data. Use to infer input data characteristics. + A model produced for float32 will not accept float64 input and vice versa. predict_function : str The serialised ONNX model can produce output equivalent to 'predict', 'predict_proba', or 'predict_raw'. 
@@ -649,14 +647,28 @@ def to_onnx(self, X_dtype, predict_function="predict"): >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) >>> """ + if predict_function not in ["predict", "predict_proba", "predict_raw"]: + raise ValueError( + "predict_function should be one of " + "['predict', 'predict_proba', 'predict_raw']" + ) + + from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.compose import merge_models + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) - model = self._make_onnx_init(X_dtype) + model = self._make_onnx_init(X.dtype) if self.models_ is not None and len(self.models_) > 0: model = merge_models( model, - self.models_[0].to_onnx(X_dtype), + self.models_[0].to_onnx(X), io_map=[("X_out", "X_in"), ("predictions_out", "predictions_in")], prefix2="model_0_", ) @@ -664,7 +676,7 @@ def to_onnx(self, X_dtype, predict_function="predict"): for i in range(1, len(self.models_)): model = merge_models( model, - self.models_[i].to_onnx(X_dtype), + self.models_[i].to_onnx(X), io_map=[ ("model_{}_X_out".format(i - 1), "X_in"), ("model_{}_predictions_out".format(i - 1), "predictions_in"), @@ -676,6 +688,57 @@ def to_onnx(self, X_dtype, predict_function="predict"): # add a transform operator model.graph.output.remove(model.graph.output[0]) + # add any transform from the objective + if predict_function == "predict": + model = merge_models( + model, + self._objective_instance.onnx_transform(), + io_map=[ + ( + "model_{}_predictions_out".format(len(self.models_) - 1), + "predictions_in", + ) + ], + prefix2="transform_", + ) + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + + extra_out_shape = [] if test_pred.ndim == 1 else list(test_pred.shape[1:]) + shape = numpy_helper.from_array( + np.array([-1] + extra_out_shape), name="shape" + ) + + reshape_predictions_in = 
make_tensor_value_info( + "reshape_predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + reshaped_predictions = make_tensor_value_info( + "reshaped_predictions", + TensorProto.DOUBLE, + shape=[None] + list(extra_out_shape), + ) + nodes = [ + make_node( + "Reshape", + ["reshape_predictions_in", "shape"], + ["reshaped_predictions"], + ) + ] + graph = make_graph( + nodes, + "legateboost estimator transform", + [reshape_predictions_in], + [reshaped_predictions], + [shape], + ) + model = merge_models( + model, + make_model(graph, opset_imports=[make_opsetid("", 21)]), + io_map=[("transform_predictions_out", "reshape_predictions_in")], + ) + check_model(model) return model diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index 07757ab6..45fac68d 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -127,7 +127,7 @@ def __mul__(self, scalar: Any) -> "BaseModel": def __hash__(self) -> int: return hash(str(self)) - def to_onnx(self, X_dtype) -> Any: + def to_onnx(self, X) -> Any: """Convert the model to an ONNX model. The implemented ONNX model should accept the following two inputs: @@ -136,6 +136,13 @@ def to_onnx(self, X_dtype) -> Any: The model should output: - "predictions out" : 2D tensor of shape (n_samples, n_outputs) and type double. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Example input X matrix. Used to infer type and shape of the input. + + y_pred : ndarray of shape (n_samples,) + The predicted labels. 
Returns ------- Any diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index bd0b19f2..da2e22ce 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -243,7 +243,7 @@ def __mul__(self, scalar: Any) -> "KRR": self.betas_ *= scalar return new - def to_onnx(self, X_dtype) -> Any: + def to_onnx(self, X) -> Any: from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index 31ee7a28..de88ef2c 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -152,7 +152,7 @@ def __mul__(self, scalar: Any) -> "Linear": new.betas_ *= scalar return new - def to_onnx(self, X_dtype) -> Any: + def to_onnx(self, X) -> Any: from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index 36733c70..d628b6e5 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -182,7 +182,7 @@ def __mul__(self, scalar: Any) -> "NN": new.biases_[-1] *= scalar return new - def to_onnx(self, X_dtype) -> Any: + def to_onnx(self, X) -> Any: from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index ed612ab1..c6711172 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -316,7 +316,7 @@ def __mul__(self, scalar: Any) -> "Tree": new.leaf_value *= scalar return new - def to_onnx(self, X_dtype) -> Any: + def to_onnx(self, X) -> Any: import onnx from onnx import TensorProto, numpy_helper from onnx.checker import check_model @@ -344,8 +344,8 @@ def to_onnx(self, X_dtype) -> Any: # as ONNX does not support 2d leaf weights target_weights = all_nodes_idx.astype(np.float32) kwargs = {} - # TreeEnsembleRegressor asks us to pass these as tensors when X_dtype is double - if 
X_dtype == np.float32: + # TreeEnsembleRegressor asks us to pass these as tensors when X.dtype is double + if X.dtype == np.float32: kwargs["nodes_values"] = self.split_value.__array__() kwargs["target_weights"] = target_weights else: @@ -415,10 +415,10 @@ def to_onnx(self, X_dtype) -> Any: ) X_in = make_tensor_value_info( - "X_in", np_dtype_to_tensor_dtype(X_dtype), [None, None] + "X_in", np_dtype_to_tensor_dtype(X.dtype), [None, None] ) X_out = make_tensor_value_info( - "X_out", np_dtype_to_tensor_dtype(X_dtype), [None, None] + "X_out", np_dtype_to_tensor_dtype(X.dtype), [None, None] ) onnx_nodes.append(make_node("Identity", ["X_in"], ["X_out"])) graph = make_graph( diff --git a/legateboost/objectives.py b/legateboost/objectives.py index abe4715e..755f31d0 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from typing import Tuple +import numpy as np from scipy.stats import norm from typing_extensions import TypeAlias, override @@ -70,6 +71,54 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """ return pred + def onnx_transform(self) -> cn.ndarray: + """Returns an ONNX model that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the transformed predictions. + - "predictions_out" : arbitrary tensor depending on the objective. + + Is by default the identity transform. + + The ONNX transform should produce the same output as the transform + method for each objective. + + Returns: + Onnx model that transforms the predictions. 
+ """ + from onnx import TensorProto + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None], + ) + nodes = [make_node("Identity", ["predictions_in"], ["predictions_out"])] + graph = make_graph( + nodes, + "BaseModel", + [predictions_in], + [predictions_out], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + @abstractmethod def metric(self) -> BaseMetric: """Returns the default error metric for the objective function. @@ -243,6 +292,93 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: pred[:, :, 1] = cn.clip(pred[:, :, 1], -5, 5) return pred + def onnx_transform(self) -> cn.ndarray: + from onnx import TensorProto, numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None, 2], + ) + nodes = [] + # clip + mininmum = numpy_helper.from_array( + np.array(-5, dtype=np.float64), name="minimum" + ) + maximum = numpy_helper.from_array(np.array(5, dtype=np.float64), name="maximum") + # reshape + out_shape = numpy_helper.from_array( + np.array([0, -1, 2], dtype=np.int64), name="out_shape" + ) + nodes.append( + make_node("Reshape", ["predictions_in", "out_shape"], ["reshaped"]) + ) + + nodes.append(make_node("Shape", ["reshaped"], ["new_shape"])) + + var_starts = numpy_helper.from_array( + np.array([0, 0, 1], dtype=np.int64), name="var_starts" + ) + mean_starts = 
numpy_helper.from_array( + np.array([0, 0, 0], dtype=np.int64), name="mean_starts" + ) + + # extract mean and variance parts + axis = numpy_helper.from_array(np.array([0, 1, 2], dtype=np.int64), name="axis") + steps = numpy_helper.from_array( + np.array([1, 1, 2], dtype=np.int64), name="steps" + ) + nodes.append( + make_node( + "Slice", + ["reshaped", "var_starts", "new_shape", "axis", "steps"], + ["variance"], + ) + ) + nodes.append( + make_node( + "Slice", + ["reshaped", "mean_starts", "new_shape", "axis", "steps"], + ["mean"], + ) + ) + nodes.append( + make_node("Clip", ["variance", "minimum", "maximum"], ["clipped_variance"]) + ) + + # combine them again + nodes.append( + make_node( + "Concat", ["mean", "clipped_variance"], ["predictions_out"], axis=2 + ) + ) + graph = make_graph( + nodes, + "NormalObjective", + [predictions_in], + [predictions_out], + [out_shape, var_starts, mean_starts, axis, steps, mininmum, maximum], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + @override def mean(self, param: cn.ndarray) -> cn.ndarray: """Return the mean for the Normal distribution.""" @@ -421,7 +557,7 @@ def var(self, param: cn.ndarray) -> cn.ndarray: class QuantileObjective(BaseObjective): """Minimises the quantile loss, otherwise known as check loss or pinball loss. 
- :math:`L(y_i, p_i) = \\frac{1}{k}\\sum_{j=1}^{k} (q_j - \\mathbb{1})(y_i - p_{i, j})` + :math:`L(y_i, p_i) = \\frac{1}{k}\\sum_{j=1}^{k} (q_j - \\mathbb{1})(y_i - p_{i, j})` where diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 7b48fad6..fc686b1f 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -7,7 +7,7 @@ def compare_model_predictions(model, X): - sess = ort.InferenceSession(model.to_onnx(X.dtype).SerializeToString()) + sess = ort.InferenceSession(model.to_onnx(X).SerializeToString()) feeds = { "X_in": X, } @@ -25,17 +25,16 @@ def compare_model_predictions(model, X): def compare_estimator_predictions(estimator, X, predict_function): sess = ort.InferenceSession( - estimator.to_onnx(X.dtype, predict_function).SerializeToString() + estimator.to_onnx(X, predict_function).SerializeToString() ) feeds = { "X_in": X, } - pred = estimator.predict_raw(cn.array(X)) + pred_method = getattr(estimator, predict_function) + pred = pred_method(cn.array(X)) onnx_pred = sess.run(None, feeds)[0] - onnx_pred = onnx_pred.squeeze() assert onnx_pred.dtype == np.float64 - pred = pred.squeeze() assert pred.shape == onnx_pred.shape assert np.allclose( onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 @@ -110,6 +109,7 @@ def test_regressor(Model, objective, regression_dataset): ).fit(X, y) compare_estimator_predictions(model, X, "predict_raw") + compare_estimator_predictions(model, X, "predict") @pytest.fixture From 692a1a5200e4b608c9584fffecf8446f03e5b8f4 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 10 Apr 2025 02:35:18 -0700 Subject: [PATCH 11/21] Implement remaining transforms --- legateboost/legateboost.py | 2 +- legateboost/objectives.py | 218 ++++++++++++++++++++++++++++- legateboost/test/test_objective.py | 28 ++++ 3 files changed, 245 insertions(+), 3 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 48f6a228..62dc48f7 100644 --- 
a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -692,7 +692,7 @@ def to_onnx(self, X: cn.ndarray, predict_function="predict"): if predict_function == "predict": model = merge_models( model, - self._objective_instance.onnx_transform(), + self._objective_instance.onnx_transform(self.predict_raw(X[0:1])), io_map=[ ( "model_{}_predictions_out".format(len(self.models_) - 1), diff --git a/legateboost/objectives.py b/legateboost/objectives.py index 755f31d0..5c59330d 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -44,6 +44,7 @@ class BaseObjective(ABC): # utility constant one = cn.ones(1, dtype=cn.float64) + half = cn.array(0.5, dtype=cn.float64) @abstractmethod def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: @@ -71,7 +72,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """ return pred - def onnx_transform(self) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: """Returns an ONNX model that accepts - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. And outputs the transformed predictions. 
@@ -292,7 +293,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: pred[:, :, 1] = cn.clip(pred[:, :, 1], -5, 5) return pred - def onnx_transform(self) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: from onnx import TensorProto, numpy_helper from onnx.checker import check_model from onnx.helper import ( @@ -457,6 +458,45 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """Inverse log link.""" return cn.exp(pred) + @override + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + from onnx import TensorProto + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None], + ) + nodes = [] + # exp + nodes.append(make_node("Exp", ["predictions_in"], ["predictions_out"])) + + graph = make_graph( + nodes, + "GammaDevianceObjective", + [predictions_in], + [predictions_out], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + def initialise_prediction( self, y: cn.ndarray, w: cn.ndarray, boost_from_average: bool ) -> cn.ndarray: @@ -501,6 +541,53 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: assert pred.ndim == 3 return cn.exp(pred) + @override + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + from onnx import TensorProto, numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None, 2], + ) + nodes = [] + # reshape + 
out_shape = numpy_helper.from_array( + np.array([0, -1, 2], dtype=np.int64), name="out_shape" + ) + nodes.append( + make_node("Reshape", ["predictions_in", "out_shape"], ["reshaped"]) + ) + # exp + nodes.append(make_node("Exp", ["reshaped"], ["predictions_out"])) + + graph = make_graph( + nodes, + "GammaObjective", + [predictions_in], + [predictions_out], + [out_shape], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + @override def metric(self) -> GammaLLMetric: return GammaLLMetric() @@ -647,6 +734,46 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: div = cn.sum(e_x, axis=1) return e_x / div[:, cn.newaxis] + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + from onnx import TensorProto + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None], + ) + nodes = [] + if pred.shape[1] == 1: + nodes.append(make_node("Sigmoid", ["predictions_in"], ["predictions_out"])) + else: + nodes.append(make_node("Softmax", ["predictions_in"], ["predictions_out"])) + graph = make_graph( + nodes, + "LogLossObjective", + [predictions_in], + [predictions_out], + [], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + def metric(self) -> LogLossMetric: return LogLossMetric() @@ -683,6 +810,43 @@ def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: def transform(self, pred: cn.ndarray) -> cn.ndarray: return self.one / (self.one + cn.exp(-pred)) + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + from onnx import TensorProto + from onnx.checker import check_model + from onnx.helper import ( + 
make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None], + ) + nodes = [] + nodes.append(make_node("Sigmoid", ["predictions_in"], ["predictions_out"])) + graph = make_graph( + nodes, + "MultiLabelObjective", + [predictions_in], + [predictions_out], + [], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + def output_class(self, pred: cn.ndarray) -> cn.ndarray: return cn.array(pred > 0.5, dtype=cn.int32).squeeze() @@ -750,6 +914,56 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: K = pred.shape[1] # number of classes return logloss.transform((1 / (K - 1)) * pred) + def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + from onnx import TensorProto, numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, None], + ) + + nodes = [] + initializers = [] + if pred.shape[1] == 1: + two = numpy_helper.from_array(np.array(2, dtype=np.float64), name="two") + nodes.append(make_node("Mul", ["predictions_in", "two"], ["multiplied"])) + nodes.append(make_node("Sigmoid", ["multiplied"], ["predictions_out"])) + initializers.append(two) + else: + constant = numpy_helper.from_array( + np.array(1 / (pred.shape[1] - 1), dtype=np.float64), name="constant" + ) + nodes.append(make_node("Mul", ["predictions_in", "constant"], ["scaled"])) + nodes.append(make_node("Softmax", ["scaled"], ["predictions_out"])) + initializers.append(constant) + graph = 
make_graph( + nodes, + "ExpObjective", + [predictions_in], + [predictions_out], + initializers, + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + def metric(self) -> ExponentialMetric: return ExponentialMetric() diff --git a/legateboost/test/test_objective.py b/legateboost/test/test_objective.py index aaf8823c..a78ecb80 100644 --- a/legateboost/test/test_objective.py +++ b/legateboost/test/test_objective.py @@ -1,3 +1,5 @@ +import numpy as np +import onnxruntime as ort import pytest import cupynumeric as cn @@ -5,12 +7,26 @@ from legateboost.testing.utils import non_increasing +def compare_onnx_transform(obj, pred): + sess = ort.InferenceSession(obj.onnx_transform(pred).SerializeToString()) + feeds = { + "predictions_in": pred, + } + onnx_transform = sess.run(None, feeds)[0] + assert onnx_transform.dtype == np.float64 + transform = obj.transform(cn.array(pred)) + assert transform.shape == onnx_transform.shape + assert np.allclose(onnx_transform, transform, atol=1e-6) + + def test_normal() -> None: obj = lb.NormalObjective() y = cn.array([[1.0], [2.0], [3.0]]) init = obj.initialise_prediction(y, cn.array([1.0, 1.0, 1.0]), True) assert cn.allclose(init, cn.array([y.mean(), cn.log(y.std())])) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_gamma_deviance() -> None: obj = lb.GammaDevianceObjective() @@ -35,6 +51,8 @@ def test_gamma_deviance() -> None: reg.fit(X, y1, eval_set=[(X, y1)], eval_result=eval_result) assert non_increasing(eval_result["train"]["deviance_gamma"]) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def test_gamma() -> None: import numpy as np @@ -51,6 +69,8 @@ def test_gamma() -> None: reg.fit(X, y, eval_set=[(X, y)], eval_result=eval_result) assert non_increasing(eval_result["train"]["gamma_neg_ll"]) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + def 
test_log_loss() -> None: obj = lb.LogLossObjective() @@ -95,6 +115,9 @@ def test_log_loss() -> None: False, ) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + compare_onnx_transform(obj, np.arange(4).reshape(4, 1).astype(np.float64)) + def test_exp(): obj = lb.ExponentialObjective() @@ -127,6 +150,9 @@ def test_exp(): False, ) + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) + compare_onnx_transform(obj, np.arange(4).reshape(4, 1).astype(np.float64)) + def test_multi_label(): obj = lb.MultiLabelObjective() @@ -139,3 +165,5 @@ def test_multi_label(): with pytest.raises(ValueError, match=r"Expected labels to be in \[0, 1\]"): obj.initialise_prediction(cn.array([[1], [2]]), cn.array([[1.0], [1.0]]), False) + + compare_onnx_transform(obj, np.arange(12).reshape(2, 6).astype(np.float64)) From 2d32592ebce89c18280053e6de1daa2714884094 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 11 Apr 2025 02:24:45 -0700 Subject: [PATCH 12/21] Classifier tests passing for float64 but not float32 --- legateboost/legateboost.py | 349 ++++++++++++++++++++++++---------- legateboost/objectives.py | 103 +++++++++- legateboost/test/test_onnx.py | 4 +- 3 files changed, 357 insertions(+), 99 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 62dc48f7..e6dd45e6 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -560,6 +560,50 @@ def dump_models(self) -> str: text += str(m) return text + def _make_onnx_reshape_predictions(self, pred: cn.ndarray) -> cn.ndarray: + from onnx import TensorProto, numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + np_dtype_to_tensor_dtype, + ) + + # make an onnx model that shapes the predictions equivalently to pred + extra_out_shape = [] if pred.ndim == 1 else list(pred.shape[1:]) + shape = numpy_helper.from_array(np.array([-1] 
+ extra_out_shape), name="shape") + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + np_dtype_to_tensor_dtype(pred.dtype), + shape=[None] + list(extra_out_shape), + ) + nodes = [ + make_node( + "Reshape", + ["predictions_in", "shape"], + ["predictions_out"], + ) + ] + graph = make_graph( + nodes, + "reshape output", + [predictions_in], + [predictions_out], + [shape], + ) + model = make_model(graph, opset_imports=[make_opsetid("", 21)]) + check_model(model) + return model + def _make_onnx_init(self, X_dtype): # turn self.model_init_ into an ONNX model from onnx import TensorProto, numpy_helper @@ -613,56 +657,9 @@ def _make_onnx_init(self, X_dtype): return onnx_model - def to_onnx(self, X: cn.ndarray, predict_function="predict"): - """Converts the estimator to an ONNX model which is expected to produce - equivalent predictions to `predict_function` up to reasonable floating - point tolerance. The ONNX model is hard coded to the X input data type, - separate models should be generated for float and double. The ONNX model - takes "X_in" as input and produces "predictions_out" as output. - - Parameters - ---------- - X: - Example input data. Use to infer input data characteristics. - A model produced for float32 will not accept float64 input and vice versa. - predict_function : str - The serialised ONNX model can produce output equivalent to 'predict', - 'predict_proba', or 'predict_raw'. - The default is "predict". - Returns - ------- - Any - The ONNX model. 
- - Examples - -------- - >>> import numpy as np - >>> import legateboost as lb - >>> X = np.random.random((1000, 10)) - >>> y = np.random.random(X.shape[0]) - >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) - >>> import onnxruntime as ort - >>> sess = ort.InferenceSession(model.to_onnx(X.dtype).SerializeToString()) - >>> onnx_pred = sess.run(None, {"X_in": X})[0] - >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) - >>> - """ - if predict_function not in ["predict", "predict_proba", "predict_raw"]: - raise ValueError( - "predict_function should be one of " - "['predict', 'predict_proba', 'predict_raw']" - ) - - from onnx import TensorProto, numpy_helper + def _to_onnx_predict_raw(self, X: cn.ndarray): from onnx.checker import check_model from onnx.compose import merge_models - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) model = self._make_onnx_init(X.dtype) if self.models_ is not None and len(self.models_) > 0: @@ -685,60 +682,26 @@ def to_onnx(self, X: cn.ndarray, predict_function="predict"): ) # remove the X_out output, we only need the predictions - # add a transform operator model.graph.output.remove(model.graph.output[0]) - # add any transform from the objective - if predict_function == "predict": - model = merge_models( - model, - self._objective_instance.onnx_transform(self.predict_raw(X[0:1])), - io_map=[ - ( - "model_{}_predictions_out".format(len(self.models_) - 1), - "predictions_in", - ) - ], - prefix2="transform_", - ) - # coerce the output shape to be the same as the equivalent predict function - test_pred = getattr(self, predict_function)(X[0:1]) + check_model(model) + return model - extra_out_shape = [] if test_pred.ndim == 1 else list(test_pred.shape[1:]) - shape = numpy_helper.from_array( - np.array([-1] + extra_out_shape), name="shape" - ) + def _to_onnx_predict_transformed(self, X: cn.ndarray): + from onnx.checker import check_model + from onnx.compose import 
merge_models - reshape_predictions_in = make_tensor_value_info( - "reshape_predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - reshaped_predictions = make_tensor_value_info( - "reshaped_predictions", - TensorProto.DOUBLE, - shape=[None] + list(extra_out_shape), - ) - nodes = [ - make_node( - "Reshape", - ["reshape_predictions_in", "shape"], - ["reshaped_predictions"], + model = merge_models( + self._to_onnx_predict_raw(X), + self._objective_instance.onnx_transform(self.predict_raw(X[0:1])), + io_map=[ + ( + "model_{}_predictions_out".format(len(self.models_) - 1), + "predictions_in", ) - ] - graph = make_graph( - nodes, - "legateboost estimator transform", - [reshape_predictions_in], - [reshaped_predictions], - [shape], - ) - model = merge_models( - model, - make_model(graph, opset_imports=[make_opsetid("", 21)]), - io_map=[("transform_predictions_out", "reshape_predictions_in")], - ) - + ], + prefix2="transform_", + ) check_model(model) return model @@ -1029,6 +992,63 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: pred = pred.squeeze(axis=1) return pred + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. + + Parameters + ---------- + X: + Example input data. Use to infer input data characteristics. + A model produced for float32 will not accept float64 input and vice versa. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict' or + 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. 
+ + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.random(X.shape[0]) + >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X).SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) + >>> + """ + from onnx.checker import check_model + from onnx.compose import merge_models + + if predict_function not in ["predict", "predict_raw"]: + raise ValueError( + "predict_function should be one of ['predict', 'predict_raw']" + ) + if predict_function == "predict": + model = self._to_onnx_predict_transformed(X) + else: + model = self._to_onnx_predict_raw(X) + + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + model = merge_models( + model, + self._make_onnx_reshape_predictions(test_pred), + io_map=[(model.graph.output[0].name, "predictions_in")], + prefix2="reshape_", + ) + check_model(model) + return model + class LBClassifier(ClassifierMixin, LBBase): """Implements a gradient boosting algorithm for classification problems. 
@@ -1274,3 +1294,138 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: """ check_is_fitted(self) return self._objective_instance.output_class(self.predict_proba(X)) + + def _mirror_predict_proba_output(self, model) -> cn.ndarray: + assert len(self.classes_) == 2 + from onnx import TensorProto, numpy_helper + from onnx.checker import check_model + from onnx.compose import merge_models + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + nodes = [] + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.DOUBLE, + [None, 2], + ) + one = numpy_helper.from_array(np.array([1.0], dtype=np.float64), name="one") + nodes.append(make_node("Sub", ["one", "predictions_in"], ["false_probability"])) + nodes.append( + make_node( + "Concat", + ["false_probability", "predictions_in"], + ["predictions_out"], + axis=1, + ) + ) + + graph = make_graph( + nodes, + "mirror predict proba", + [predictions_in], + [predictions_out], + [one], + ) + new_model = make_model( + graph, + opset_imports=[ + make_opsetid("", 21), + ], + ) + new_model = merge_models( + model, + new_model, + io_map=[ + (model.graph.output[0].name, "predictions_in"), + ], + prefix2="mirror_", + ) + check_model(new_model) + return new_model + + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): + """Converts the estimator to an ONNX model which is expected to produce + equivalent predictions to `predict_function` up to reasonable floating + point tolerance. The ONNX model is hard coded to the X input data type, + separate models should be generated for float and double. The ONNX model + takes "X_in" as input and produces "predictions_out" as output. + + Parameters + ---------- + X: + Example input data. Use to infer input data characteristics. 
+ A model produced for float32 will not accept float64 input and vice versa. + predict_function : str + The serialised ONNX model can produce output equivalent to 'predict', + 'predict_proba', or 'predict_raw'. + The default is "predict". + Returns + ------- + Any + The ONNX model. + + Examples + -------- + >>> import numpy as np + >>> import legateboost as lb + >>> X = np.random.random((1000, 10)) + >>> y = np.random.randint(0, 2, X.shape[0]) + >>> model = lb.LBClassifier(n_estimators=5).fit(X, y) + >>> import onnxruntime as ort + >>> sess = ort.InferenceSession(model.to_onnx(X, + ... predict_function="predict_proba").SerializeToString()) + >>> onnx_pred = sess.run(None, {"X_in": X})[0] + >>> assert np.allclose(model.predict_proba(X), onnx_pred, atol=1e-6) + >>> + """ + from onnx.checker import check_model + from onnx.compose import merge_models + + if predict_function not in ["predict", "predict_proba", "predict_raw"]: + raise ValueError( + "predict_function should be one of ['predict'," + " 'predict_proba', 'predict_raw']" + ) + if predict_function in ["predict_proba", "predict"]: + model = self._to_onnx_predict_transformed(X) + # need to mirror the output when we only output one target + if self.predict_raw(X[0:1]).shape[1] == 1: + model = self._mirror_predict_proba_output(model) + if predict_function == "predict": + # argmax the predict_proba output + argmax = self._objective_instance.onnx_output_class( + self.predict_proba(X[0:1]) + ) + model = merge_models( + model, + argmax, + io_map=[ + (model.graph.output[0].name, "predictions_in"), + ], + prefix2="classifier_predict_", + ) + + elif predict_function == "predict_raw": + model = self._to_onnx_predict_raw(X) + + # coerce the output shape to be the same as the equivalent predict function + test_pred = getattr(self, predict_function)(X[0:1]) + model = merge_models( + model, + self._make_onnx_reshape_predictions(test_pred), + io_map=[(model.graph.output[0].name, "predictions_in")], + prefix2="reshape_", + 
) + check_model(model) + return model diff --git a/legateboost/objectives.py b/legateboost/objectives.py index 5c59330d..e6d770b0 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -165,6 +165,54 @@ def output_class(self, pred: cn.ndarray) -> cn.ndarray: """ return cn.argmax(pred, axis=-1) + def onnx_output_class(self, pred: cn.ndarray): + """Returns an ONNX model that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the predicted class labels. + - "predictions_out" : 1D tensor of shape (n_samples,) and type int32. + + Returns: + Onnx model that converts probabilities into class labels. + """ + from onnx import TensorProto + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.INT64, + [None], + ) + nodes = [] + nodes.append( + make_node( + "ArgMax", ["predictions_in"], ["predictions_out"], axis=-1, keepdims=0 + ) + ) + graph = make_graph( + nodes, + "OutputClass", + [predictions_in], + [predictions_out], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model + class SquaredErrorObjective(BaseObjective): """The Squared Error objective function for regression problems. @@ -848,7 +896,60 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: return onnx_model def output_class(self, pred: cn.ndarray) -> cn.ndarray: - return cn.array(pred > 0.5, dtype=cn.int32).squeeze() + return cn.array(pred > 0.5, dtype=cn.int64) + + def onnx_output_class(self, pred: cn.ndarray): + """Returns an ONNX model that accepts + - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. + And outputs the predicted class labels. 
+ - "predictions_out" : 1D tensor of shape (n_samples,) and type int32. + + Returns: + Onnx model that converts probabilities into class labels. + """ + from onnx import TensorProto, numpy_helper + from onnx.checker import check_model + from onnx.helper import ( + make_graph, + make_model, + make_node, + make_opsetid, + make_tensor_value_info, + ) + + predictions_in = make_tensor_value_info( + "predictions_in", + TensorProto.DOUBLE, + [None, None], + ) + predictions_out = make_tensor_value_info( + "predictions_out", + TensorProto.INT64, + [None], + ) + nodes = [] + half = numpy_helper.from_array(np.array(0.5, dtype=np.float64), name="half") + nodes.append( + make_node("Greater", ["predictions_in", "half"], ["comparison_result"]) + ) + nodes.append( + make_node( + "Cast", ["comparison_result"], ["predictions_out"], to=TensorProto.INT64 + ) + ) + graph = make_graph( + nodes, + "OutputClass", + [predictions_in], + [predictions_out], + [half], + ) + onnx_model = make_model( + graph, + opset_imports=[make_opsetid("", 21)], + ) + check_model(onnx_model) + return onnx_model def metric(self) -> MultiLabelMetric: return MultiLabelMetric() diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index fc686b1f..2aa0e250 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -34,7 +34,7 @@ def compare_estimator_predictions(estimator, X, predict_function): pred = pred_method(cn.array(X)) onnx_pred = sess.run(None, feeds)[0] - assert onnx_pred.dtype == np.float64 + assert onnx_pred.dtype == pred.dtype assert pred.shape == onnx_pred.shape assert np.allclose( onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 @@ -145,6 +145,8 @@ def test_classifier(Model, objective, classification_dataset): ).fit(X, y) compare_estimator_predictions(model, X, "predict_raw") + compare_estimator_predictions(model, X, "predict_proba") + compare_estimator_predictions(model, X, "predict") @pytest.mark.parametrize("dtype", [np.float32, np.float64]) 
From 039a8e1ed186431d42c5b415ecaeb150a175fcdf Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 11 Apr 2025 04:44:56 -0700 Subject: [PATCH 13/21] Compensate for tolerance --- legateboost/test/test_onnx.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 2aa0e250..3d75300b 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -19,11 +19,11 @@ def compare_model_predictions(model, X): pred = pred.squeeze() assert pred.shape == onnx_pred.shape assert np.allclose( - onnx_pred, pred, atol=1e-3 if X.dtype == np.float32 else 1e-6 + onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 ), np.linalg.norm(pred - onnx_pred) -def compare_estimator_predictions(estimator, X, predict_function): +def compare_estimator_predictions(estimator, X, predict_function, allowed_wrong=0): sess = ort.InferenceSession( estimator.to_onnx(X, predict_function).SerializeToString() ) @@ -36,9 +36,10 @@ def compare_estimator_predictions(estimator, X, predict_function): assert onnx_pred.dtype == pred.dtype assert pred.shape == onnx_pred.shape - assert np.allclose( - onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 - ), np.linalg.norm(pred - onnx_pred) + number_wrong = np.sum( + np.abs(pred - onnx_pred) > (1e-2 if X.dtype == np.float32 else 1e-6) + ) + assert number_wrong <= allowed_wrong @pytest.fixture @@ -146,7 +147,11 @@ def test_classifier(Model, objective, classification_dataset): compare_estimator_predictions(model, X, "predict_raw") compare_estimator_predictions(model, X, "predict_proba") - compare_estimator_predictions(model, X, "predict") + # softmax has numerical differences with float32 + # allow a very small number of different class predictions + # this is fine so long as the probabilities are close + allowed_wrong = 5 if y.max() > 1 and X.dtype == np.float32 else 0 + compare_estimator_predictions(model, X, "predict", allowed_wrong) 
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) From aea209cb9fd20122369cda79be17f625a03f00c1 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 11 Apr 2025 07:14:43 -0700 Subject: [PATCH 14/21] Convert some operators to text --- legateboost/objectives.py | 332 +++++++++++--------------------------- 1 file changed, 95 insertions(+), 237 deletions(-) diff --git a/legateboost/objectives.py b/legateboost/objectives.py index e6d770b0..9819af28 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -86,39 +86,19 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: Returns: Onnx model that transforms the predictions. """ - from onnx import TensorProto - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) - - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None], - ) - nodes = [make_node("Identity", ["predictions_in"], ["predictions_out"])] - graph = make_graph( - nodes, - "BaseModel", - [predictions_in], - [predictions_out], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + import onnx + + onnx_text = """ + < + ir_version: 9, + opset_import: ["" : 10] + > + BaseObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Identity(predictions_in) + } + """ + return onnx.parser.parse_model(onnx_text) @abstractmethod def metric(self) -> BaseMetric: @@ -342,91 +322,31 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return pred def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - 
make_tensor_value_info, - ) - - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None, 2], - ) - nodes = [] - # clip - mininmum = numpy_helper.from_array( - np.array(-5, dtype=np.float64), name="minimum" - ) - maximum = numpy_helper.from_array(np.array(5, dtype=np.float64), name="maximum") - # reshape - out_shape = numpy_helper.from_array( - np.array([0, -1, 2], dtype=np.int64), name="out_shape" - ) - nodes.append( - make_node("Reshape", ["predictions_in", "out_shape"], ["reshaped"]) - ) - - nodes.append(make_node("Shape", ["reshaped"], ["new_shape"])) - - var_starts = numpy_helper.from_array( - np.array([0, 0, 1], dtype=np.int64), name="var_starts" - ) - mean_starts = numpy_helper.from_array( - np.array([0, 0, 0], dtype=np.int64), name="mean_starts" - ) - - # extract mean and variance parts - axis = numpy_helper.from_array(np.array([0, 1, 2], dtype=np.int64), name="axis") - steps = numpy_helper.from_array( - np.array([1, 1, 2], dtype=np.int64), name="steps" - ) - nodes.append( - make_node( - "Slice", - ["reshaped", "var_starts", "new_shape", "axis", "steps"], - ["variance"], - ) - ) - nodes.append( - make_node( - "Slice", - ["reshaped", "mean_starts", "new_shape", "axis", "steps"], - ["mean"], - ) - ) - nodes.append( - make_node("Clip", ["variance", "minimum", "maximum"], ["clipped_variance"]) - ) - - # combine them again - nodes.append( - make_node( - "Concat", ["mean", "clipped_variance"], ["predictions_out"], axis=2 - ) - ) - graph = make_graph( - nodes, - "NormalObjective", - [predictions_in], - [predictions_out], - [out_shape, var_starts, mean_starts, axis, steps, mininmum, maximum], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + import onnx + + onnx_text = """ + < + ir_version: 9, + opset_import: ["" : 21] + > + NormalObjective 
(double[N, M] predictions_in) => (double[N, M] predictions_out) + { + out_shape = Constant() + var_starts = Constant() + mean_starts = Constant() + axis = Constant() + steps = Constant() + min = Constant() + max = Constant() + reshaped = Reshape(predictions_in, out_shape) + new_shape = Shape(reshaped) + variance = Slice(reshaped, var_starts, new_shape, axis, steps) + mean = Slice(reshaped, mean_starts, new_shape, axis, steps) + clipped_variance = Clip(variance, min, max) + predictions_out = Concat(mean, clipped_variance) + } + """ + return onnx.parser.parse_model(onnx_text) @override def mean(self, param: cn.ndarray) -> cn.ndarray: @@ -783,44 +703,21 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return e_x / div[:, cn.newaxis] def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: - from onnx import TensorProto - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) - - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None], - ) - nodes = [] - if pred.shape[1] == 1: - nodes.append(make_node("Sigmoid", ["predictions_in"], ["predictions_out"])) - else: - nodes.append(make_node("Softmax", ["predictions_in"], ["predictions_out"])) - graph = make_graph( - nodes, - "LogLossObjective", - [predictions_in], - [predictions_out], - [], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + import onnx + + operator_to_use = "Sigmoid" if pred.shape[1] == 1 else "Softmax" + onnx_text = f""" + < + ir_version: 9, + opset_import: ["" : 10] + > + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + {{ + predictions_out = {operator_to_use}(predictions_in) + }} + """ + print(onnx_text) + return 
onnx.parser.parse_model(onnx_text) def metric(self) -> LogLossMetric: return LogLossMetric() @@ -859,41 +756,19 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return self.one / (self.one + cn.exp(-pred)) def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: - from onnx import TensorProto - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) - - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None], - ) - nodes = [] - nodes.append(make_node("Sigmoid", ["predictions_in"], ["predictions_out"])) - graph = make_graph( - nodes, - "MultiLabelObjective", - [predictions_in], - [predictions_out], - [], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + import onnx + + onnx_text = """ + < + ir_version: 9, + opset_import: ["" : 10] + > + MultiLabelObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Sigmoid(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_model(onnx_text) def output_class(self, pred: cn.ndarray) -> cn.ndarray: return cn.array(pred > 0.5, dtype=cn.int64) @@ -1016,54 +891,37 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return logloss.transform((1 / (K - 1)) * pred) def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) + import onnx - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None], - ) - - nodes = 
[] - initializers = [] if pred.shape[1] == 1: - two = numpy_helper.from_array(np.array(2, dtype=np.float64), name="two") - nodes.append(make_node("Mul", ["predictions_in", "two"], ["multiplied"])) - nodes.append(make_node("Sigmoid", ["multiplied"], ["predictions_out"])) - initializers.append(two) - else: - constant = numpy_helper.from_array( - np.array(1 / (pred.shape[1] - 1), dtype=np.float64), name="constant" - ) - nodes.append(make_node("Mul", ["predictions_in", "constant"], ["scaled"])) - nodes.append(make_node("Softmax", ["scaled"], ["predictions_out"])) - initializers.append(constant) - graph = make_graph( - nodes, - "ExpObjective", - [predictions_in], - [predictions_out], - initializers, - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + onnx_text = """ + < + ir_version: 9, + opset_import: ["" : 10] + > + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + constant = Constant() + a = Mul(predictions_in, constant) + predictions_out = Sigmoid(a) + } + """ # noqa: E501 + return onnx.parser.parse_model(onnx_text) + + constant = 1 / (pred.shape[1] - 1) + onnx_text_multiclass = f""" + < + ir_version: 9, + opset_import: ["" : 10] + > + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + {{ + constant = Constant() + a = Mul(predictions_in, constant) + predictions_out = Softmax(a) + }} + """ + return onnx.parser.parse_model(onnx_text_multiclass) def metric(self) -> ExponentialMetric: return ExponentialMetric() From b069090aaf9535b556faab81767ba76a19410c08 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 17 Apr 2025 02:59:15 -0700 Subject: [PATCH 15/21] Reduce verbosity --- legateboost/legateboost.py | 131 ++++++------------- legateboost/models/krr.py | 130 ++++++------------ legateboost/models/linear.py | 87 ++++--------- legateboost/objectives.py | 239 +++++++++------------------------- 
legateboost/test/test_onnx.py | 7 +- 5 files changed, 169 insertions(+), 425 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index e6dd45e6..6986cea2 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -561,101 +561,50 @@ def dump_models(self) -> str: return text def _make_onnx_reshape_predictions(self, pred: cn.ndarray) -> cn.ndarray: - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - np_dtype_to_tensor_dtype, - ) - # make an onnx model that shapes the predictions equivalently to pred - extra_out_shape = [] if pred.ndim == 1 else list(pred.shape[1:]) - shape = numpy_helper.from_array(np.array([-1] + extra_out_shape), name="shape") - - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - np_dtype_to_tensor_dtype(pred.dtype), - shape=[None] + list(extra_out_shape), - ) - nodes = [ - make_node( - "Reshape", - ["predictions_in", "shape"], - ["predictions_out"], - ) - ] - graph = make_graph( - nodes, - "reshape output", - [predictions_in], - [predictions_out], - [shape], - ) - model = make_model(graph, opset_imports=[make_opsetid("", 21)]) - check_model(model) - return model + shape = list(pred.shape) + shape[0] = -1 + out_type = "int64" if pred.dtype == cn.int64 else "double" + import onnx + + onnx_text = f""" + < + ir_version: 10, + opset_import: ["" : 21] + > + ReshapePredictions ({out_type}[N, M] predictions_in) => ({out_type}{shape} predictions_out) + {{ + shape = Constant() + predictions_out = Reshape(predictions_in, shape) + }} + """ # noqa: E501 + return onnx.parser.parse_model(onnx_text) def _make_onnx_init(self, X_dtype): - # turn self.model_init_ into an ONNX model - from onnx import TensorProto, numpy_helper - from onnx.checker import 
check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - np_dtype_to_tensor_dtype, - ) - - # model constants - X_in = make_tensor_value_info( - "X_in", np_dtype_to_tensor_dtype(X_dtype), [None, self.n_features_in_] - ) - nodes = [] - nodes.append(make_node("Shape", ["X_in"], ["n_rows"], end=1)) - one = numpy_helper.from_array(np.array([1], dtype=np.int64), name="one") - nodes.append(make_node("Concat", ["n_rows", "one"], ["tile_repeat"], axis=0)) - init = numpy_helper.from_array( - np.atleast_2d(self.model_init_.__array__()), name="init" - ) - prediction_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, self.model_init_.shape[0]], - ) - nodes.append(make_node("Tile", ["init", "tile_repeat"], ["predictions_out"])) - X_out = make_tensor_value_info( - "X_out", - np_dtype_to_tensor_dtype(X_dtype), - [None, None], - ) - nodes.append(make_node("Identity", ["X_in"], ["X_out"])) - graph = make_graph( - nodes, - "legateboost estimator init", - [X_in], - [X_out, prediction_out], - [init, one], - ) - onnx_model = make_model( - graph, - opset_imports=[ - make_opsetid("", 21), - ], + import onnx + + X_type_text = "double" if X_dtype == cn.float64 else "float" + onnx_text = f""" + < + ir_version: 10, + opset_import: ["" : 21] + > + ReshapePredictions ({X_type_text}[N, M] X_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + n_rows = Shape(X_in) + one = Constant() + tile_repeat = Concat(n_rows, one) + predictions_out = Tile(init, tile_repeat) + }} + """ # noqa: E501 + init_model = onnx.parser.parse_model(onnx_text) + init_model.graph.initializer.append( + onnx.numpy_helper.from_array( + np.atleast_2d(self.model_init_.__array__()), name="init" + ) ) - check_model(onnx_model) - - return onnx_model + return init_model def _to_onnx_predict_raw(self, X: cn.ndarray): from onnx.checker import check_model diff --git a/legateboost/models/krr.py 
b/legateboost/models/krr.py index da2e22ce..5ecaaebc 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -244,97 +244,41 @@ def __mul__(self, scalar: Any) -> "KRR": return new def to_onnx(self, X) -> Any: - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - np_dtype_to_tensor_dtype, + import onnx + + X_type_text = "double" if X.dtype == cn.float64 else "float" + denominator = -2.0 * self.sigma**2 + onnx_text = f""" + < + ir_version: 10, + opset_import: ["" : 21] + > + KRRModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + axis1 = Constant() + XX = ReduceSumSquare(X_in, axis1) + XY = Gemm(X_in, X_train) + YY = ReduceSumSquare(X_train, axis1) + reshape = Constant() + YY_reshaped = Reshape(YY, reshape) + add0 = Add(XX, XY) + l2 = Add(YY_reshaped, add0) + zero = Constant() + l2_clipped = Max(l2, zero) + denominator = Constant() + rbf0 = Div(l2_clipped, denominator) + K = Exp(rbf0) + dot = MatMul(K, betas) + dot_double = Cast(dot) + predictions_out = Add(dot_double, predictions_in) + }} + """ # noqa: E501 + model = onnx.parser.parse_model(onnx_text) + model.graph.initializer.extend( + [ + onnx.numpy_helper.from_array(self.betas_.__array__(), name="betas"), + onnx.numpy_helper.from_array(self.X_train.__array__(), name="X_train"), + ] ) - - assert self.X_train.dtype == self.betas_.dtype - - nodes = [] - - # model constants - betas = numpy_helper.from_array(self.betas_.__array__(), name="betas") - X_train = numpy_helper.from_array(self.X_train.__array__(), name="X_train") - - # pred inputs - n_features = self.X_train.shape[1] - n_outputs = self.betas_.shape[1] - X_in = make_tensor_value_info( - "X_in", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] - ) - predictions_in = 
make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, n_outputs], - ) - # exanded l2 distance - # distance = np.sum(X**2, axis=1)[:, np.newaxis] - 2 * np.dot(X, self.X_train.T) - # + np.sum(self.X_train**2, axis=1) - axis1 = numpy_helper.from_array(np.array([1]), name="axis1") - nodes.append(make_node("ReduceSumSquare", ["X_in", "axis1"], ["XX"])) - nodes.append( - make_node("Gemm", ["X_in", "X_train"], ["XY"], alpha=-2.0, transB=1) - ) - nodes.append(make_node("ReduceSumSquare", ["X_train", "axis1"], ["YY"])) - reshape = numpy_helper.from_array( - np.array([1, -1], dtype=np.int64), name="reshape" - ) - nodes.append(make_node("Reshape", ["YY", "reshape"], ["YY_reshaped"])) - nodes.append(make_node("Add", ["XX", "XY"], ["add0"])) - nodes.append(make_node("Add", ["YY_reshaped", "add0"], ["l2"])) - zero = numpy_helper.from_array(np.array([0.0], self.X_train.dtype), name="zero") - nodes.append(make_node("Max", ["l2", "zero"], ["l2_clipped"])) - - # RBF kernel - # K = np.exp(-distance / (2 * self.sigma**2)) - if self.sigma is None: - raise ValueError("sigma is None. 
Has fit been called?") - - denominator = numpy_helper.from_array( - np.array([-2.0 * self.sigma**2], self.X_train.dtype), name="denominator" - ) - nodes.append(make_node("Div", ["l2_clipped", "denominator"], ["rbf0"])) - nodes.append(make_node("Exp", ["rbf0"], ["K"])) - - # prediction - # pred = np.dot(K, self.betas_) - nodes.append(make_node("MatMul", ["K", "betas"], ["dot"])) - - # outputs - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, n_outputs], - ) - X_out = make_tensor_value_info( - "X_out", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] - ) - - nodes.append(make_node("Cast", ["dot"], ["dot_double"], to=TensorProto.DOUBLE)) - nodes.append( - make_node("Add", ["dot_double", "predictions_in"], ["predictions_out"]) - ) - nodes.append(make_node("Identity", ["X_in"], ["X_out"])) - - graph = make_graph( - nodes, - "legateboost.model.KRR", - [X_in, predictions_in], - [X_out, predictions_out], - [betas, X_train, axis1, reshape, zero, denominator], - ) - onnx_model = make_model( - graph, - opset_imports=[ - make_opsetid("", 21), - ], - ) - check_model(onnx_model) - return onnx_model + return model diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index de88ef2c..edb30e61 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -153,65 +153,30 @@ def __mul__(self, scalar: Any) -> "Linear": return new def to_onnx(self, X) -> Any: - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - np_dtype_to_tensor_dtype, + import onnx + + X_type_text = "double" if X.dtype == cn.float64 else "float" + onnx_text = f""" + < + ir_version: 10, + opset_import: ["" : 21] + > + LinearModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + mult 
= MatMul(X_in, betas) + result = Add(mult, intercept) + result_double = Cast(result) + predictions_out = Add(result_double, predictions_in) + }} + """ # noqa: E501 + model = onnx.parser.parse_model(onnx_text) + model.graph.initializer.extend( + [ + onnx.numpy_helper.from_array(self.betas_[1:].__array__(), name="betas"), + onnx.numpy_helper.from_array( + self.betas_[0].__array__(), name="intercept" + ), + ] ) - - # model constants - betas = numpy_helper.from_array(self.betas_[1:].__array__(), name="betas") - intercept = numpy_helper.from_array( - self.betas_[0].__array__(), name="intercept" - ) - - # pred inputs - n_features = self.betas_.shape[0] - 1 - n_outputs = self.betas_.shape[1] - X_in = make_tensor_value_info( - "X_in", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] - ) - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, n_outputs], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, n_outputs], - ) - - nodes = [] - nodes.append(make_node("MatMul", ["X_in", "betas"], ["XBeta"])) - nodes.append(make_node("Add", ["XBeta", "intercept"], ["result"])) - nodes.append( - make_node("Cast", ["result"], ["result_double"], to=TensorProto.DOUBLE) - ) - nodes.append( - make_node("Add", ["result_double", "predictions_in"], ["predictions_out"]) - ) - X_out = make_tensor_value_info( - "X_out", np_dtype_to_tensor_dtype(self.betas_.dtype), [None, n_features] - ) - nodes.append(make_node("Identity", ["X_in"], ["X_out"])) - graph = make_graph( - nodes, - "legateboost.model.Linear", - [X_in, predictions_in], - [X_out, predictions_out], - [betas, intercept], - ) - onnx_model = make_model( - graph, - opset_imports=[ - make_opsetid("", 21), - ], - ) - check_model(onnx_model) - return onnx_model + return model diff --git a/legateboost/objectives.py b/legateboost/objectives.py index 9819af28..d2607d2a 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -1,7 
+1,6 @@ from abc import ABC, abstractmethod from typing import Tuple -import numpy as np from scipy.stats import norm from typing_extensions import TypeAlias, override @@ -90,8 +89,8 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: onnx_text = """ < - ir_version: 9, - opset_import: ["" : 10] + ir_version: 10, + opset_import: ["" : 21] > BaseObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) { @@ -154,44 +153,19 @@ def onnx_output_class(self, pred: cn.ndarray): Returns: Onnx model that converts probabilities into class labels. """ - from onnx import TensorProto - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) + import onnx - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.INT64, - [None], - ) - nodes = [] - nodes.append( - make_node( - "ArgMax", ["predictions_in"], ["predictions_out"], axis=-1, keepdims=0 - ) - ) - graph = make_graph( - nodes, - "OutputClass", - [predictions_in], - [predictions_out], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + onnx_text = """ + < + ir_version: 10, + opset_import: ["" : 21] + > + BaseModelOutputClass (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = ArgMax(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_model(onnx_text) class SquaredErrorObjective(BaseObjective): @@ -326,7 +300,7 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: onnx_text = """ < - ir_version: 9, + ir_version: 10, opset_import: ["" : 21] > NormalObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) @@ -428,42 +402,19 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: @override def onnx_transform(self, pred: cn.ndarray) -> 
cn.ndarray: - from onnx import TensorProto - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) + import onnx - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None], - ) - nodes = [] - # exp - nodes.append(make_node("Exp", ["predictions_in"], ["predictions_out"])) - - graph = make_graph( - nodes, - "GammaDevianceObjective", - [predictions_in], - [predictions_out], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + onnx_text = """ + < + ir_version: 10, + opset_import: ["" : 21] + > + GammaDevianceTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Exp(predictions_in) + } + """ # noqa: E501 + return onnx.parser.parse_model(onnx_text) def initialise_prediction( self, y: cn.ndarray, w: cn.ndarray, boost_from_average: bool @@ -511,50 +462,19 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: @override def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) + import onnx - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, None, 2], - ) - nodes = [] - # reshape - out_shape = numpy_helper.from_array( - np.array([0, -1, 2], dtype=np.int64), name="out_shape" - ) - nodes.append( - make_node("Reshape", ["predictions_in", "out_shape"], ["reshaped"]) - ) - # exp - nodes.append(make_node("Exp", ["reshaped"], ["predictions_out"])) - - graph = make_graph( - 
nodes, - "GammaObjective", - [predictions_in], - [predictions_out], - [out_shape], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + onnx_text = """ + < + ir_version: 10, + opset_import: ["" : 21] + > + GammaTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + predictions_out = Exp(predictions_in) + } + """ + return onnx.parser.parse_model(onnx_text) @override def metric(self) -> GammaLLMetric: @@ -708,15 +628,14 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: operator_to_use = "Sigmoid" if pred.shape[1] == 1 else "Softmax" onnx_text = f""" < - ir_version: 9, - opset_import: ["" : 10] + ir_version: 10, + opset_import: ["" : 21] > LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) {{ predictions_out = {operator_to_use}(predictions_in) }} """ - print(onnx_text) return onnx.parser.parse_model(onnx_text) def metric(self) -> LogLossMetric: @@ -760,8 +679,8 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: onnx_text = """ < - ir_version: 9, - opset_import: ["" : 10] + ir_version: 10, + opset_import: ["" : 21] > MultiLabelObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) { @@ -774,57 +693,21 @@ def output_class(self, pred: cn.ndarray) -> cn.ndarray: return cn.array(pred > 0.5, dtype=cn.int64) def onnx_output_class(self, pred: cn.ndarray): - """Returns an ONNX model that accepts - - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. - And outputs the predicted class labels. - - "predictions_out" : 1D tensor of shape (n_samples,) and type int32. - - Returns: - Onnx model that converts probabilities into class labels. 
- """ - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) + import onnx - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.INT64, - [None], - ) - nodes = [] - half = numpy_helper.from_array(np.array(0.5, dtype=np.float64), name="half") - nodes.append( - make_node("Greater", ["predictions_in", "half"], ["comparison_result"]) - ) - nodes.append( - make_node( - "Cast", ["comparison_result"], ["predictions_out"], to=TensorProto.INT64 - ) - ) - graph = make_graph( - nodes, - "OutputClass", - [predictions_in], - [predictions_out], - [half], - ) - onnx_model = make_model( - graph, - opset_imports=[make_opsetid("", 21)], - ) - check_model(onnx_model) - return onnx_model + onnx_text = """ + < + ir_version: 10, + opset_import: ["" : 21] + > + MultiLabelOutputClass (double[N, M] predictions_in) => (double[N, M] predictions_out) + { + half = Constant() + greater = Greater(predictions_in, half) + predictions_out = Cast(greater) + } + """ # noqa: E501 + return onnx.parser.parse_model(onnx_text) def metric(self) -> MultiLabelMetric: return MultiLabelMetric() @@ -896,8 +779,8 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: if pred.shape[1] == 1: onnx_text = """ < - ir_version: 9, - opset_import: ["" : 10] + ir_version: 10, + opset_import: ["" : 21] > LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) { @@ -911,8 +794,8 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: constant = 1 / (pred.shape[1] - 1) onnx_text_multiclass = f""" < - ir_version: 9, - opset_import: ["" : 10] + ir_version: 10, + opset_import: ["" : 21] > LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) {{ diff --git a/legateboost/test/test_onnx.py 
b/legateboost/test/test_onnx.py index 3d75300b..89f5a377 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -37,9 +37,12 @@ def compare_estimator_predictions(estimator, X, predict_function, allowed_wrong= assert onnx_pred.dtype == pred.dtype assert pred.shape == onnx_pred.shape number_wrong = np.sum( - np.abs(pred - onnx_pred) > 1e-2 if X.dtype == np.float32 else 1e-6 + np.abs(pred - onnx_pred) > (1e-2 if X.dtype == np.float32 else 1e-5) + ) + assert number_wrong <= allowed_wrong, ( + np.linalg.norm(pred - onnx_pred), + number_wrong, ) - assert number_wrong <= allowed_wrong @pytest.fixture From f4b7b81e6ee9f4c0bfeabe9aa7d61dd69a209eac Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 22 Apr 2025 04:45:42 -0700 Subject: [PATCH 16/21] Refactor and add typing annotations --- legateboost/legateboost.py | 201 ++++++------------------------- legateboost/models/base_model.py | 10 +- legateboost/models/krr.py | 13 +- legateboost/models/linear.py | 12 +- legateboost/models/nn.py | 14 +-- legateboost/models/tree.py | 16 +-- legateboost/objectives.py | 82 ++++--------- legateboost/onnx_utils.py | 104 ++++++++++++++++ legateboost/test/test_onnx.py | 10 +- 9 files changed, 188 insertions(+), 274 deletions(-) create mode 100644 legateboost/onnx_utils.py diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 6986cea2..4899ec42 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -17,6 +17,13 @@ from .metrics import BaseMetric, metrics from .models import BaseModel, Tree from .objectives import OBJECTIVES_MAP, BaseObjective +from .onnx_utils import ( + init_predictions, + make_model, + merge_model_graphs, + mirror_predict_proba_output, + reshape_predictions, +) from .shapley import global_shapley_attributions, local_shapley_attributions from .utils import AddableMixin, AddMember, PickleCupynumericMixin @@ -560,87 +567,17 @@ def dump_models(self) -> str: text += str(m) return text - def 
_make_onnx_reshape_predictions(self, pred: cn.ndarray) -> cn.ndarray: - # make an onnx model that shapes the predictions equivalently to pred - shape = list(pred.shape) - shape[0] = -1 - out_type = "int64" if pred.dtype == cn.int64 else "double" - import onnx - - onnx_text = f""" - < - ir_version: 10, - opset_import: ["" : 21] - > - ReshapePredictions ({out_type}[N, M] predictions_in) => ({out_type}{shape} predictions_out) - {{ - shape = Constant() - predictions_out = Reshape(predictions_in, shape) - }} - """ # noqa: E501 - return onnx.parser.parse_model(onnx_text) - - def _make_onnx_init(self, X_dtype): - import onnx - - X_type_text = "double" if X_dtype == cn.float64 else "float" - onnx_text = f""" - < - ir_version: 10, - opset_import: ["" : 21] - > - ReshapePredictions ({X_type_text}[N, M] X_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) - {{ - X_out = Identity(X_in) - n_rows = Shape(X_in) - one = Constant() - tile_repeat = Concat(n_rows, one) - predictions_out = Tile(init, tile_repeat) - }} - """ # noqa: E501 - init_model = onnx.parser.parse_model(onnx_text) - init_model.graph.initializer.append( - onnx.numpy_helper.from_array( - np.atleast_2d(self.model_init_.__array__()), name="init" - ) - ) - return init_model - - def _to_onnx_predict_raw(self, X: cn.ndarray): - from onnx.checker import check_model - from onnx.compose import merge_models - - model = self._make_onnx_init(X.dtype) - if self.models_ is not None and len(self.models_) > 0: - model = merge_models( - model, - self.models_[0].to_onnx(X), - io_map=[("X_out", "X_in"), ("predictions_out", "predictions_in")], - prefix2="model_0_", - ) - - for i in range(1, len(self.models_)): - model = merge_models( - model, - self.models_[i].to_onnx(X), - io_map=[ - ("model_{}_X_out".format(i - 1), "X_in"), - ("model_{}_predictions_out".format(i - 1), "predictions_in"), - ], - prefix2="model_{}_".format(i), - ) - + def _to_onnx_predict_raw(self, X: cn.ndarray) -> Any: + init_graph = 
init_predictions(self.model_init_, X.dtype) + graph = merge_model_graphs([init_graph] + [m.to_onnx(X) for m in self.models_]) # remove the X_out output, we only need the predictions - model.graph.output.remove(model.graph.output[0]) + graph.output.remove(graph.output[0]) + return graph - check_model(model) - return model - - def _to_onnx_predict_transformed(self, X: cn.ndarray): - from onnx.checker import check_model - from onnx.compose import merge_models + def _to_onnx_predict_transformed(self, X: cn.ndarray) -> Any: + import onnx - model = merge_models( + graph = onnx.compose.merge_graphs( self._to_onnx_predict_raw(X), self._objective_instance.onnx_transform(self.predict_raw(X[0:1])), io_map=[ @@ -651,8 +588,7 @@ def _to_onnx_predict_transformed(self, X: cn.ndarray): ], prefix2="transform_", ) - check_model(model) - return model + return graph def global_attributions( self, @@ -941,7 +877,7 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: pred = pred.squeeze(axis=1) return pred - def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: """Converts the estimator to an ONNX model which is expected to produce equivalent predictions to `predict_function` up to reasonable floating point tolerance. 
The ONNX model is hard coded to the X input data type, @@ -975,27 +911,22 @@ def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) >>> """ - from onnx.checker import check_model - from onnx.compose import merge_models + import onnx if predict_function not in ["predict", "predict_raw"]: raise ValueError( "predict_function should be one of ['predict', 'predict_raw']" ) if predict_function == "predict": - model = self._to_onnx_predict_transformed(X) + graph = self._to_onnx_predict_transformed(X) else: - model = self._to_onnx_predict_raw(X) + graph = self._to_onnx_predict_raw(X) # coerce the output shape to be the same as the equivalent predict function test_pred = getattr(self, predict_function)(X[0:1]) - model = merge_models( - model, - self._make_onnx_reshape_predictions(test_pred), - io_map=[(model.graph.output[0].name, "predictions_in")], - prefix2="reshape_", - ) - check_model(model) + graph = reshape_predictions(graph, test_pred) + model = make_model(graph) + onnx.checker.check_model(model, full_check=True) return model @@ -1244,66 +1175,7 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: check_is_fitted(self) return self._objective_instance.output_class(self.predict_proba(X)) - def _mirror_predict_proba_output(self, model) -> cn.ndarray: - assert len(self.classes_) == 2 - from onnx import TensorProto, numpy_helper - from onnx.checker import check_model - from onnx.compose import merge_models - from onnx.helper import ( - make_graph, - make_model, - make_node, - make_opsetid, - make_tensor_value_info, - ) - - nodes = [] - predictions_in = make_tensor_value_info( - "predictions_in", - TensorProto.DOUBLE, - [None, None], - ) - predictions_out = make_tensor_value_info( - "predictions_out", - TensorProto.DOUBLE, - [None, 2], - ) - one = numpy_helper.from_array(np.array([1.0], dtype=np.float64), name="one") - nodes.append(make_node("Sub", ["one", "predictions_in"], ["false_probability"])) 
- nodes.append( - make_node( - "Concat", - ["false_probability", "predictions_in"], - ["predictions_out"], - axis=1, - ) - ) - - graph = make_graph( - nodes, - "mirror predict proba", - [predictions_in], - [predictions_out], - [one], - ) - new_model = make_model( - graph, - opset_imports=[ - make_opsetid("", 21), - ], - ) - new_model = merge_models( - model, - new_model, - io_map=[ - (model.graph.output[0].name, "predictions_in"), - ], - prefix2="mirror_", - ) - check_model(new_model) - return new_model - - def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): + def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: """Converts the estimator to an ONNX model which is expected to produce equivalent predictions to `predict_function` up to reasonable floating point tolerance. The ONNX model is hard coded to the X input data type, @@ -1338,8 +1210,7 @@ def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): >>> assert np.allclose(model.predict_proba(X), onnx_pred, atol=1e-6) >>> """ - from onnx.checker import check_model - from onnx.compose import merge_models + import onnx if predict_function not in ["predict", "predict_proba", "predict_raw"]: raise ValueError( @@ -1347,34 +1218,30 @@ def to_onnx(self, X: cn.ndarray, predict_function: str = "predict"): " 'predict_proba', 'predict_raw']" ) if predict_function in ["predict_proba", "predict"]: - model = self._to_onnx_predict_transformed(X) + graph = self._to_onnx_predict_transformed(X) # need to mirror the output when we only output one target if self.predict_raw(X[0:1]).shape[1] == 1: - model = self._mirror_predict_proba_output(model) + graph = mirror_predict_proba_output(graph) if predict_function == "predict": # argmax the predict_proba output argmax = self._objective_instance.onnx_output_class( self.predict_proba(X[0:1]) ) - model = merge_models( - model, + graph = onnx.compose.merge_graphs( + graph, argmax, io_map=[ - (model.graph.output[0].name, "predictions_in"), + 
(graph.output[0].name, "predictions_in"), ], prefix2="classifier_predict_", ) elif predict_function == "predict_raw": - model = self._to_onnx_predict_raw(X) + graph = self._to_onnx_predict_raw(X) # coerce the output shape to be the same as the equivalent predict function test_pred = getattr(self, predict_function)(X[0:1]) - model = merge_models( - model, - self._make_onnx_reshape_predictions(test_pred), - io_map=[(model.graph.output[0].name, "predictions_in")], - prefix2="reshape_", - ) - check_model(model) + graph = reshape_predictions(graph, test_pred) + model = make_model(graph) + onnx.checker.check_model(model, full_check=True) return model diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index 45fac68d..38e37cb7 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -127,13 +127,13 @@ def __mul__(self, scalar: Any) -> "BaseModel": def __hash__(self) -> int: return hash(str(self)) - def to_onnx(self, X) -> Any: - """Convert the model to an ONNX model. + def to_onnx(self, X: cn.array) -> Any: + """Convert the model to an ONNX graph. - The implemented ONNX model should accept the following two inputs: + The implemented ONNX graph should accept the following two inputs: - "X_in" : 2D tensor of shape (n_samples, n_features) and type `X_dtype`. - "predictions in" : 2D tensor of shape (n_samples, n_outputs) and type double. - The model should output: + The graph should output: - "predictions out" : 2D tensor of shape (n_samples, n_outputs) and type double. Parameters @@ -146,6 +146,6 @@ def to_onnx(self, X) -> Any: Returns ------- Any - The ONNX model. + The ONNX graph. 
""" raise NotImplementedError diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 5ecaaebc..0e1ea61d 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -243,16 +243,13 @@ def __mul__(self, scalar: Any) -> "KRR": self.betas_ *= scalar return new - def to_onnx(self, X) -> Any: + def to_onnx(self, X: cn.array) -> Any: import onnx X_type_text = "double" if X.dtype == cn.float64 else "float" + assert self.sigma is not None, "Has model been trained?" denominator = -2.0 * self.sigma**2 onnx_text = f""" - < - ir_version: 10, - opset_import: ["" : 21] - > KRRModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) {{ X_out = Identity(X_in) @@ -274,11 +271,11 @@ def to_onnx(self, X) -> Any: predictions_out = Add(dot_double, predictions_in) }} """ # noqa: E501 - model = onnx.parser.parse_model(onnx_text) - model.graph.initializer.extend( + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.extend( [ onnx.numpy_helper.from_array(self.betas_.__array__(), name="betas"), onnx.numpy_helper.from_array(self.X_train.__array__(), name="X_train"), ] ) - return model + return graph diff --git a/legateboost/models/linear.py b/legateboost/models/linear.py index edb30e61..1319d2de 100644 --- a/legateboost/models/linear.py +++ b/legateboost/models/linear.py @@ -152,15 +152,11 @@ def __mul__(self, scalar: Any) -> "Linear": new.betas_ *= scalar return new - def to_onnx(self, X) -> Any: + def to_onnx(self, X: cn.array) -> Any: import onnx X_type_text = "double" if X.dtype == cn.float64 else "float" onnx_text = f""" - < - ir_version: 10, - opset_import: ["" : 21] - > LinearModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) {{ X_out = Identity(X_in) @@ -170,8 +166,8 @@ def to_onnx(self, X) -> Any: predictions_out = Add(result_double, predictions_in) }} """ # noqa: E501 - model = 
onnx.parser.parse_model(onnx_text) - model.graph.initializer.extend( + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.extend( [ onnx.numpy_helper.from_array(self.betas_[1:].__array__(), name="betas"), onnx.numpy_helper.from_array( @@ -179,4 +175,4 @@ def to_onnx(self, X) -> Any: ), ] ) - return model + return graph diff --git a/legateboost/models/nn.py b/legateboost/models/nn.py index d628b6e5..969dc0df 100644 --- a/legateboost/models/nn.py +++ b/legateboost/models/nn.py @@ -182,14 +182,11 @@ def __mul__(self, scalar: Any) -> "NN": new.biases_[-1] *= scalar return new - def to_onnx(self, X) -> Any: + def to_onnx(self, X: cn.array) -> Any: from onnx import TensorProto, numpy_helper - from onnx.checker import check_model from onnx.helper import ( make_graph, - make_model, make_node, - make_opsetid, make_tensor_value_info, np_dtype_to_tensor_dtype, ) @@ -275,11 +272,4 @@ def to_onnx(self, X) -> Any: [X_out, predictions_out], biases + coefficients, ) - onnx_model = make_model( - graph, - opset_imports=[ - make_opsetid("", 21), - ], - ) - check_model(onnx_model) - return onnx_model + return graph diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index c6711172..cc23d59b 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -316,13 +316,10 @@ def __mul__(self, scalar: Any) -> "Tree": new.leaf_value *= scalar return new - def to_onnx(self, X) -> Any: - import onnx + def to_onnx(self, X: cn.array) -> Any: from onnx import TensorProto, numpy_helper - from onnx.checker import check_model from onnx.helper import ( make_graph, - make_model, make_node, make_tensor_value_info, np_dtype_to_tensor_dtype, @@ -427,13 +424,6 @@ def to_onnx(self, X) -> Any: [X_in, predictions_in], [X_out, predictions_out], [leaf_weights], + # opset_imports=[make_opsetid("ai.onnx.ml", 3), make_opsetid("", 21)], ) - model = make_model( - graph, - opset_imports=[ - onnx.helper.make_opsetid("ai.onnx.ml", 3), - onnx.helper.make_opsetid("", 21), - ], 
- ) - check_model(model) - return model + return graph diff --git a/legateboost/objectives.py b/legateboost/objectives.py index d2607d2a..1c819258 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Tuple +from typing import Any, Tuple from scipy.stats import norm from typing_extensions import TypeAlias, override @@ -72,7 +72,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return pred def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: - """Returns an ONNX model that accepts + """Returns an ONNX graph that accepts - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. And outputs the transformed predictions. - "predictions_out" : arbitrary tensor depending on the objective. @@ -83,21 +83,17 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: method for each objective. Returns: - Onnx model that transforms the predictions. + Onnx graph that transforms the predictions. """ import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > BaseObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) { predictions_out = Identity(predictions_in) } """ - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) @abstractmethod def metric(self) -> BaseMetric: @@ -144,7 +140,7 @@ def output_class(self, pred: cn.ndarray) -> cn.ndarray: """ return cn.argmax(pred, axis=-1) - def onnx_output_class(self, pred: cn.ndarray): + def onnx_output_class(self, pred: cn.ndarray) -> Any: """Returns an ONNX model that accepts - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. And outputs the predicted class labels. 
@@ -156,16 +152,12 @@ def onnx_output_class(self, pred: cn.ndarray): import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > BaseModelOutputClass (double[N, M] predictions_in) => (double[N, M] predictions_out) { predictions_out = ArgMax(predictions_in) } """ # noqa: E501 - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) class SquaredErrorObjective(BaseObjective): @@ -299,10 +291,6 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > NormalObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) { out_shape = Constant() @@ -320,7 +308,7 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: predictions_out = Concat(mean, clipped_variance) } """ - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) @override def mean(self, param: cn.ndarray) -> cn.ndarray: @@ -405,16 +393,12 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > GammaDevianceTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) { predictions_out = Exp(predictions_in) } """ # noqa: E501 - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) def initialise_prediction( self, y: cn.ndarray, w: cn.ndarray, boost_from_average: bool @@ -465,16 +449,12 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > GammaTransform (double[N, M] predictions_in) => (double[N, M] predictions_out) { predictions_out = Exp(predictions_in) } """ - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) @override def metric(self) -> GammaLLMetric: @@ -627,16 +607,12 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: operator_to_use = "Sigmoid" if pred.shape[1] == 
1 else "Softmax" onnx_text = f""" - < - ir_version: 10, - opset_import: ["" : 21] - > LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) {{ predictions_out = {operator_to_use}(predictions_in) }} """ - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) def metric(self) -> LogLossMetric: return LogLossMetric() @@ -678,28 +654,20 @@ def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > MultiLabelObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) { predictions_out = Sigmoid(predictions_in) } """ # noqa: E501 - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) def output_class(self, pred: cn.ndarray) -> cn.ndarray: return cn.array(pred > 0.5, dtype=cn.int64) - def onnx_output_class(self, pred: cn.ndarray): + def onnx_output_class(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > MultiLabelOutputClass (double[N, M] predictions_in) => (double[N, M] predictions_out) { half = Constant() @@ -707,7 +675,7 @@ def onnx_output_class(self, pred: cn.ndarray): predictions_out = Cast(greater) } """ # noqa: E501 - return onnx.parser.parse_model(onnx_text) + return onnx.parser.parse_graph(onnx_text) def metric(self) -> MultiLabelMetric: return MultiLabelMetric() @@ -776,35 +744,29 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: import onnx + onnx_text = """ + LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + """ # noqa: E501 + if pred.shape[1] == 1: - onnx_text = """ - < - ir_version: 10, - opset_import: ["" : 21] - > - LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + onnx_text += """ { constant = Constant() a = Mul(predictions_in, constant) predictions_out = Sigmoid(a) } - """ # noqa: 
E501 - return onnx.parser.parse_model(onnx_text) + """ + return onnx.parser.parse_graph(onnx_text) constant = 1 / (pred.shape[1] - 1) - onnx_text_multiclass = f""" - < - ir_version: 10, - opset_import: ["" : 21] - > - LogLossObjective (double[N, M] predictions_in) => (double[N, M] predictions_out) + onnx_text += f""" {{ constant = Constant() a = Mul(predictions_in, constant) predictions_out = Softmax(a) }} """ - return onnx.parser.parse_model(onnx_text_multiclass) + return onnx.parser.parse_graph(onnx_text) def metric(self) -> ExponentialMetric: return ExponentialMetric() diff --git a/legateboost/onnx_utils.py b/legateboost/onnx_utils.py new file mode 100644 index 00000000..c1c3b8f6 --- /dev/null +++ b/legateboost/onnx_utils.py @@ -0,0 +1,104 @@ +from typing import Any, List + +import numpy as np + +import cupynumeric as cn + +# onnx is imported only if needed - keep this a soft dependency +try: + import onnx +except ImportError: + pass + + +def make_model(graph: onnx.GraphProto) -> onnx.ModelProto: + # make model with appropriate opset imports for legate-boost + LEGATEBOOST_ONNX_OPSET_IMPORTS = [ + onnx.helper.make_opsetid("ai.onnx.ml", 3), + onnx.helper.make_opsetid("", 21), + ] + return onnx.helper.make_model(graph, opset_imports=LEGATEBOOST_ONNX_OPSET_IMPORTS) + + +def reshape_predictions(graph: onnx.GraphProto, pred: cn.ndarray) -> onnx.GraphProto: + # àppend an onnx graph that shapes the predictions equivalently to pred + shape = list(pred.shape) + shape[0] = -1 + out_type = "int64" if pred.dtype == cn.int64 else "double" + onnx_text = f""" + ReshapePredictions ({out_type}[N, M] predictions_in) => ({out_type}{shape} predictions_out) + {{ + shape = Constant() + predictions_out = Reshape(predictions_in, shape) + }} + """ # noqa: E501 + reshape_graph = onnx.parser.parse_graph(onnx_text) + graph = onnx.compose.merge_graphs( + graph, + reshape_graph, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="reshape_", + ) + return graph + + +def 
mirror_predict_proba_output(graph: onnx.GraphProto) -> onnx.GraphProto: + # where model outputs only true probability we need to add the false probability + onnx_text = """ + MirrorPredict (double[N, M] predictions_in) => (double[N, 2] predictions_out) + { + one = Constant() + false_probability = Sub(one, predictions_in) + predictions_out = Concat(false_probability, predictions_in) + } + """ # noqa: E501 + new_graph = onnx.parser.parse_graph(onnx_text) + new_graph = onnx.compose.merge_graphs( + graph, + new_graph, + io_map=[ + (graph.output[0].name, "predictions_in"), + ], + prefix2="mirror_", + ) + return new_graph + + +def init_predictions(model_init: cn.array, X_dtype: Any) -> onnx.GraphProto: + # form a graph that takes X_in and model_init as input and outputs + # model_init repeated n_rows times + + X_type_text = "double" if X_dtype == cn.float64 else "float" + onnx_text = f""" + InitPredictions ({X_type_text}[N, M] X_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) + {{ + X_out = Identity(X_in) + n_rows = Shape(X_in) + one = Constant() + tile_repeat = Concat(n_rows, one) + predictions_out = Tile(init, tile_repeat) + }} + """ # noqa: E501 + graph = onnx.parser.parse_graph(onnx_text) + graph.initializer.append( + onnx.numpy_helper.from_array(np.atleast_2d(model_init.__array__()), name="init") + ) + return graph + + +def merge_model_graphs(graphs: List[onnx.GraphProto]) -> onnx.GraphProto: + # merge a list of graphs into a single graph + combined = graphs[0] + for i, g in enumerate(graphs[1:]): + combined = onnx.compose.merge_graphs( + combined, + g, + io_map=[ + (combined.output[0].name, "X_in"), + (combined.output[1].name, "predictions_in"), + ], + prefix2="model_{}_".format(i), + ) + return combined diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 89f5a377..07641e9f 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -4,10 +4,12 @@ import cupynumeric as cn import legateboost as lb 
+from legateboost.onnx_utils import make_model def compare_model_predictions(model, X): - sess = ort.InferenceSession(model.to_onnx(X).SerializeToString()) + onnx_model = make_model(model.to_onnx(X)) + sess = ort.InferenceSession(onnx_model.SerializeToString()) feeds = { "X_in": X, } @@ -99,6 +101,12 @@ def regression_dataset(dtype, n_outputs): @pytest.mark.parametrize("n_outputs", [1, 5]) def test_regressor(Model, objective, regression_dataset): X, y = regression_dataset + if ( + Model in [lb.models.KRR, lb.models.NN] + and objective == "gamma" + and X.dtype == np.float32 + ): + pytest.skip("Skipping as numerically unstable") if objective in [ "quantile", "gamma_deviance", From 5dcc2c2cc2ea73d4102e689dd8393da24f9b01d6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 22 Apr 2025 05:16:11 -0700 Subject: [PATCH 17/21] Remove onnx type hints --- legateboost/onnx_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/legateboost/onnx_utils.py b/legateboost/onnx_utils.py index c1c3b8f6..3b2f72fb 100644 --- a/legateboost/onnx_utils.py +++ b/legateboost/onnx_utils.py @@ -11,7 +11,7 @@ pass -def make_model(graph: onnx.GraphProto) -> onnx.ModelProto: +def make_model(graph: Any) -> Any: # make model with appropriate opset imports for legate-boost LEGATEBOOST_ONNX_OPSET_IMPORTS = [ onnx.helper.make_opsetid("ai.onnx.ml", 3), @@ -20,7 +20,7 @@ def make_model(graph: onnx.GraphProto) -> onnx.ModelProto: return onnx.helper.make_model(graph, opset_imports=LEGATEBOOST_ONNX_OPSET_IMPORTS) -def reshape_predictions(graph: onnx.GraphProto, pred: cn.ndarray) -> onnx.GraphProto: +def reshape_predictions(graph: Any, pred: cn.ndarray) -> Any: # àppend an onnx graph that shapes the predictions equivalently to pred shape = list(pred.shape) shape[0] = -1 @@ -44,7 +44,7 @@ def reshape_predictions(graph: onnx.GraphProto, pred: cn.ndarray) -> onnx.GraphP return graph -def mirror_predict_proba_output(graph: onnx.GraphProto) -> onnx.GraphProto: +def 
mirror_predict_proba_output(graph: Any) -> Any: # where model outputs only true probability we need to add the false probability onnx_text = """ MirrorPredict (double[N, M] predictions_in) => (double[N, 2] predictions_out) @@ -66,7 +66,7 @@ def mirror_predict_proba_output(graph: onnx.GraphProto) -> onnx.GraphProto: return new_graph -def init_predictions(model_init: cn.array, X_dtype: Any) -> onnx.GraphProto: +def init_predictions(model_init: cn.array, X_dtype: Any) -> Any: # form a graph that takes X_in and model_init as input and outputs # model_init repeated n_rows times @@ -88,7 +88,7 @@ def init_predictions(model_init: cn.array, X_dtype: Any) -> onnx.GraphProto: return graph -def merge_model_graphs(graphs: List[onnx.GraphProto]) -> onnx.GraphProto: +def merge_model_graphs(graphs: List[Any]) -> Any: # merge a list of graphs into a single graph combined = graphs[0] for i, g in enumerate(graphs[1:]): From b9f67f0d8ce059875c9638dd6b8a02d1e5df899b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 23 Apr 2025 01:29:15 -0700 Subject: [PATCH 18/21] Update doc example --- legateboost/legateboost.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 4899ec42..a7e1b50c 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -906,7 +906,7 @@ def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: >>> y = np.random.random(X.shape[0]) >>> model = lb.LBRegressor(n_estimators=5).fit(X, y) >>> import onnxruntime as ort - >>> sess = ort.InferenceSession(model.to_onnx(X.dtype).SerializeToString()) + >>> sess = ort.InferenceSession(model.to_onnx(X).SerializeToString()) >>> onnx_pred = sess.run(None, {"X_in": X})[0] >>> assert np.allclose(model.predict(X), onnx_pred, atol=1e-6) >>> @@ -1204,7 +1204,7 @@ def to_onnx(self, X: cn.ndarray, predict_function: str = "predict") -> Any: >>> y = np.random.randint(0, 2, X.shape[0]) >>> model = 
lb.LBClassifier(n_estimators=5).fit(X, y) >>> import onnxruntime as ort - >>> sess = ort.InferenceSession(model.to_onnx(X.dtype, + >>> sess = ort.InferenceSession(model.to_onnx(X, ... predict_function="predict_proba").SerializeToString()) >>> onnx_pred = sess.run(None, {"X_in": X})[0] >>> assert np.allclose(model.predict_proba(X), onnx_pred, atol=1e-6) From f5dd467a51d5f1df46785eef4e2791e6dd87bbcc Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 24 Apr 2025 04:03:11 -0700 Subject: [PATCH 19/21] Make the onnx tree sparse with a recursive builder --- legateboost/models/tree.py | 114 +++++++++++++++++++++++++++---------- 1 file changed, 84 insertions(+), 30 deletions(-) diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index cc23d59b..5da43355 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -1,9 +1,11 @@ import copy import warnings +from dataclasses import dataclass from enum import IntEnum from typing import Any, Callable, List, Sequence, Union, cast import numpy as np +import numpy.typing as npt import cupynumeric as cn from legate.core import TaskTarget, get_legate_runtime, types @@ -316,6 +318,54 @@ def __mul__(self, scalar: Any) -> "Tree": new.leaf_value *= scalar return new + # copy the tree structure to numpy arrays + # cupynumeric element access is very slow + @dataclass + class TreeAsNumpy: + leaf_value: npt.NDArray[np.float64] + feature: npt.NDArray[np.int32] + split_value: npt.NDArray[np.float64] + gain: npt.NDArray[np.float64] + hessian: npt.NDArray[np.float64] + is_leaf: npt.NDArray[np.bool_] + + # structure of arrays for tree structure expected by onnx + # this container is a convenience to not have 7 function arguments + class OnnxSoa: + def __init__(self, size: int, n_outputs: int) -> None: + self.nodes_modes = np.full(size, "BRANCH_LEQ") + self.nodes_featureids = np.full(size, -1, dtype=np.int32) + self.nodes_truenodeids = np.full(size, -1, dtype=np.int32) + self.nodes_falsenodeids = np.full(size, 
-1, dtype=np.int32) + self.nodes_nodeids = np.arange(size, dtype=np.int32) + self.nodes_values = np.full(size, -1.0, dtype=np.float64) + self.leaf_weights = np.full((size, n_outputs), -1.0, dtype=np.float64) + + def recurse_tree( + self, tree: TreeAsNumpy, soa: OnnxSoa, old_node_idx: int, new_node_idx: int + ) -> int: + # new_node_idx is sparse + if tree.is_leaf[old_node_idx]: + soa.nodes_modes[new_node_idx] = "LEAF" + soa.leaf_weights[new_node_idx] = tree.leaf_value[old_node_idx] + return new_node_idx + else: + soa.nodes_modes[new_node_idx] = "BRANCH_LEQ" + soa.nodes_featureids[new_node_idx] = tree.feature[old_node_idx] + soa.nodes_values[new_node_idx] = tree.split_value[old_node_idx] + left_child_idx = new_node_idx + 1 + soa.nodes_truenodeids[new_node_idx] = left_child_idx + node_idx_counter = self.recurse_tree( + tree, soa, self.left_child(old_node_idx), left_child_idx + ) + right_child_idx = node_idx_counter + 1 + soa.nodes_falsenodeids[new_node_idx] = right_child_idx + node_idx_counter = self.recurse_tree( + tree, soa, self.right_child(old_node_idx), right_child_idx + ) + + return node_idx_counter + def to_onnx(self, X: cn.array) -> Any: from onnx import TensorProto, numpy_helper from onnx.helper import ( @@ -325,32 +375,39 @@ def to_onnx(self, X: cn.array) -> Any: np_dtype_to_tensor_dtype, ) + num_sparse_nodes = (self.hessian[:, 0] > 0.0).sum() + num_outputs = self.leaf_value.shape[1] + # copy the tree as numpy because single element + # access with cupynumeric is very slow + tree = Tree.TreeAsNumpy( + self.leaf_value.__array__(), + self.feature.__array__(), + self.split_value.__array__(), + self.gain.__array__(), + self.hessian.__array__(), + self.feature.__array__() == -1, + ) + soa = Tree.OnnxSoa(num_sparse_nodes, num_outputs) + # This recursive function could become a bottleneck for large trees + # In this case consider implmenting a C++ legate task for this conversion + # Cython could also work + self.recurse_tree(tree, soa, 0, 0) + onnx_nodes = [] - 
num_outputs = self.leaf_value.shape[1] - tree_max_nodes = self.feature.size - all_nodes_idx = np.arange(tree_max_nodes) - nodes_featureids = self.feature.__array__() - nodes_truenodeids = self.left_child(all_nodes_idx) - nodes_falsenodeids = self.right_child(all_nodes_idx) - node_modes = np.full(tree_max_nodes, "BRANCH_LEQ") - node_modes[self.is_leaf(all_nodes_idx)] = "LEAF" - leaf_targetids = np.full(tree_max_nodes, 0, dtype=np.int64) - # predict the leaf node index - # use it to later index into the 2d array of leaf weights - # as ONNX does not support 2d leaf weights - target_weights = all_nodes_idx.astype(np.float32) kwargs = {} # TreeEnsembleRegressor asks us to pass these as tensors when X.dtype is double + # we simply pass a set of indices as leaf weights and then add a node later to + # look up the (vector valued) leaf weights if X.dtype == np.float32: - kwargs["nodes_values"] = self.split_value.__array__() - kwargs["target_weights"] = target_weights + kwargs["nodes_values"] = soa.nodes_values.astype(np.float32) + kwargs["target_weights"] = soa.nodes_nodeids.astype(np.float32) else: kwargs["nodes_values_as_tensor"] = numpy_helper.from_array( - self.split_value.__array__(), name="nodes_values" + soa.nodes_values, name="nodes_values" ) kwargs["target_weights_as_tensor"] = numpy_helper.from_array( - target_weights.astype(np.float64), name="target_weights" + soa.nodes_nodeids.astype(np.float64), name="target_weights" ) # TreeEnsembleRegressor is deprecated, but its successor TreeEnsemble @@ -366,22 +423,20 @@ def to_onnx(self, X: cn.array) -> Any: membership_values=None, nodes_missing_value_tracks_true=None, nodes_hitrates=None, - nodes_modes=node_modes, - nodes_featureids=nodes_featureids, - nodes_truenodeids=nodes_truenodeids, - nodes_falsenodeids=nodes_falsenodeids, - nodes_nodeids=all_nodes_idx, - nodes_treeids=np.zeros(tree_max_nodes, dtype=np.int64), - target_ids=leaf_targetids, - target_nodeids=all_nodes_idx, - target_treeids=np.zeros(tree_max_nodes, 
dtype=np.int64), + nodes_modes=soa.nodes_modes, + nodes_featureids=soa.nodes_featureids, + nodes_truenodeids=soa.nodes_truenodeids, + nodes_falsenodeids=soa.nodes_falsenodeids, + nodes_nodeids=soa.nodes_nodeids, + nodes_treeids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + target_ids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), + target_nodeids=soa.nodes_nodeids, + target_treeids=np.zeros_like(soa.nodes_nodeids, dtype=np.int64), **kwargs, ) ) - leaf_weights = numpy_helper.from_array( - self.leaf_value.__array__(), name="leaf_weights" - ) + leaf_weights = numpy_helper.from_array(soa.leaf_weights, name="leaf_weights") predictions_out = make_tensor_value_info( "predictions_out", TensorProto.DOUBLE, [None, num_outputs] ) @@ -424,6 +479,5 @@ def to_onnx(self, X: cn.array) -> Any: [X_in, predictions_in], [X_out, predictions_out], [leaf_weights], - # opset_imports=[make_opsetid("ai.onnx.ml", 3), make_opsetid("", 21)], ) return graph From 05f8553b59b12362f95a305355d3f7c196f9b267 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 24 Apr 2025 04:12:15 -0700 Subject: [PATCH 20/21] Address review --- legateboost/models/krr.py | 3 ++- legateboost/objectives.py | 18 +++++++++--------- legateboost/test/test_onnx.py | 2 -- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/legateboost/models/krr.py b/legateboost/models/krr.py index 0e1ea61d..0a41788d 100644 --- a/legateboost/models/krr.py +++ b/legateboost/models/krr.py @@ -247,7 +247,8 @@ def to_onnx(self, X: cn.array) -> Any: import onnx X_type_text = "double" if X.dtype == cn.float64 else "float" - assert self.sigma is not None, "Has model been trained?" + if self.sigma is None: + raise ValueError("Model has not been trained. 
Cannot export to ONNX.") denominator = -2.0 * self.sigma**2 onnx_text = f""" KRRModel ({X_type_text}[N, M] X_in, double[N, K] predictions_in) => ({X_type_text}[N, M] X_out, double[N, K] predictions_out) diff --git a/legateboost/objectives.py b/legateboost/objectives.py index 1c819258..08219f29 100644 --- a/legateboost/objectives.py +++ b/legateboost/objectives.py @@ -71,7 +71,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: """ return pred - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> Any: """Returns an ONNX graph that accepts - "predictions_in" : 2D tensor of shape (n_samples, n_outputs) and type double. And outputs the transformed predictions. @@ -152,7 +152,7 @@ def onnx_output_class(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ - BaseModelOutputClass (double[N, M] predictions_in) => (double[N, M] predictions_out) + BaseModelOutputClass (double[N, M] predictions_in) => (int64[N, M] predictions_out) { predictions_out = ArgMax(predictions_in) } @@ -287,7 +287,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: pred[:, :, 1] = cn.clip(pred[:, :, 1], -5, 5) return pred - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ @@ -389,7 +389,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return cn.exp(pred) @override - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ @@ -445,7 +445,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: return cn.exp(pred) @override - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ @@ -602,7 +602,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: div = cn.sum(e_x, axis=1) return e_x / div[:, cn.newaxis] - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + 
def onnx_transform(self, pred: cn.ndarray) -> Any: import onnx operator_to_use = "Sigmoid" if pred.shape[1] == 1 else "Softmax" @@ -650,7 +650,7 @@ def gradient(self, y: cn.ndarray, pred: cn.ndarray) -> GradPair: def transform(self, pred: cn.ndarray) -> cn.ndarray: return self.one / (self.one + cn.exp(-pred)) - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ @@ -668,7 +668,7 @@ def onnx_output_class(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ - MultiLabelOutputClass (double[N, M] predictions_in) => (double[N, M] predictions_out) + MultiLabelOutputClass (double[N, M] predictions_in) => (int64[N, M] predictions_out) { half = Constant() greater = Greater(predictions_in, half) @@ -741,7 +741,7 @@ def transform(self, pred: cn.ndarray) -> cn.ndarray: K = pred.shape[1] # number of classes return logloss.transform((1 / (K - 1)) * pred) - def onnx_transform(self, pred: cn.ndarray) -> cn.ndarray: + def onnx_transform(self, pred: cn.ndarray) -> Any: import onnx onnx_text = """ diff --git a/legateboost/test/test_onnx.py b/legateboost/test/test_onnx.py index 07641e9f..57e75d75 100644 --- a/legateboost/test/test_onnx.py +++ b/legateboost/test/test_onnx.py @@ -16,9 +16,7 @@ def compare_model_predictions(model, X): pred = model.predict(cn.array(X)) feeds["predictions_in"] = np.zeros((X.shape[0], pred.shape[1])) onnx_pred = sess.run(None, feeds)[1] - onnx_pred = onnx_pred.squeeze() assert onnx_pred.dtype == np.float64 - pred = pred.squeeze() assert pred.shape == onnx_pred.shape assert np.allclose( onnx_pred, pred, atol=1e-2 if X.dtype == np.float32 else 1e-6 From 3faf4f39f54ba732854de61ba2e6df407e85e382 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 2 May 2025 03:00:06 -0700 Subject: [PATCH 21/21] Some mypy issues --- ci/run_mypy.sh | 1 + legateboost/models/tree.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git 
a/ci/run_mypy.sh b/ci/run_mypy.sh index fdb3e026..d3f818d1 100755 --- a/ci/run_mypy.sh +++ b/ci/run_mypy.sh @@ -14,6 +14,7 @@ set -e -E -u -o pipefail +mypy --version mypy \ --config-file ./pyproject.toml \ --exclude=legateboost/test \ diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 5da43355..b5827f13 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -2,7 +2,7 @@ import warnings from dataclasses import dataclass from enum import IntEnum -from typing import Any, Callable, List, Sequence, Union, cast +from typing import Any, Callable, Dict, List, Sequence, Union, cast import numpy as np import numpy.typing as npt @@ -333,13 +333,23 @@ class TreeAsNumpy: # this container is a convenience to not have 7 function arguments class OnnxSoa: def __init__(self, size: int, n_outputs: int) -> None: - self.nodes_modes = np.full(size, "BRANCH_LEQ") - self.nodes_featureids = np.full(size, -1, dtype=np.int32) - self.nodes_truenodeids = np.full(size, -1, dtype=np.int32) - self.nodes_falsenodeids = np.full(size, -1, dtype=np.int32) - self.nodes_nodeids = np.arange(size, dtype=np.int32) - self.nodes_values = np.full(size, -1.0, dtype=np.float64) - self.leaf_weights = np.full((size, n_outputs), -1.0, dtype=np.float64) + self.nodes_modes: npt.NDArray[str] = np.full(size, "BRANCH_LEQ") + self.nodes_featureids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_truenodeids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_falsenodeids: npt.NDArray[np.int32] = np.full( + size, -1, dtype=np.int32 + ) + self.nodes_nodeids: npt.NDArray[np.int32] = np.arange(size, dtype=np.int32) + self.nodes_values: npt.NDArray[np.float64] = np.full( + size, -1.0, dtype=np.float64 + ) + self.leaf_weights: npt.NDArray[np.float64] = np.full( + (size, n_outputs), -1.0, dtype=np.float64 + ) def recurse_tree( self, tree: TreeAsNumpy, soa: OnnxSoa, old_node_idx: int, new_node_idx: int @@ -395,7 +405,7 @@ def 
to_onnx(self, X: cn.array) -> Any: onnx_nodes = [] - kwargs = {} + kwargs: Dict[str, Any] = {} # TreeEnsembleRegressor asks us to pass these as tensors when X.dtype is double # we simply pass a set of indices as leaf weights and then add a node later to # look up the (vector valued) leaf weights