From 9ebc25bc082868424515d5562031b5c579b72907 Mon Sep 17 00:00:00 2001
From: Kyle Montemayor <kyle.e.montemayor@gmail.com>
Date: Mon, 18 May 2026 23:10:24 +0000
Subject: [PATCH 1/3] Fix latent bugs surfaced by ty migration

Resolves 4 `# ty: ignore` comments from PR #585 by fixing the underlying logic (not suppressing). Includes None-division risk in loss.py and masked None flow in optimizer code paths.

See: docs/plans/20260518-ty-ignore-resolution-meta.md
---
 .../common/torchrec/utils.py                  | 22 +++--
 .../node_classification_modeling_task_spec.py |  5 +-
 .../utils/profiler_wrapper.py                 |  2 +-
 gigl/src/common/models/layers/loss.py         | 22 +++--
 .../unit/experimental/test_torchrec_utils.py  | 83 +++++++++++++++++++
 .../common/models/layers/legacy_loss_test.py  | 78 +++++++++++++++++
 6 files changed, 195 insertions(+), 17 deletions(-)
 create mode 100644 tests/unit/experimental/test_torchrec_utils.py
 create mode 100644 tests/unit/src/common/models/layers/legacy_loss_test.py

diff --git a/gigl/experimental/knowledge_graph_embedding/common/torchrec/utils.py b/gigl/experimental/knowledge_graph_embedding/common/torchrec/utils.py
index f8726dcb1..9d3b54ffd 100644
--- a/gigl/experimental/knowledge_graph_embedding/common/torchrec/utils.py
+++ b/gigl/experimental/knowledge_graph_embedding/common/torchrec/utils.py
@@ -120,24 +120,30 @@ def get_sharding_plan(
 def apply_sparse_optimizer(
     parameters: Iterable[nn.Parameter],
     optimizer_cls: Optional[Type[Optimizer]] = None,
-    optimizer_kwargs: Dict[str, Any] = dict(),
+    optimizer_kwargs: Optional[Dict[str, Any]] = None,
 ) -> None:
-    """
-    Apply a sparse optimizer to the sparse/EBC parts of a model.
+    """Apply a sparse optimizer to the sparse/EBC parts of a model.
+
     This optimizer is fused, so it will be applied directly in the backward pass.
 
     This should only be used for sparse parameters.
 
     Args:
         parameters (Iterable[nn.Parameter]): The sparse parameters to apply the optimizer to.
-        optimizer_cls (Type[Optimizer], optional): The optimizer class to use. Defaults to RowWiseAdagrad.
-        optimizer_kwargs (Dict[str, Any], optional): Additional keyword arguments for the optimizer.
+        optimizer_cls (Type[Optimizer], optional): The optimizer class to use.
+            Defaults to ``RowWiseAdagrad`` when ``None`` is passed.
+        optimizer_kwargs (Dict[str, Any], optional): Additional keyword arguments
+            for the optimizer. Defaults to ``{"lr": 0.01}`` when both
+            ``optimizer_cls`` and ``optimizer_kwargs`` are unset.
     """
 
-    if not optimizer_cls and optimizer_kwargs:
+    if optimizer_cls is None:
         optimizer_cls = RowWiseAdagrad
-        optimizer_kwargs = {"lr": 0.01}
-    apply_optimizer_in_backward(optimizer_cls, parameters, optimizer_kwargs)  # ty: ignore[invalid-argument-type] TODO(ty-torch-api-surface): fix ty false positives around the torch API surface.
+        if not optimizer_kwargs:
+            optimizer_kwargs = {"lr": 0.01}
+    if optimizer_kwargs is None:
+        optimizer_kwargs = {}
+    apply_optimizer_in_backward(optimizer_cls, parameters, optimizer_kwargs)
 
 
 def apply_dense_optimizer(
diff --git a/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py b/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py
index bfaba1fb0..feed188b4 100644
--- a/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py
+++ b/gigl/src/common/modeling_task_specs/node_classification_modeling_task_spec.py
@@ -248,11 +248,14 @@ def train(
             graph_backend=self._graph_backend,
             device=device,
         )
+        assert data_loaders.train_main is not None, (
+            "train_main dataloader required for training"
+        )
         best_val_acc = 0.0
         for epoch in range(self.__num_epochs):
             logger.info(f"Batch training... for epoch {epoch}/{self.__num_epochs}")
             train_loss = self._train(
-                data_loader=data_loaders.train_main,  # type: ignore[arg-type]  # ty: ignore[invalid-argument-type] TODO(ty-torch-api-surface): fix ty false positives around the torch API surface.
+                data_loader=data_loaders.train_main,
                 device=device,
             )
             train_loss_str = (
diff --git a/gigl/src/common/modeling_task_specs/utils/profiler_wrapper.py b/gigl/src/common/modeling_task_specs/utils/profiler_wrapper.py
index 9e634fce4..2a7c423a5 100644
--- a/gigl/src/common/modeling_task_specs/utils/profiler_wrapper.py
+++ b/gigl/src/common/modeling_task_specs/utils/profiler_wrapper.py
@@ -20,7 +20,7 @@
 class TorchProfiler:
     def __init__(self, **kwargs) -> None:
         self.trace_handler = tensorboard_trace_handler(
-            dir_name=TMP_PROFILER_LOG_DIR_NAME,  # type: ignore[arg-type]  # ty: ignore[invalid-argument-type] TODO(ty-torch-api-surface): fix ty false positives around the torch API surface.
+            dir_name=TMP_PROFILER_LOG_DIR_NAME.uri,
             use_gzip=True,
         )
         self.wait = int(kwargs.get("wait", 5))
diff --git a/gigl/src/common/models/layers/loss.py b/gigl/src/common/models/layers/loss.py
index 08ef93c7a..9ca934126 100644
--- a/gigl/src/common/models/layers/loss.py
+++ b/gigl/src/common/models/layers/loss.py
@@ -100,18 +100,27 @@ def forward(
 
 
 class SoftmaxLoss(nn.Module):
-    """
-    A loss layer built on top of the PyTorch implementation of the softmax cross entropy loss.
+    """A loss layer built on top of the PyTorch implementation of the softmax cross entropy loss.
+
+    The loss function by default calculates the loss by
+        cross_entropy(all_scores / softmax_temperature, ys, reduction='sum').
 
-    The loss function by default calculate the loss by
-        cross_entropy(all_scores, ys, reduction='sum')
+    Dividing the scores by ``softmax_temperature`` controls the sharpness of the
+    softmax distribution. A temperature of ``1.0`` is a no-op and corresponds to
+    plain cross-entropy.
 
     See: https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for more information.
+
+    Args:
+        softmax_temperature (float): Scaling factor applied via ``scores /
+            softmax_temperature`` before computing cross-entropy. Defaults to
+            ``1.0`` (no scaling). Must be non-zero; the caller is responsible for
+            supplying a finite, non-zero value.
     """
 
     def __init__(
         self,
-        softmax_temperature: Optional[float] = None,
+        softmax_temperature: float = 1.0,
     ):
         super(SoftmaxLoss, self).__init__()
         self.softmax_temperature = softmax_temperature
@@ -142,8 +151,7 @@ def _calculate_softmax_loss(
         )  # shape=[num_pos_nodes]
 
         loss = F.cross_entropy(
-            input=all_scores
-            / self.softmax_temperature,  # https://github.com/Snapchat/GiGL/issues/408  # ty: ignore[unsupported-operator] TODO(ty-torch-union-inference): fix ty Tensor/Module union inference regressions.
+            input=all_scores / self.softmax_temperature,
             target=ys,
             reduction="sum",
         )
diff --git a/tests/unit/experimental/test_torchrec_utils.py b/tests/unit/experimental/test_torchrec_utils.py
new file mode 100644
index 000000000..6307a7be1
--- /dev/null
+++ b/tests/unit/experimental/test_torchrec_utils.py
@@ -0,0 +1,83 @@
+"""Regression tests for ``apply_sparse_optimizer`` in ``gigl.experimental.knowledge_graph_embedding.common.torchrec.utils``.
+
+These tests guard the optimizer-flow bug surfaced by the ty migration: the
+previous guard ``if not optimizer_cls and optimizer_kwargs`` only fired when
+both arguments were truthy, allowing ``None`` to leak through into
+``apply_optimizer_in_backward`` whenever ``optimizer_cls`` was unset but
+``optimizer_kwargs`` was the default empty dict.
+"""
+
+from typing import Any
+from unittest.mock import patch
+
+import torch
+import torch.nn as nn
+from torch.optim import SGD
+
+from gigl.experimental.knowledge_graph_embedding.common.torchrec import utils
+from tests.test_assets.test_case import TestCase
+
+
+def _make_dummy_parameters() -> list[nn.Parameter]:
+    """Build a one-parameter list so the optimizer call has something to bind to."""
+    return [nn.Parameter(torch.zeros(1))]
+
+
+class ApplySparseOptimizerTest(TestCase):
+    def test_none_optimizer_cls_defaults_to_rowwise_adagrad(self) -> None:
+        """Calling without ``optimizer_cls`` must default to RowWiseAdagrad.
+
+        Regression: the old guard ``if not optimizer_cls and optimizer_kwargs``
+        skipped the assignment when ``optimizer_kwargs`` defaulted to ``{}``,
+        leaving ``optimizer_cls=None`` to crash ``apply_optimizer_in_backward``.
+        """
+        parameters = _make_dummy_parameters()
+        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
+            utils.apply_sparse_optimizer(parameters=parameters)
+            mock_apply.assert_called_once()
+            forwarded_cls, forwarded_params, forwarded_kwargs = mock_apply.call_args[0]
+            self.assertIs(forwarded_cls, utils.RowWiseAdagrad)
+            self.assertEqual(list(forwarded_params), parameters)
+            self.assertEqual(forwarded_kwargs, {"lr": 0.01})
+
+    def test_none_optimizer_cls_with_kwargs_still_defaults_to_rowwise_adagrad(
+        self,
+    ) -> None:
+        """``optimizer_cls=None`` with user kwargs uses RowWiseAdagrad + user kwargs.
+
+        The user-supplied kwargs (when non-empty) are preserved verbatim; only
+        the missing class is filled in.
+        """
+        parameters = _make_dummy_parameters()
+        user_kwargs: dict[str, Any] = {"lr": 0.5}
+        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
+            utils.apply_sparse_optimizer(
+                parameters=parameters, optimizer_kwargs=user_kwargs
+            )
+            forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
+            self.assertIs(forwarded_cls, utils.RowWiseAdagrad)
+            self.assertEqual(forwarded_kwargs, {"lr": 0.5})
+
+    def test_explicit_optimizer_cls_is_forwarded(self) -> None:
+        """An explicit ``optimizer_cls`` must be forwarded unchanged."""
+        parameters = _make_dummy_parameters()
+        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
+            utils.apply_sparse_optimizer(parameters=parameters, optimizer_cls=SGD)
+            forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
+            self.assertIs(forwarded_cls, SGD)
+            # No kwargs were supplied; the call site sees an empty dict, not None.
+            self.assertEqual(forwarded_kwargs, {})
+
+    def test_explicit_optimizer_cls_and_kwargs_forwarded(self) -> None:
+        """Explicit ``optimizer_cls`` and kwargs must flow through verbatim."""
+        parameters = _make_dummy_parameters()
+        user_kwargs: dict[str, Any] = {"lr": 0.1, "momentum": 0.9}
+        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
+            utils.apply_sparse_optimizer(
+                parameters=parameters,
+                optimizer_cls=SGD,
+                optimizer_kwargs=user_kwargs,
+            )
+            forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
+            self.assertIs(forwarded_cls, SGD)
+            self.assertEqual(forwarded_kwargs, {"lr": 0.1, "momentum": 0.9})
diff --git a/tests/unit/src/common/models/layers/legacy_loss_test.py b/tests/unit/src/common/models/layers/legacy_loss_test.py
new file mode 100644
index 000000000..8ec3b7c61
--- /dev/null
+++ b/tests/unit/src/common/models/layers/legacy_loss_test.py
@@ -0,0 +1,78 @@
+"""Regression tests for the legacy loss layers under ``gigl.src.common.models.layers.loss``.
+
+These tests guard against latent ``None`` propagation bugs that the ty migration
+surfaced. In particular, ``SoftmaxLoss.softmax_temperature`` used to be typed as
+``Optional[float]`` with a ``None`` default, which made the in-loop expression
+``all_scores / self.softmax_temperature`` raise ``TypeError`` at runtime whenever
+the constructor was called without an explicit temperature.
+"""
+
+import torch
+
+from gigl.src.common.models.layers.loss import SoftmaxLoss
+from gigl.src.common.types.graph_data import CondensedEdgeType
+from gigl.src.common.types.task_inputs import BatchScores
+from tests.test_assets.test_case import TestCase
+
+
+def _make_batch_scores() -> dict[CondensedEdgeType, BatchScores]:
+    """Build a minimal non-empty BatchScores dict suitable for SoftmaxLoss.forward.
+
+    ``BatchScores`` is annotated as ``FloatTensor`` but PyTorch factories return
+    plain ``Tensor`` — PR 3 of the ty-cleanup plan will relax these annotations
+    to ``Tensor``. Until then, ty rejects the call, so we ignore the
+    specialization mismatch at the construction site.
+    """
+    pos_scores = torch.tensor([[0.8, 0.6]])
+    hard_neg_scores = torch.tensor([[0.1, 0.2]])
+    random_neg_scores = torch.tensor([[0.05, 0.15]])
+    return {
+        CondensedEdgeType(0): BatchScores(
+            pos_scores=pos_scores,  # ty: ignore[invalid-argument-type] TODO(ty-torch-tensor-specialization): cleared by PR 3 relaxing BatchScores to Tensor.
+            hard_neg_scores=hard_neg_scores,  # ty: ignore[invalid-argument-type] TODO(ty-torch-tensor-specialization): cleared by PR 3 relaxing BatchScores to Tensor.
+            random_neg_scores=random_neg_scores,  # ty: ignore[invalid-argument-type] TODO(ty-torch-tensor-specialization): cleared by PR 3 relaxing BatchScores to Tensor.
+        )
+    }
+
+
+class SoftmaxLossTest(TestCase):
+    def test_default_temperature_does_not_raise(self) -> None:
+        """SoftmaxLoss must be constructible without explicit temperature.
+
+        Regression: before the ty-driven fix, ``softmax_temperature`` defaulted to
+        ``None``, so ``all_scores / self.softmax_temperature`` raised
+        ``TypeError: unsupported operand type(s) for /: 'Tensor' and 'NoneType'``.
+        """
+        loss_fn = SoftmaxLoss()
+        # The default is 1.0 (no scaling), and dividing by 1.0 must not raise.
+        self.assertEqual(loss_fn.softmax_temperature, 1.0)
+        loss, sample_size = loss_fn(loss_input=[_make_batch_scores()])
+        self.assertTrue(torch.isfinite(loss).item())
+        self.assertGreater(sample_size, 0)
+
+    def test_default_temperature_matches_unit_scale(self) -> None:
+        """Default temperature ``1.0`` produces the same loss as explicit ``1.0``."""
+        default_loss, _ = SoftmaxLoss()(loss_input=[_make_batch_scores()])
+        explicit_loss, _ = SoftmaxLoss(softmax_temperature=1.0)(
+            loss_input=[_make_batch_scores()]
+        )
+        self.assert_tensor_equality(default_loss, explicit_loss)
+
+    def test_custom_temperature_scales_loss(self) -> None:
+        """Smaller temperatures sharpen the softmax, yielding a different loss."""
+        unit_loss, _ = SoftmaxLoss(softmax_temperature=1.0)(
+            loss_input=[_make_batch_scores()]
+        )
+        sharper_loss, _ = SoftmaxLoss(softmax_temperature=0.5)(
+            loss_input=[_make_batch_scores()]
+        )
+        # The two scalar losses must differ; both must be finite.
+        self.assertTrue(torch.isfinite(unit_loss).item())
+        self.assertTrue(torch.isfinite(sharper_loss).item())
+        self.assertNotAlmostEqual(unit_loss.item(), sharper_loss.item())
+
+    def test_empty_input_returns_zero_loss(self) -> None:
+        """An empty ``loss_input`` list yields a zero loss with sample_size 1."""
+        loss, sample_size = SoftmaxLoss()(loss_input=[])
+        self.assert_tensor_equality(loss, torch.tensor(0.0))
+        self.assertEqual(sample_size, 1)

From efbbd1810c8501cb66e7689c34af1bcc5cf3d5a9 Mon Sep 17 00:00:00 2001
From: Kyle Montemayor <kyle.e.montemayor@gmail.com>
Date: Tue, 19 May 2026 17:46:02 +0000
Subject: [PATCH 2/3] Address codex review: rename regression test to match
 discovery pattern

The repo's default unit-test pattern is *_test.py (Makefile:29,
tests/unit/main.py:8); the original name test_torchrec_utils.py uses
the prefix form, so the new regression coverage for apply_sparse_optimizer
would be silently skipped by make unit_test_py / CI.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../{test_torchrec_utils.py => torchrec_utils_test.py}            | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/unit/experimental/{test_torchrec_utils.py => torchrec_utils_test.py} (100%)

diff --git a/tests/unit/experimental/test_torchrec_utils.py b/tests/unit/experimental/torchrec_utils_test.py
similarity index 100%
rename from tests/unit/experimental/test_torchrec_utils.py
rename to tests/unit/experimental/torchrec_utils_test.py

From 93fb43e46a407ab1860f37b1140f8220e40447c8 Mon Sep 17 00:00:00 2001
From: Kyle Montemayor <kyle.e.montemayor@gmail.com>
Date: Tue, 19 May 2026 20:55:06 +0000
Subject: [PATCH 3/3] Address PR feedback: simplify regression tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Consolidate SoftmaxLoss regression coverage into the existing
  loss_test.py (one assertion on the default temperature).
- Delete legacy_loss_test.py (was a misnamed parallel file created
  to avoid clobbering the existing loss_test.py).
- Trim torchrec_utils_test.py from 4 tests to 1 — the actual
  regression path (defaults flowing through to RowWiseAdagrad).
- Drop test-side narrative docstrings that re-described what the
  fix did rather than the contract under test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../unit/experimental/torchrec_utils_test.py  | 78 +++----------------
 .../common/models/layers/legacy_loss_test.py  | 78 -------------------
 .../src/common/models/layers/loss_test.py     |  9 +++
 3 files changed, 18 insertions(+), 147 deletions(-)
 delete mode 100644 tests/unit/src/common/models/layers/legacy_loss_test.py

diff --git a/tests/unit/experimental/torchrec_utils_test.py b/tests/unit/experimental/torchrec_utils_test.py
index 6307a7be1..3a8f07061 100644
--- a/tests/unit/experimental/torchrec_utils_test.py
+++ b/tests/unit/experimental/torchrec_utils_test.py
@@ -1,83 +1,23 @@
-"""Regression tests for ``apply_sparse_optimizer`` in ``gigl.experimental.knowledge_graph_embedding.common.torchrec.utils``.
+"""Regression tests for ``apply_sparse_optimizer``."""
 
-These tests guard the optimizer-flow bug surfaced by the ty migration: the
-previous guard ``if not optimizer_cls and optimizer_kwargs`` only fired when
-both arguments were truthy, allowing ``None`` to leak through into
-``apply_optimizer_in_backward`` whenever ``optimizer_cls`` was unset but
-``optimizer_kwargs`` was the default empty dict.
-"""
-
-from typing import Any
 from unittest.mock import patch
 
 import torch
 import torch.nn as nn
-from torch.optim import SGD
 
 from gigl.experimental.knowledge_graph_embedding.common.torchrec import utils
 from tests.test_assets.test_case import TestCase
 
 
-def _make_dummy_parameters() -> list[nn.Parameter]:
-    """Build a one-parameter list so the optimizer call has something to bind to."""
-    return [nn.Parameter(torch.zeros(1))]
-
-
 class ApplySparseOptimizerTest(TestCase):
-    def test_none_optimizer_cls_defaults_to_rowwise_adagrad(self) -> None:
-        """Calling without ``optimizer_cls`` must default to RowWiseAdagrad.
-
-        Regression: the old guard ``if not optimizer_cls and optimizer_kwargs``
-        skipped the assignment when ``optimizer_kwargs`` defaulted to ``{}``,
-        leaving ``optimizer_cls=None`` to crash ``apply_optimizer_in_backward``.
+    def test_default_optimizer_uses_rowwise_adagrad(self) -> None:
+        """When called with neither ``optimizer_cls`` nor ``optimizer_kwargs``,
+        ``apply_sparse_optimizer`` falls back to ``RowWiseAdagrad`` with
+        ``lr=0.01`` and forwards them to ``apply_optimizer_in_backward``.
         """
-        parameters = _make_dummy_parameters()
+        parameters = [nn.Parameter(torch.zeros(1))]
         with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
             utils.apply_sparse_optimizer(parameters=parameters)
-            mock_apply.assert_called_once()
-            forwarded_cls, forwarded_params, forwarded_kwargs = mock_apply.call_args[0]
-            self.assertIs(forwarded_cls, utils.RowWiseAdagrad)
-            self.assertEqual(list(forwarded_params), parameters)
-            self.assertEqual(forwarded_kwargs, {"lr": 0.01})
-
-    def test_none_optimizer_cls_with_kwargs_still_defaults_to_rowwise_adagrad(
-        self,
-    ) -> None:
-        """``optimizer_cls=None`` with user kwargs uses RowWiseAdagrad + user kwargs.
-
-        The user-supplied kwargs (when non-empty) are preserved verbatim; only
-        the missing class is filled in.
-        """
-        parameters = _make_dummy_parameters()
-        user_kwargs: dict[str, Any] = {"lr": 0.5}
-        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
-            utils.apply_sparse_optimizer(
-                parameters=parameters, optimizer_kwargs=user_kwargs
-            )
-            forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
-            self.assertIs(forwarded_cls, utils.RowWiseAdagrad)
-            self.assertEqual(forwarded_kwargs, {"lr": 0.5})
-
-    def test_explicit_optimizer_cls_is_forwarded(self) -> None:
-        """An explicit ``optimizer_cls`` must be forwarded unchanged."""
-        parameters = _make_dummy_parameters()
-        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
-            utils.apply_sparse_optimizer(parameters=parameters, optimizer_cls=SGD)
-            forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
-            self.assertIs(forwarded_cls, SGD)
-            # No kwargs were supplied; the call site sees an empty dict, not None.
-            self.assertEqual(forwarded_kwargs, {})
-
-    def test_explicit_optimizer_cls_and_kwargs_forwarded(self) -> None:
-        """Explicit ``optimizer_cls`` and kwargs must flow through verbatim."""
-        parameters = _make_dummy_parameters()
-        user_kwargs: dict[str, Any] = {"lr": 0.1, "momentum": 0.9}
-        with patch.object(utils, "apply_optimizer_in_backward") as mock_apply:
-            utils.apply_sparse_optimizer(
-                parameters=parameters,
-                optimizer_cls=SGD,
-                optimizer_kwargs=user_kwargs,
-            )
-            forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
-            self.assertIs(forwarded_cls, SGD)
-            self.assertEqual(forwarded_kwargs, {"lr": 0.1, "momentum": 0.9})
+        forwarded_cls, _, forwarded_kwargs = mock_apply.call_args[0]
+        self.assertIs(forwarded_cls, utils.RowWiseAdagrad)
+        self.assertEqual(forwarded_kwargs, {"lr": 0.01})
diff --git a/tests/unit/src/common/models/layers/legacy_loss_test.py b/tests/unit/src/common/models/layers/legacy_loss_test.py
deleted file mode 100644
index 8ec3b7c61..000000000
--- a/tests/unit/src/common/models/layers/legacy_loss_test.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""Regression tests for the legacy loss layers under ``gigl.src.common.models.layers.loss``.
-
-These tests guard against latent ``None`` propagation bugs that the ty migration
-surfaced. In particular, ``SoftmaxLoss.softmax_temperature`` used to be typed as
-``Optional[float]`` with a ``None`` default, which made the in-loop expression
-``all_scores / self.softmax_temperature`` raise ``TypeError`` at runtime whenever
-the constructor was called without an explicit temperature.
-"""
-
-import torch
-
-from gigl.src.common.models.layers.loss import SoftmaxLoss
-from gigl.src.common.types.graph_data import CondensedEdgeType
-from gigl.src.common.types.task_inputs import BatchScores
-from tests.test_assets.test_case import TestCase
-
-
-def _make_batch_scores() -> dict[CondensedEdgeType, BatchScores]:
-    """Build a minimal non-empty BatchScores dict suitable for SoftmaxLoss.forward.
-
-    ``BatchScores`` is annotated as ``FloatTensor`` but PyTorch factories return
-    plain ``Tensor`` — PR 3 of the ty-cleanup plan will relax these annotations
-    to ``Tensor``. Until then, ty rejects the call, so we ignore the
-    specialization mismatch at the construction site.
-    """
-    pos_scores = torch.tensor([[0.8, 0.6]])
-    hard_neg_scores = torch.tensor([[0.1, 0.2]])
-    random_neg_scores = torch.tensor([[0.05, 0.15]])
-    return {
-        CondensedEdgeType(0): BatchScores(
-            pos_scores=pos_scores,  # ty: ignore[invalid-argument-type] TODO(ty-torch-tensor-specialization): cleared by PR 3 relaxing BatchScores to Tensor.
-            hard_neg_scores=hard_neg_scores,  # ty: ignore[invalid-argument-type] TODO(ty-torch-tensor-specialization): cleared by PR 3 relaxing BatchScores to Tensor.
-            random_neg_scores=random_neg_scores,  # ty: ignore[invalid-argument-type] TODO(ty-torch-tensor-specialization): cleared by PR 3 relaxing BatchScores to Tensor.
-        )
-    }
-
-
-class SoftmaxLossTest(TestCase):
-    def test_default_temperature_does_not_raise(self) -> None:
-        """SoftmaxLoss must be constructible without explicit temperature.
-
-        Regression: before the ty-driven fix, ``softmax_temperature`` defaulted to
-        ``None``, so ``all_scores / self.softmax_temperature`` raised
-        ``TypeError: unsupported operand type(s) for /: 'Tensor' and 'NoneType'``.
-        """
-        loss_fn = SoftmaxLoss()
-        # The default is 1.0 (no scaling), and dividing by 1.0 must not raise.
-        self.assertEqual(loss_fn.softmax_temperature, 1.0)
-        loss, sample_size = loss_fn(loss_input=[_make_batch_scores()])
-        self.assertTrue(torch.isfinite(loss).item())
-        self.assertGreater(sample_size, 0)
-
-    def test_default_temperature_matches_unit_scale(self) -> None:
-        """Default temperature ``1.0`` produces the same loss as explicit ``1.0``."""
-        default_loss, _ = SoftmaxLoss()(loss_input=[_make_batch_scores()])
-        explicit_loss, _ = SoftmaxLoss(softmax_temperature=1.0)(
-            loss_input=[_make_batch_scores()]
-        )
-        self.assert_tensor_equality(default_loss, explicit_loss)
-
-    def test_custom_temperature_scales_loss(self) -> None:
-        """Smaller temperatures sharpen the softmax, yielding a different loss."""
-        unit_loss, _ = SoftmaxLoss(softmax_temperature=1.0)(
-            loss_input=[_make_batch_scores()]
-        )
-        sharper_loss, _ = SoftmaxLoss(softmax_temperature=0.5)(
-            loss_input=[_make_batch_scores()]
-        )
-        # The two scalar losses must differ; both must be finite.
-        self.assertTrue(torch.isfinite(unit_loss).item())
-        self.assertTrue(torch.isfinite(sharper_loss).item())
-        self.assertNotAlmostEqual(unit_loss.item(), sharper_loss.item())
-
-    def test_empty_input_returns_zero_loss(self) -> None:
-        """An empty ``loss_input`` list yields a zero loss with sample_size 1."""
-        loss, sample_size = SoftmaxLoss()(loss_input=[])
-        self.assert_tensor_equality(loss, torch.tensor(0.0))
-        self.assertEqual(sample_size, 1)
diff --git a/tests/unit/src/common/models/layers/loss_test.py b/tests/unit/src/common/models/layers/loss_test.py
index 5c0f3f073..4be356ca3 100644
--- a/tests/unit/src/common/models/layers/loss_test.py
+++ b/tests/unit/src/common/models/layers/loss_test.py
@@ -2,6 +2,7 @@
 import torch.nn.functional as F
 
 from gigl.nn.loss import RetrievalLoss
+from gigl.src.common.models.layers.loss import SoftmaxLoss
 from tests.test_assets.test_case import TestCase
 
 
@@ -176,3 +177,11 @@ def test_empty_loss(self):
             scores=empty_scores, query_ids=query_ids, candidate_ids=candidate_ids
         )
         self.assert_tensor_equality(loss, expected_loss)
+
+
+class SoftmaxLossDefaultTemperatureTest(TestCase):
+    def test_default_temperature_is_one(self) -> None:
+        """SoftmaxLoss defaults softmax_temperature to 1.0 so the forward-pass
+        division scores / softmax_temperature is always well-defined.
+        """
+        self.assertEqual(SoftmaxLoss().softmax_temperature, 1.0)