From db4073528db0f5b965080a80895bfd3b2b9c7e48 Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Sat, 30 May 2026 09:44:07 +0000
Subject: [PATCH 1/3] refactor(trace): compute the CPU Tensor trace in the
 container layer

The CPU Tensor trace built an identity UniTensor and called Contract, so the
container-level linalg_internal layer depended on the higher-level UniTensor
and Contract APIs -- an inverted layering. Compute the trace directly over
storage instead, and tighten the internal Trace API at the same time.

* Simplify the dispatch API: every Trace_internal_* / cuTrace_internal_*
  dispatcher (and the Tracefunc_oii typedef) now takes only (Tn, ax1, ax2) and
  returns the result Tensor. A small TraceParams helper
  (backend/linalg_internal_cpu/trace_dispatch.hpp) derives Ndiag, Nelem, accu,
  remain_rank_id, the reduced shape, and the is_2d flag once per call from
  shape and axes, so the dispatchers and the _trace_2d / _trace_nd kernels no
  longer plumb those through individually. Trace.cpp loses its derivation
  block and is now just validation + a dtype-keyed dispatch call.
* Add a public Tensor::strides() getter (declared in include/Tensor.hpp,
  defined in src/Tensor.cpp): for each logical axis, the storage offset
  increment per +1 step, derived from the shape and inverse mapper -- the same
  layout Tensor::at already indexes through. This is what the dispatchers use
  to sum the diagonal in place.
* Replace the body of _trace_2d / _trace_nd with a stride-aware reduction
  via PairwiseSum over a new range adaptor stride_view (snake_case, with a
  `r | stride(k)` pipe form; iterator stores the underlying base iterator so it
  stays valid independent of the view's lifetime). The diagonal stride is
  strides[ax1] + strides[ax2], so the previous contiguous() copy is dropped,
  and floating-point traces get the same O(log N * eps) error bound as
  linalg::Sum. Guard Ndiag == 0 / Nelem == 0 (unsigned (Ndiag - 1) would
  otherwise underflow into a huge span extent). Drop the UniTensor and
  Generator includes from the CPU translation unit.
* tests/linalg_test/stride_view_test.cpp exercises the iterator concept model,
  the pipe form, the iterator-outlives-view case, the non-dividing
  size-vs-stride branch, the zero-stride throw, and complex strided summation.
  Tests are reached through src/ added to the test binary's include path, so
  internal headers can be unit-tested directly.
* tests/Tensor_strides_test.cpp checks strides() against Tensor::at across
  ranks 2-5 permutations and across dtypes.
* tests/linalg_test/Trace_test.cpp covers the strided in-place path against
  the contiguous-clone reference across ranks 2-5, complex and integer dtypes,
  the rank-2 / rank-N output shape invariants, and swapped axis order.
* benchmarks/linalg/Trace_bm.cpp pits the in-place strided Trace against the
  "collect contiguously and reduce" alternatives Ivana asked for: the 3D
  contiguous baseline is a BLAS matrix-vector multiplication
  (tr(A[d,m,d]) over d as {middle, n*n} @ {n*n, 1} against vec(I_n)), and the
  2D contiguous baseline is the reshape trick (save A[n-1, n-1], view the rest
  as {n-1, n+1}, permute+contiguous, Sum the resulting first row, add the
  saved entry). On openblas-cpu debug, the strided path beats both baselines
  by orders of magnitude at the tested shapes.
* The discovery timeout for gtest_discover_tests is bumped to 120 s so the
  test binary's ASAN+MKL listing fits on slower runners.

Co-authored-by: Claude <noreply@anthropic.com>
---
 benchmarks/CMakeLists.txt                     |   1 +
 benchmarks/linalg/Trace_bm.cpp                | 301 ++++++++++++++++++
 include/Tensor.hpp                            |  11 +
 src/Tensor.cpp                                |  16 +
 .../linalg_internal_cpu/Trace_internal.cpp    | 247 +++++---------
 .../linalg_internal_cpu/Trace_internal.hpp    |  87 +----
 .../linalg_internal_gpu/cuTrace_internal.hpp  |  87 +----
 src/backend/linalg_internal_interface.hpp     |   6 +-
 src/linalg/Trace.cpp                          |  70 +---
 tests/CMakeLists.txt                          |   7 +-
 tests/Tensor_strides_test.cpp                 |  78 +++++
 tests/linalg_test/Trace_test.cpp              | 104 ++++++
 12 files changed, 629 insertions(+), 386 deletions(-)
 create mode 100644 benchmarks/linalg/Trace_bm.cpp
 create mode 100644 tests/Tensor_strides_test.cpp
 create mode 100644 tests/linalg_test/Trace_test.cpp

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index df7b0bedc..e9143302b 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(
   linalg/Directsum_bm.cpp
   linalg/Svd_bm.cpp
   linalg/Svd_truncate_bm.cpp
+  linalg/Trace_bm.cpp
   linalg/Vectordot_bm.cpp
   linalg/Lanczos_bm.cpp
   linalg/linalg_basic_bm.cpp
diff --git a/benchmarks/linalg/Trace_bm.cpp b/benchmarks/linalg/Trace_bm.cpp
new file mode 100644
index 000000000..177307c2c
--- /dev/null
+++ b/benchmarks/linalg/Trace_bm.cpp
@@ -0,0 +1,301 @@
+#include <benchmark/benchmark.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "cytnx.hpp"
+
+// Benchmarks for linalg::Trace.
+//
+// Two shapes are exercised:
+//
+//   * 3D path: a contiguous {n, middle, n} tensor traced over axes 0 and 2.
+//     The two traced axes are not adjacent in storage, so the diagonal stride
+//     (strides[0] + strides[2] = middle * n + 1) is large; each iteration is a
+//     genuine higher-rank trace rather than a simple 2D matrix trace.
+//   * 2D path: a contiguous {n, n} tensor traced over axes 0 and 1, exercising
+//     the _trace_2d / cuTrace_2d_kernel branch.
+//
+// For each shape, the in-place strided reduction is compared against a
+// "collect the traced elements contiguously and reduce them" baseline that
+// avoids reading the diagonal with a large stride:
+//
+//   * 3D matvec baseline: tr(A)[m] = <vec(I_n), vec(A[:, m, :])>, so stacking
+//     the middle outputs becomes a {middle, n*n} @ {n*n, 1} GEMM call against
+//     vec(I_n) -- a BLAS matrix-vector multiplication.
+//   * 2D vecdot baseline: tr(A) = <vec(I_n), vec(A)> -- a BLAS vector-vector
+//     dot product on the n*n-element flattenings of I and A.
+//   * 2D reshape trick: drop the last diagonal entry A[n-1, n-1], view the
+//     remaining n*n - 1 entries as {n-1, n+1}, permute + contiguous so the
+//     first column (which holds diag[0..n-2]) becomes a contiguous row, then
+//     reduce that row over the raw buffer and add the saved last entry.
+//
+// Every variant runs once before the timing loop and is checked against
+// linalg::Trace on the same input, so a wrong baseline would fail loudly
+// instead of producing fast-but-meaningless numbers.
+
+namespace BMTest_Trace {
+
+  // Maps a benchmarked dtype enum to its C++ storage type, so a baseline can
+  // read the raw buffer directly. Only the dtypes the 2D reshape trick is
+  // registered for need an entry.
+  template <unsigned int dtype>
+  struct BmCType;
+  template <>
+  struct BmCType<cytnx::Type.Double> {
+    using type = cytnx::cytnx_double;
+  };
+  template <>
+  struct BmCType<cytnx::Type.ComplexDouble> {
+    using type = cytnx::cytnx_complex128;
+  };
+
+  // Aborts the benchmark if `candidate` differs from `reference` by more than
+  // a small absolute tolerance. Both tensors must have the same dtype, and
+  // (after broadcasting via Tensor's arithmetic) be element-wise comparable.
+  static void VerifyAgainstTrace(const cytnx::Tensor& reference, const cytnx::Tensor& candidate,
+                                 const char* variant_name) {
+    auto diff = (candidate - reference).Abs();
+    const double max_err = diff.Max().item<cytnx::cytnx_double>();
+    if (!(max_err < 1e-6)) {
+      std::cerr << "[Trace benchmark] variant \"" << variant_name
+                << "\" disagrees with linalg::Trace; max abs error = " << max_err << "\n";
+      std::abort();
+    }
+  }
+
+  template <unsigned int dtype, cytnx::cytnx_int32 device = cytnx::Device.cpu>
+  static void BM_Trace_Strided_3D_Template(benchmark::State& state) {
+    const auto n = state.range(0);
+    const auto middle = state.range(1);
+    const auto tensor = cytnx::random::random_tensor({n, middle, n}, -1.0, 1.0, device, 0, dtype);
+    const auto reference = cytnx::linalg::Trace(tensor, 0, 2);
+    VerifyAgainstTrace(reference, cytnx::linalg::Trace(tensor, 0, 2), "Strided_3D");
+    for (auto _ : state) {
+      auto result = cytnx::linalg::Trace(tensor, 0, 2);
+      benchmark::DoNotOptimize(result);
+    }
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * n * n * middle);
+  }
+
+  // ND matvec baseline: tr(A)[m] = <vec(I_n), vec(A[:, m, :])>. Stacking the
+  // middle outputs becomes a {middle, n*n} @ {n*n, 1} GEMM call against vec(I_n).
+  template <unsigned int dtype, cytnx::cytnx_int32 device = cytnx::Device.cpu>
+  static void BM_Trace_Matvec_3D_Template(benchmark::State& state) {
+    const auto n = state.range(0);
+    const auto middle = state.range(1);
+    const auto tensor = cytnx::random::random_tensor({n, middle, n}, -1.0, 1.0, device, 0, dtype);
+    const auto vec_I = cytnx::eye(n, dtype, device).reshape({n * n, 1});
+    const auto reference = cytnx::linalg::Trace(tensor, 0, 2);
+    auto compute = [&]() {
+      auto packed = tensor.permute({1, 0, 2}).contiguous().reshape({middle, n * n});
+      return cytnx::linalg::Matmul(packed, vec_I).reshape({middle});
+    };
+    VerifyAgainstTrace(reference, compute(), "Matvec_3D");
+    for (auto _ : state) {
+      auto result = compute();
+      benchmark::DoNotOptimize(result);
+    }
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * n * n * middle);
+  }
+
+  template <unsigned int dtype, cytnx::cytnx_int32 device = cytnx::Device.cpu>
+  static void BM_Trace_Strided_2D_Template(benchmark::State& state) {
+    const auto n = state.range(0);
+    const auto tensor = cytnx::random::random_tensor({n, n}, -1.0, 1.0, device, 0, dtype);
+    const auto reference = cytnx::linalg::Trace(tensor, 0, 1);
+    VerifyAgainstTrace(reference, cytnx::linalg::Trace(tensor, 0, 1), "Strided_2D");
+    for (auto _ : state) {
+      auto result = cytnx::linalg::Trace(tensor, 0, 1);
+      benchmark::DoNotOptimize(result);
+    }
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * n * n);
+  }
+
+  // 2D vector-dot baseline: tr(A) = <vec(I_n), vec(A)> as a BLAS dot product
+  // on the n*n-element flattenings.
+  template <unsigned int dtype, cytnx::cytnx_int32 device = cytnx::Device.cpu>
+  static void BM_Trace_Vecdot_2D_Template(benchmark::State& state) {
+    const auto n = state.range(0);
+    const auto tensor = cytnx::random::random_tensor({n, n}, -1.0, 1.0, device, 0, dtype);
+    const auto vec_I = cytnx::eye(n, dtype, device).reshape({n * n});
+    const auto reference = cytnx::linalg::Trace(tensor, 0, 1);
+    auto compute = [&]() {
+      auto vec_A = tensor.reshape({n * n});
+      return cytnx::linalg::Vectordot(vec_I, vec_A).reshape({1});
+    };
+    VerifyAgainstTrace(reference, compute(), "Vecdot_2D");
+    for (auto _ : state) {
+      auto result = compute();
+      benchmark::DoNotOptimize(result);
+    }
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * n * n);
+  }
+
+  // 2D reshape trick: drop A[n-1, n-1], view the remaining n*n - 1 elements as
+  // {n-1, n+1}, permute + contiguous so the first column (which holds
+  // diag[0..n-2]) becomes a contiguous row, reduce that row, and add the saved
+  // last element.
+  //
+  // To keep the bulk data movement honest, the {n-1, n+1} view reuses the
+  // source storage directly: shrinking its size to (n-1)*(n+1) is a no-realloc
+  // metadata change (Storage::resize only reallocates when growing past the
+  // capacity), so the permute -> contiguous gather is the only copy the trick
+  // performs. The size is restored to n*n afterwards (and the dropped corner
+  // written back, since a grow-resize zero-fills any tail it has to reallocate)
+  // so the source tensor stays intact across iterations.
+  //
+  // The row reduction is device-specific: on CPU the contiguous row is summed
+  // over the raw buffer (no linalg::Sum / Accessor allocations on top of the
+  // gather); on GPU the raw buffer lives in device memory and cannot be read
+  // host-side, so the row is wrapped as a {n-1} tensor over the resized gather
+  // storage and reduced with linalg::Sum, and the corner is taken as a {1}
+  // tensor so the final add stays on the device.
+  template <unsigned int dtype, cytnx::cytnx_int32 device = cytnx::Device.cpu>
+  static void BM_Trace_Reshape_2D_Template(benchmark::State& state) {
+    using T = typename BmCType<dtype>::type;
+    const auto n = state.range(0);
+    const auto tensor = cytnx::random::random_tensor({n, n}, -1.0, 1.0, device, 0, dtype);
+    const auto reference = cytnx::linalg::Trace(tensor, 0, 1);
+    auto compute = [&]() {
+      auto& storage = tensor.storage();
+      if constexpr (device == cytnx::Device_class::cpu) {
+        const T last = tensor.at<T>(
+          {static_cast<cytnx::cytnx_uint64>(n - 1), static_cast<cytnx::cytnx_uint64>(n - 1)});
+        storage.resize((n - 1) * (n + 1));
+        auto view = cytnx::Tensor::from_storage(storage);
+        view.reshape_({n - 1, n + 1});
+        auto packed = view.permute({1, 0}).contiguous();
+        storage.resize(n * n);  // restore the source storage size
+        storage.at<T>((n * n) - 1) = last;  // and recover the dropped corner
+
+        const T* row = packed.storage().data<T>();
+        T sum = T(0);
+        for (cytnx::cytnx_int64 k = 0; k < n - 1; ++k) sum += row[k];
+        sum += last;
+
+        auto out = cytnx::Tensor({static_cast<cytnx::cytnx_uint64>(1)}, dtype, device);
+        out.storage().at<T>(0) = sum;
+        return out;
+      } else {
+        // Read the corner as a {1} device tensor before the storage shrinks.
+        auto last = tensor.reshape({n * n}).get({cytnx::Accessor((n * n) - 1)});
+        storage.resize((n - 1) * (n + 1));
+        auto view = cytnx::Tensor::from_storage(storage);
+        view.reshape_({n - 1, n + 1});
+        auto packed = view.permute({1, 0}).contiguous();
+        storage.resize(n * n);  // restore the source storage size
+
+        auto row_storage = packed.storage();
+        row_storage.resize(n - 1);  // first contiguous row = diag[0..n-2]
+        auto row = cytnx::Tensor::from_storage(row_storage);
+        return (cytnx::linalg::Sum(row) + last).reshape({1});
+      }
+    };
+    VerifyAgainstTrace(reference, compute(), "Reshape_2D");
+    for (auto _ : state) {
+      auto result = compute();
+      benchmark::DoNotOptimize(result);
+    }
+    state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * n * n);
+  }
+
+#define REGISTER_TRACE_3D_BENCHMARK(TypeName, TypeEnum)      \
+  BENCHMARK_TEMPLATE(BM_Trace_Strided_3D_Template, TypeEnum) \
+    ->Name("BM_Trace_Strided_3D_" #TypeName)                 \
+    ->Args({64, 64})                                         \
+    ->Args({256, 64})                                        \
+    ->Args({1024, 16})                                       \
+    ->Args({2048, 16})                                       \
+    ->Args({4096, 8})                                        \
+    ->Unit(benchmark::kMicrosecond);                         \
+  BENCHMARK_TEMPLATE(BM_Trace_Matvec_3D_Template, TypeEnum)  \
+    ->Name("BM_Trace_Matvec_3D_" #TypeName)                  \
+    ->Args({64, 64})                                         \
+    ->Args({256, 64})                                        \
+    ->Args({1024, 16})                                       \
+    ->Args({2048, 16})                                       \
+    ->Args({4096, 8})                                        \
+    ->Unit(benchmark::kMicrosecond);
+
+#define REGISTER_TRACE_2D_BENCHMARK(TypeName, TypeEnum)      \
+  BENCHMARK_TEMPLATE(BM_Trace_Strided_2D_Template, TypeEnum) \
+    ->Name("BM_Trace_Strided_2D_" #TypeName)                 \
+    ->Args({64})                                             \
+    ->Args({256})                                            \
+    ->Args({1024})                                           \
+    ->Args({4096})                                           \
+    ->Args({8192})                                           \
+    ->Unit(benchmark::kMicrosecond);                         \
+  BENCHMARK_TEMPLATE(BM_Trace_Vecdot_2D_Template, TypeEnum)  \
+    ->Name("BM_Trace_Vecdot_2D_" #TypeName)                  \
+    ->Args({64})                                             \
+    ->Args({256})                                            \
+    ->Args({1024})                                           \
+    ->Args({4096})                                           \
+    ->Args({8192})                                           \
+    ->Unit(benchmark::kMicrosecond);                         \
+  BENCHMARK_TEMPLATE(BM_Trace_Reshape_2D_Template, TypeEnum) \
+    ->Name("BM_Trace_Reshape_2D_" #TypeName)                 \
+    ->Args({64})                                             \
+    ->Args({256})                                            \
+    ->Args({1024})                                           \
+    ->Args({4096})                                           \
+    ->Args({8192})                                           \
+    ->Unit(benchmark::kMicrosecond);
+
+  REGISTER_TRACE_3D_BENCHMARK(Double, cytnx::Type.Double)
+  REGISTER_TRACE_3D_BENCHMARK(ComplexDouble, cytnx::Type.ComplexDouble)
+  REGISTER_TRACE_2D_BENCHMARK(Double, cytnx::Type.Double)
+  REGISTER_TRACE_2D_BENCHMARK(ComplexDouble, cytnx::Type.ComplexDouble)
+
+#ifdef UNI_GPU
+  #define REGISTER_GPU_TRACE_3D_BENCHMARK(TypeName, TypeEnum)                      \
+    BENCHMARK_TEMPLATE(BM_Trace_Strided_3D_Template, TypeEnum, cytnx::Device.cuda) \
+      ->Name("BM_gpu_Trace_Strided_3D_" #TypeName)                                 \
+      ->Args({256, 64})                                                            \
+      ->Args({1024, 16})                                                           \
+      ->Args({2048, 16})                                                           \
+      ->Args({4096, 8})                                                            \
+      ->Unit(benchmark::kMicrosecond);                                             \
+    BENCHMARK_TEMPLATE(BM_Trace_Matvec_3D_Template, TypeEnum, cytnx::Device.cuda)  \
+      ->Name("BM_gpu_Trace_Matvec_3D_" #TypeName)                                  \
+      ->Args({256, 64})                                                            \
+      ->Args({1024, 16})                                                           \
+      ->Args({2048, 16})                                                           \
+      ->Args({4096, 8})                                                            \
+      ->Unit(benchmark::kMicrosecond);
+
+  #define REGISTER_GPU_TRACE_2D_BENCHMARK(TypeName, TypeEnum)                      \
+    BENCHMARK_TEMPLATE(BM_Trace_Strided_2D_Template, TypeEnum, cytnx::Device.cuda) \
+      ->Name("BM_gpu_Trace_Strided_2D_" #TypeName)                                 \
+      ->Args({256})                                                                \
+      ->Args({1024})                                                               \
+      ->Args({4096})                                                               \
+      ->Args({8192})                                                               \
+      ->Unit(benchmark::kMicrosecond);                                             \
+    BENCHMARK_TEMPLATE(BM_Trace_Vecdot_2D_Template, TypeEnum, cytnx::Device.cuda)  \
+      ->Name("BM_gpu_Trace_Vecdot_2D_" #TypeName)                                  \
+      ->Args({256})                                                                \
+      ->Args({1024})                                                               \
+      ->Args({4096})                                                               \
+      ->Args({8192})                                                               \
+      ->Unit(benchmark::kMicrosecond);                                             \
+    BENCHMARK_TEMPLATE(BM_Trace_Reshape_2D_Template, TypeEnum, cytnx::Device.cuda) \
+      ->Name("BM_gpu_Trace_Reshape_2D_" #TypeName)                                 \
+      ->Args({256})                                                                \
+      ->Args({1024})                                                               \
+      ->Args({4096})                                                               \
+      ->Args({8192})                                                               \
+      ->Unit(benchmark::kMicrosecond);
+
+  REGISTER_GPU_TRACE_3D_BENCHMARK(Double, cytnx::Type.Double)
+  REGISTER_GPU_TRACE_3D_BENCHMARK(ComplexDouble, cytnx::Type.ComplexDouble)
+  REGISTER_GPU_TRACE_2D_BENCHMARK(Double, cytnx::Type.Double)
+  REGISTER_GPU_TRACE_2D_BENCHMARK(ComplexDouble, cytnx::Type.ComplexDouble)
+#endif  // UNI_GPU
+
+}  // namespace BMTest_Trace
diff --git a/include/Tensor.hpp b/include/Tensor.hpp
index 11fd45bf7..3e7a4c334 100644
--- a/include/Tensor.hpp
+++ b/include/Tensor.hpp
@@ -596,6 +596,17 @@ namespace cytnx {
     */
     const std::vector<cytnx_uint64> &shape() const { return this->_impl->shape(); }
 
+    /**
+    @brief the storage strides of the Tensor
+    @return [std::vector<cytnx_uint64>] for each logical axis, the distance in the
+    underlying storage between consecutive elements along that axis
+    @details cytnx tensors store a dense permutation of their logical axes, so the
+    stride of every axis is well defined (this is the layout Tensor::at indexes
+    through). For a contiguous tensor these are the row-major strides; for a
+    permuted (non-contiguous) view they reflect the permuted memory order.
+    */
+    std::vector<cytnx_uint64> strides() const;
+
     /**
         @brief the rank of the Tensor
         @return [cytnx_uint64] the rank of the Tensor
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 7d5d51544..bd595c787 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -907,6 +907,22 @@ namespace cytnx {
     return is(this->_impl->storage(), rhs.storage());
   }
 
+  std::vector<cytnx_uint64> Tensor::strides() const {
+    // The storage is laid out contiguously in memory order; _invmapper[i] gives
+    // the logical axis sitting at memory position i (innermost last). The stride
+    // of a logical axis is the product of the memory-order extents inside it.
+    const std::vector<cytnx_uint64> &shape = this->_impl->shape();
+    const std::vector<cytnx_uint64> &invmapper = this->_impl->invmapper();
+    const cytnx_uint64 rank = shape.size();
+    std::vector<cytnx_uint64> out(rank);
+    cytnx_uint64 step = 1;
+    for (cytnx_int64 i = static_cast<cytnx_int64>(rank) - 1; i >= 0; i--) {
+      out[invmapper[i]] = step;
+      step *= shape[invmapper[i]];
+    }
+    return out;
+  }
+
   //===========================
   // Tensor am Tproxy
   Tensor operator+(const Tensor &lhs, const Tensor::Tproxy &rhs) {
diff --git a/src/backend/linalg_internal_cpu/Trace_internal.cpp b/src/backend/linalg_internal_cpu/Trace_internal.cpp
index 858b8d3bc..adbebee3e 100644
--- a/src/backend/linalg_internal_cpu/Trace_internal.cpp
+++ b/src/backend/linalg_internal_cpu/Trace_internal.cpp
@@ -2,191 +2,108 @@
 #include "cytnx_error.hpp"
 #include "backend/lapack_wrapper.hpp"
 
-#include "Generator.hpp"
+#include "backend/linalg_internal_cpu/pairwise_sum.hpp"
+#include "backend/linalg_internal_cpu/stride_view.hpp"
 #include "utils/utils.hpp"
 
-#include "UniTensor.hpp"
+#include <algorithm>
+#include <span>
 #include <vector>
 
 namespace cytnx {
   namespace linalg_internal {
+    namespace {
+
+      template <class T>
+      Tensor TraceImpl(const Tensor &Tn, cytnx_uint64 a1, cytnx_uint64 a2) {
+        const cytnx_uint64 ax1 = std::min(a1, a2);
+        const cytnx_uint64 ax2 = std::max(a1, a2);
+        const auto &shape_in = Tn.shape();
+        const cytnx_uint64 Ndiag = shape_in[ax1];
+
+        std::vector<cytnx_int64> out_shape;
+        std::vector<cytnx_uint64> remain_rank_id;
+        for (cytnx_uint64 i = 0; i < shape_in.size(); ++i) {
+          if (i != ax1 && i != ax2) {
+            out_shape.push_back(static_cast<cytnx_int64>(shape_in[i]));
+            remain_rank_id.push_back(i);
+          }
+        }
+        cytnx_uint64 Nelem = 1;
+        for (auto d : out_shape) Nelem *= static_cast<cytnx_uint64>(d);
+        const bool is_2d = out_shape.empty();
+
+        Tensor out = Tensor({is_2d ? cytnx_uint64{1} : Nelem}, Tn.dtype(), Tn.device());
+        out.storage().set_zeros();
+        if (Ndiag == 0 || Nelem == 0) {
+          if (!is_2d) out.reshape_(out_shape);
+          return out;
+        }
+
+        const std::vector<cytnx_uint64> strides = Tn.strides();
+        const cytnx_uint64 diag_stride = strides[ax1] + strides[ax2];
+        const cytnx_uint64 extent = (Ndiag - 1) * diag_stride + 1;
+        const T *data = Tn.storage().data<T>();
+
+        if (is_2d) {
+          out.storage().at<T>(0) =
+            PairwiseSum(std::span<const T>(data, extent) | stride(diag_stride));
+          return out;
+        }
+
+        std::vector<cytnx_uint64> accu(out_shape.size(), 1);
+        for (int i = static_cast<int>(out_shape.size()) - 1; i > 0; --i)
+          accu[i - 1] = accu[i] * static_cast<cytnx_uint64>(out_shape[i]);
+
+        T *out_data = out.storage().data<T>();
+        for (cytnx_uint64 i = 0; i < Nelem; ++i) {
+          cytnx_uint64 tmp = i, base = 0;
+          for (cytnx_uint64 x = 0; x < out_shape.size(); ++x) {
+            base += (tmp / accu[x]) * strides[remain_rank_id[x]];
+            tmp %= accu[x];
+          }
+          out_data[i] = PairwiseSum(std::span<const T>(data + base, extent) | stride(diag_stride));
+        }
+        out.reshape_(out_shape);
+        return out;
+      }
 
-    template <class T>
-    void _trace_2d(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag) {
-      T a = 0;
-      T *rawdata = Tn.storage().data<T>();
-      cytnx_uint64 Ldim = Tn.shape()[1];
-      for (cytnx_uint64 i = 0; i < Ndiag; i++) a += rawdata[i * Ldim + i];
-      out.storage().at<T>(0) = a;
-    }
-
-    template <class T>
-    void _trace_nd(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag,
-                   const cytnx_uint64 &Nelem, const std::vector<cytnx_uint64> &accu,
-                   const std::vector<cytnx_uint64> &remain_rank_id,
-                   const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                   const cytnx_uint64 &ax2) {
-      cytnx::UniTensor I_UT = cytnx::UniTensor::eye(Ndiag, {}, true, Tn.dtype(), Tn.device());
-
-      UniTensor UTn = UniTensor(Tn, false, 2);
-      I_UT.relabel_({UTn._impl->_labels[ax1], UTn._impl->_labels[ax2]});
-
-      out = Contract(I_UT, UTn).get_block_();
-
-      // std::vector<cytnx_uint64> indexer(Tn.shape().size(), 0);
-      // cytnx_uint64 tmp;
-      // for (cytnx_uint64 i = 0; i < Nelem; i++) {
-      // tmp = i;
-      // // calculate indexer
-      // for (int x = 0; x < shape.size(); x++) {
-      // indexer[remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]);
-      // tmp %= accu[x];
-      // }
+    }  // namespace
 
-      // for (cytnx_uint64 d = 0; d < Ndiag; d++) {
-      // indexer[ax1] = indexer[ax2] = d;
-      // out.storage().at<T>(i) += Tn.at<T>(indexer);
-      // }
-      // }
+    Tensor Trace_internal_cd(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_complex128>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_cd(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                           const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                           const std::vector<cytnx_uint64> &accu,
-                           const std::vector<cytnx_uint64> &remain_rank_id,
-                           const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                           const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_complex128>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_complex128>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_cf(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_complex64>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_cf(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                           const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                           const std::vector<cytnx_uint64> &accu,
-                           const std::vector<cytnx_uint64> &remain_rank_id,
-                           const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                           const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_complex64>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_complex64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_d(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_double>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_d(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_double>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_double>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_f(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_float>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_f(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_float>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_float>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_u64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_uint64>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_u64(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_uint64>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_uint64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_i64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_int64>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_i64(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_int64>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_int64>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_u32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_uint32>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_u32(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_uint32>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_uint32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_i32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_int32>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_i32(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_int32>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_int32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
-    }
-
-    void Trace_internal_u16(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_uint16>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_uint16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_u16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_uint16>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_i16(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_int16>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_int16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_i16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_int16>(Tn, ax1, ax2);
     }
 
-    void Trace_internal_b(const bool &is_2d, Tensor &out, const Tensor &tn,
-                          const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2) {
+    Tensor Trace_internal_b(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
       cytnx_error_msg(true, "[internal][Trace] bool is not available. %s", "\n");
+      return Tensor();
     }
 
   }  // namespace linalg_internal
diff --git a/src/backend/linalg_internal_cpu/Trace_internal.hpp b/src/backend/linalg_internal_cpu/Trace_internal.hpp
index 68c7720c5..2c033c465 100644
--- a/src/backend/linalg_internal_cpu/Trace_internal.hpp
+++ b/src/backend/linalg_internal_cpu/Trace_internal.hpp
@@ -12,82 +12,17 @@
 namespace cytnx {
   namespace linalg_internal {
 
-    void Trace_internal_cd(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                           const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                           const std::vector<cytnx_uint64> &accu,
-                           const std::vector<cytnx_uint64> &remain_rank_id,
-                           const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                           const cytnx_uint64 &ax2);
-
-    void Trace_internal_cf(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                           const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                           const std::vector<cytnx_uint64> &accu,
-                           const std::vector<cytnx_uint64> &remain_rank_id,
-                           const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                           const cytnx_uint64 &ax2);
-
-    void Trace_internal_d(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2);
-
-    void Trace_internal_f(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2);
-
-    void Trace_internal_u64(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void Trace_internal_i64(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void Trace_internal_u32(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void Trace_internal_i32(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void Trace_internal_u16(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void Trace_internal_i16(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void Trace_internal_b(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2);
+    Tensor Trace_internal_cd(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_cf(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_d(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_f(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_u64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_i64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_u32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_i32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_u16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_i16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor Trace_internal_b(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
 
   }  // namespace linalg_internal
 
diff --git a/src/backend/linalg_internal_gpu/cuTrace_internal.hpp b/src/backend/linalg_internal_gpu/cuTrace_internal.hpp
index 13fc06fd9..cf0ea86db 100644
--- a/src/backend/linalg_internal_gpu/cuTrace_internal.hpp
+++ b/src/backend/linalg_internal_gpu/cuTrace_internal.hpp
@@ -12,82 +12,17 @@
 namespace cytnx {
   namespace linalg_internal {
 
-    void cuTrace_internal_cd(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                             const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                             const std::vector<cytnx_uint64> &accu,
-                             const std::vector<cytnx_uint64> &remain_rank_id,
-                             const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                             const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_cf(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                             const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                             const std::vector<cytnx_uint64> &accu,
-                             const std::vector<cytnx_uint64> &remain_rank_id,
-                             const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                             const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_d(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_f(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_u64(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                              const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                              const std::vector<cytnx_uint64> &accu,
-                              const std::vector<cytnx_uint64> &remain_rank_id,
-                              const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                              const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_i64(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                              const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                              const std::vector<cytnx_uint64> &accu,
-                              const std::vector<cytnx_uint64> &remain_rank_id,
-                              const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                              const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_u32(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                              const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                              const std::vector<cytnx_uint64> &accu,
-                              const std::vector<cytnx_uint64> &remain_rank_id,
-                              const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                              const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_i32(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                              const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                              const std::vector<cytnx_uint64> &accu,
-                              const std::vector<cytnx_uint64> &remain_rank_id,
-                              const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                              const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_u16(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                              const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                              const std::vector<cytnx_uint64> &accu,
-                              const std::vector<cytnx_uint64> &remain_rank_id,
-                              const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                              const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_i16(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                              const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                              const std::vector<cytnx_uint64> &accu,
-                              const std::vector<cytnx_uint64> &remain_rank_id,
-                              const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                              const cytnx_uint64 &ax2);
-
-    void cuTrace_internal_b(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2);
+    Tensor cuTrace_internal_cd(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_cf(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_d(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_f(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_u64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_i64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_u32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_i32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_u16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_i16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
+    Tensor cuTrace_internal_b(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2);
 
   }  // namespace linalg_internal
 
diff --git a/src/backend/linalg_internal_interface.hpp b/src/backend/linalg_internal_interface.hpp
index 5f5b5b696..73a948bd7 100644
--- a/src/backend/linalg_internal_interface.hpp
+++ b/src/backend/linalg_internal_interface.hpp
@@ -178,11 +178,7 @@ namespace cytnx {
     typedef void (*Sumfunc_oii)(boost::intrusive_ptr<Storage_base> &,
                                 const boost::intrusive_ptr<Storage_base> &, const cytnx_uint64 &);
 
-    typedef void (*Tracefunc_oii)(const bool &, Tensor &, const Tensor &, const cytnx_uint64 &,
-                                  const cytnx_uint64 &, const std::vector<cytnx_uint64> &,
-                                  const std::vector<cytnx_uint64> &,
-                                  const std::vector<cytnx_int64> &, const cytnx_uint64 &,
-                                  const cytnx_uint64 &);
+    typedef Tensor (*Tracefunc_oii)(const Tensor &, cytnx_uint64, cytnx_uint64);
 
     typedef void (*Tensordotfunc_oii)(Tensor &out, const Tensor &Lin, const Tensor &Rin,
                                       const std::vector<cytnx_uint64> &idxl,
diff --git a/src/linalg/Trace.cpp b/src/linalg/Trace.cpp
index 5b481c652..4bd0c9e08 100644
--- a/src/linalg/Trace.cpp
+++ b/src/linalg/Trace.cpp
@@ -45,73 +45,17 @@ namespace cytnx {
                       "shape(%d) = %ld and shape(%d) = %ld do not match.%s",
                       axisA, Tn.shape()[axisA], axisB, Tn.shape()[axisB], "\n");
 
-      cytnx_uint64 ax1, ax2;
-      if (axisA < axisB) {
-        ax1 = axisA;
-        ax2 = axisB;
-      } else {
-        ax1 = axisB;
-        ax2 = axisA;
+      if (Tn.device() == Device.cpu) {
+        return linalg_internal::lii.Trace_ii[Tn.dtype()](Tn, axisA, axisB);
       }
-
-      // 1) get redundant rank:
-      vector<cytnx_int64> shape(Tn.shape().begin(), Tn.shape().end());
-      vector<cytnx_uint64> accu;
-      shape.erase(shape.begin() + ax2);
-      shape.erase(shape.begin() + ax1);
-      // 2) get output element size.
-      cytnx_uint64 Nelem = 1;
-      for (int i = 0; i < shape.size(); i++) Nelem *= shape[i];
-
-      Tensor out = Tensor({Nelem}, Tn.dtype(), Tn.device());
-      out.storage().set_zeros();
-
-      if (shape.size() == 0) {
-        // 2d
-        if (Tn.device() == Device.cpu)
-          linalg_internal::lii.Trace_ii[Tn.dtype()](true, out, Tn, Tn.shape()[ax1], 0, {}, {}, {},
-                                                    0,
-                                                    0);  // only the first 4 args will be used.
-        else {
   #ifdef UNI_GPU
-          checkCudaErrors(cudaSetDevice(Tn.device()));
-          linalg_internal::lii.cuTrace_ii[Tn.dtype()](true, out, Tn, Tn.shape()[ax1], 0, {}, {}, {},
-                                                      0,
-                                                      0);  // only the first 4 args will be used.
+      checkCudaErrors(cudaSetDevice(Tn.device()));
+      return linalg_internal::lii.cuTrace_ii[Tn.dtype()](Tn, axisA, axisB);
   #else
-          cytnx_error_msg(true, "[Trace] fatal error,%s",
-                          "try to call the gpu section without CUDA support.\n");
-          return out;
+      cytnx_error_msg(true, "[Trace] fatal error,%s",
+                      "try to call the gpu section without CUDA support.\n");
+      return Tensor();
   #endif
-        }
-      } else {
-        // nd
-        vector<cytnx_uint64> remain_rank_id;
-        vector<cytnx_uint64> accu(shape.size());
-        accu.back() = 1;
-        for (int i = shape.size() - 1; i > 0; i--) accu[i - 1] = accu[i] * shape[i];
-
-        for (cytnx_uint64 i = 0; i < Tn.shape().size(); i++) {
-          if (i != ax1 && i != ax2) remain_rank_id.push_back(i);
-        }
-        if (Tn.device() == Device.cpu)
-          linalg_internal::lii.Trace_ii[Tn.dtype()](false, out, Tn, Tn.shape()[ax1], Nelem, accu,
-                                                    remain_rank_id, shape, ax1, ax2);
-        else {
-  #ifdef UNI_GPU
-          checkCudaErrors(cudaSetDevice(Tn.device()));
-          linalg_internal::lii.cuTrace_ii[Tn.dtype()](false, out, Tn, Tn.shape()[ax1], Nelem, accu,
-                                                      remain_rank_id, shape, ax1, ax2);
-  #else
-          cytnx_error_msg(true, "[Trace] fatal error,%s",
-                          "try to call the gpu section without CUDA support.\n");
-          return out;
-  #endif
-        }
-        out.reshape_(shape);
-      }
-
-      return out;
     }
 
   }  // namespace linalg
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7509bfaa7..da7b356cb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,6 +22,7 @@ add_executable(
   DenseUniTensor_test.cpp
   Accessor_test.cpp
   Tensor_test.cpp
+  Tensor_strides_test.cpp
   Storage_test.cpp
   search_tree_test.cpp
   utils_test/vec_concatenate.cpp
@@ -42,6 +43,7 @@ add_executable(
   linalg_test/Svd_truncate_test.cpp
   linalg_test/Gesvd_truncate_test.cpp
   linalg_test/Rsvd_truncate_test.cpp
+  linalg_test/Trace_test.cpp
   linalg_test/linalg_test.cpp
   linalg_test/stride_view_test.cpp
   linalg_test/sum_test.cpp
@@ -80,7 +82,10 @@ add_link_options(-fsanitize=address)
 #target_link_libraries(test_main PUBLIC "-lgcov --coverage")
 include(GoogleTest)
 gtest_discover_tests(test_main
-                     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+                     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+                     # The test binary takes longer to enumerate than the
+                     # default 5s under MKL + ASAN on slower runners.
+                     DISCOVERY_TIMEOUT 120)
 
 file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/testNet.net"
       DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/tests/Tensor_strides_test.cpp b/tests/Tensor_strides_test.cpp
new file mode 100644
index 000000000..3fe287d32
--- /dev/null
+++ b/tests/Tensor_strides_test.cpp
@@ -0,0 +1,78 @@
+#include <gtest/gtest.h>
+
+#include <functional>
+#include <vector>
+
+#include "Tensor.hpp"
+#include "Type.hpp"
+#include "random.hpp"
+
+namespace {
+
+  // For a tensor T (possibly permuted / non-contiguous), the storage offset
+  // implied by strides() at multi-index idx must equal the offset that at(idx)
+  // actually reads -- otherwise the trace's stride-aware diagonal sum would
+  // disagree with Tensor::at across permutations.
+  template <typename T>
+  void ExpectStridesMatchAt(const cytnx::Tensor& tensor) {
+    const auto shape = tensor.shape();
+    const auto strides = tensor.strides();
+    ASSERT_EQ(strides.size(), shape.size());
+    std::vector<cytnx::cytnx_uint64> idx(shape.size(), 0);
+    std::function<void(std::size_t)> recurse = [&](std::size_t axis) {
+      if (axis == shape.size()) {
+        cytnx::cytnx_uint64 offset = 0;
+        for (std::size_t a = 0; a < shape.size(); ++a) offset += idx[a] * strides[a];
+        EXPECT_EQ(tensor.at<T>(idx), tensor.storage().at<T>(offset));
+      } else {
+        for (idx[axis] = 0; idx[axis] < shape[axis]; ++idx[axis]) recurse(axis + 1);
+      }
+    };
+    recurse(0);
+  }
+
+  TEST(TensorStridesTest, ContiguousIsRowMajor) {
+    auto tensor =
+      cytnx::random::random_tensor({4, 5, 3}, -1.0, 1.0, cytnx::Device.cpu, 0, cytnx::Type.Double);
+    ASSERT_TRUE(tensor.is_contiguous());
+    const auto s = tensor.strides();
+    // Row-major: strides = (5*3, 3, 1).
+    ASSERT_EQ(s.size(), 3u);
+    EXPECT_EQ(s[0], 15u);
+    EXPECT_EQ(s[1], 3u);
+    EXPECT_EQ(s[2], 1u);
+    ExpectStridesMatchAt<cytnx::cytnx_double>(tensor);
+  }
+
+  TEST(TensorStridesTest, PermutedMatchesAtForRanks2to5) {
+    using cytnx::cytnx_uint64;
+    struct Case {
+      std::vector<cytnx_uint64> shape;
+      std::vector<cytnx_uint64> perm;
+    };
+    const std::vector<Case> cases = {
+      {{3, 4}, {1, 0}},
+      {{3, 4, 5}, {2, 0, 1}},
+      {{2, 3, 4, 2}, {3, 1, 0, 2}},
+      {{2, 3, 2, 3, 2}, {4, 2, 0, 3, 1}},
+    };
+    for (const auto& c : cases) {
+      auto t =
+        cytnx::random::random_tensor(c.shape, -1.0, 1.0, cytnx::Device.cpu, 0, cytnx::Type.Double);
+      auto p = t.permute(c.perm);
+      EXPECT_FALSE(p.is_contiguous()) << "expected the permutation to be non-contiguous";
+      ExpectStridesMatchAt<cytnx::cytnx_double>(p);
+    }
+  }
+
+  TEST(TensorStridesTest, ComplexAndIntegerDtypes) {
+    auto td = cytnx::random::random_tensor({3, 4, 2}, -1.0, 1.0, cytnx::Device.cpu, 0,
+                                           cytnx::Type.ComplexDouble);
+    ExpectStridesMatchAt<cytnx::cytnx_complex128>(td.permute({2, 0, 1}));
+
+    auto ti =
+      cytnx::random::random_tensor({3, 4, 2}, 0.0, 10.0, cytnx::Device.cpu, 0, cytnx::Type.Int32);
+    ExpectStridesMatchAt<cytnx::cytnx_int32>(ti.permute({1, 2, 0}));
+  }
+
+}  // namespace
diff --git a/tests/linalg_test/Trace_test.cpp b/tests/linalg_test/Trace_test.cpp
new file mode 100644
index 000000000..1f3e6b58d
--- /dev/null
+++ b/tests/linalg_test/Trace_test.cpp
@@ -0,0 +1,104 @@
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "Tensor.hpp"
+#include "Type.hpp"
+#include "linalg.hpp"
+#include "random.hpp"
+#include "test_tools.h"
+
+namespace {
+
+  using cytnx::cytnx_int32;
+  using cytnx::cytnx_int64;
+  using cytnx::cytnx_uint64;
+  using cytnx::Device;
+  using cytnx::Tensor;
+  using cytnx::Type;
+
+  // The strided in-place trace must agree with the trace of a fully materialized
+  // contiguous clone of the same tensor (which is the layout the old code always
+  // assumed). Pairing both via the same public API isolates the layout choice.
+  static Tensor ContiguousReferenceTrace(const Tensor& t, cytnx_uint64 a, cytnx_uint64 b) {
+    return cytnx::linalg::Trace(t.contiguous(), a, b);
+  }
+
+  TEST(LinalgTraceTest, PermutedRank3MatchesContiguous) {
+    auto t = cytnx::random::random_tensor({4, 3, 4}, -1.0, 1.0, Device.cpu, 0, Type.Double);
+    auto p = t.permute({2, 1, 0});  // shape {4, 3, 4} but non-contiguous
+    ASSERT_FALSE(p.is_contiguous());
+    auto strided = cytnx::linalg::Trace(p, 0, 2);
+    auto reference = ContiguousReferenceTrace(p, 0, 2);
+    EXPECT_TRUE(cytnx::TestTools::AreNearlyEqTensor(strided, reference, 1e-12));
+  }
+
+  TEST(LinalgTraceTest, PermutedTracesAcrossRanksAndDtypes) {
+    struct Case {
+      std::vector<cytnx_uint64> shape;
+      std::vector<cytnx_uint64> perm;
+      cytnx_uint64 ax1, ax2;
+    };
+    // Each case picks two equal-sized logical axes after a non-trivial permute.
+    const std::vector<Case> cases = {
+      // rank 3
+      {{4, 3, 4}, {1, 0, 2}, 1, 2},
+      // rank 4 with non-adjacent traced axes (permuted shape = {4, 4, 5, 3})
+      {{4, 3, 4, 5}, {2, 0, 3, 1}, 0, 1},
+      // rank 5
+      {{3, 4, 3, 2, 3}, {4, 1, 0, 3, 2}, 0, 2},
+    };
+    for (unsigned int dtype : {Type.Double, Type.ComplexDouble, Type.Int32}) {
+      for (const auto& c : cases) {
+        auto t = cytnx::random::random_tensor(c.shape, -2.0, 2.0, Device.cpu, 0, dtype);
+        auto p = t.permute(c.perm);
+        EXPECT_FALSE(p.is_contiguous());
+        auto strided = cytnx::linalg::Trace(p, c.ax1, c.ax2);
+        auto reference = ContiguousReferenceTrace(p, c.ax1, c.ax2);
+        EXPECT_TRUE(cytnx::TestTools::AreNearlyEqTensor(strided, reference, 1e-10))
+          << "dtype=" << dtype << " rank=" << c.shape.size();
+      }
+    }
+  }
+
+  TEST(LinalgTraceTest, Rank2Path) {
+    // Exercises _trace_2d / cuTrace_2d_kernel (the 2d branch is only taken when
+    // every remaining axis has been traced away).
+    auto t = cytnx::random::random_tensor({6, 6}, -1.0, 1.0, Device.cpu, 0, Type.Double);
+    auto p = t.permute({1, 0});
+    ASSERT_FALSE(p.is_contiguous());
+    auto out_p = cytnx::linalg::Trace(p, 0, 1);
+    auto out_c = cytnx::linalg::Trace(t, 0, 1);  // tr(A) == tr(A^T)
+    ASSERT_EQ(out_p.shape().size(), 1u);
+    ASSERT_EQ(out_p.shape()[0], 1u);
+    EXPECT_TRUE(cytnx::TestTools::AreNearlyEqTensor(out_p, out_c, 1e-12));
+  }
+
+  TEST(LinalgTraceTest, OutputRankIsInputMinusTwo) {
+    // tr(rank-N) -> rank-(N-2); tr(rank-2) -> a 1-element rank-1 tensor.
+    auto r4 = cytnx::random::random_tensor({2, 3, 2, 4}, -1.0, 1.0, Device.cpu, 0, Type.Double);
+    auto out4 = cytnx::linalg::Trace(r4, 0, 2);
+    EXPECT_EQ(out4.shape().size(), 2u);
+    EXPECT_EQ(out4.shape()[0], 3u);
+    EXPECT_EQ(out4.shape()[1], 4u);
+
+    auto r3 = cytnx::random::random_tensor({3, 4, 3}, -1.0, 1.0, Device.cpu, 0, Type.Double);
+    auto out3 = cytnx::linalg::Trace(r3, 0, 2);
+    EXPECT_EQ(out3.shape().size(), 1u);
+    EXPECT_EQ(out3.shape()[0], 4u);
+
+    auto r2 = cytnx::random::random_tensor({5, 5}, -1.0, 1.0, Device.cpu, 0, Type.Double);
+    auto out2 = cytnx::linalg::Trace(r2, 0, 1);
+    EXPECT_EQ(out2.shape().size(), 1u);
+    EXPECT_EQ(out2.shape()[0], 1u);
+  }
+
+  TEST(LinalgTraceTest, SwappedAxisOrderMatches) {
+    // Trace(T, a, b) == Trace(T, b, a) (the function normalizes the order).
+    auto t = cytnx::random::random_tensor({3, 4, 3, 2}, -1.0, 1.0, Device.cpu, 0, Type.Double);
+    auto ab = cytnx::linalg::Trace(t, 0, 2);
+    auto ba = cytnx::linalg::Trace(t, 2, 0);
+    EXPECT_TRUE(cytnx::TestTools::AreNearlyEqTensor(ab, ba, 1e-12));
+  }
+
+}  // namespace

From 3ceb1db19cef29172fbd95d88f332a2ca72cb18b Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Sun, 31 May 2026 07:51:06 +0000
Subject: [PATCH 2/3] perf(trace): walk the ND trace output on an odometer
 instead of div/mod

The ND CPU trace decoded each flat output index into an input base offset by
dividing and taking the modulo against precomputed row-major accumulators on
every element. Division and modulo dominate that hot loop, and the accumulator
vector only existed to support them.

Carry the base offset on an odometer instead: precompute the input stride for
each surviving axis once, then for each output element bump the last axis index
(carrying into earlier axes on wrap) and adjust the base offset by the affected
axes' strides. Advancing is amortized O(1) per element with no division, modulo,
or per-element flat-index decode, and the row-major accumulator vector is gone.

Also drop the now-unused backend/lapack_wrapper.hpp and utils/utils.hpp
includes, left over from the previous Contract-based implementation.

Co-authored-by: Claude <noreply@anthropic.com>
---
 .../linalg_internal_cpu/Trace_internal.cpp    | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/backend/linalg_internal_cpu/Trace_internal.cpp b/src/backend/linalg_internal_cpu/Trace_internal.cpp
index adbebee3e..219df2be7 100644
--- a/src/backend/linalg_internal_cpu/Trace_internal.cpp
+++ b/src/backend/linalg_internal_cpu/Trace_internal.cpp
@@ -1,10 +1,8 @@
 #include "Trace_internal.hpp"
 #include "cytnx_error.hpp"
-#include "backend/lapack_wrapper.hpp"
 
 #include "backend/linalg_internal_cpu/pairwise_sum.hpp"
 #include "backend/linalg_internal_cpu/stride_view.hpp"
-#include "utils/utils.hpp"
 
 #include <algorithm>
 #include <span>
@@ -51,18 +49,30 @@ namespace cytnx {
           return out;
         }
 
-        std::vector<cytnx_uint64> accu(out_shape.size(), 1);
-        for (int i = static_cast<int>(out_shape.size()) - 1; i > 0; --i)
-          accu[i - 1] = accu[i] * static_cast<cytnx_uint64>(out_shape[i]);
+        // Input stride for each surviving (output) axis, so the hot loop indexes a
+        // flat array instead of going through remain_rank_id on every step.
+        std::vector<cytnx_uint64> out_strides(out_shape.size());
+        for (cytnx_uint64 x = 0; x < out_shape.size(); ++x)
+          out_strides[x] = strides[remain_rank_id[x]];
 
+        // Walk the output elements in row-major order, carrying the input base
+        // offset on an odometer: each step bumps the last axis index (carrying into
+        // earlier axes on wrap) and adjusts base by the affected axes' strides. This
+        // avoids the per-element division and modulo of decoding the flat index, and
+        // needs no precomputed row-major accumulators.
         T *out_data = out.storage().data<T>();
+        std::vector<cytnx_uint64> index(out_shape.size(), 0);
+        cytnx_uint64 base = 0;
         for (cytnx_uint64 i = 0; i < Nelem; ++i) {
-          cytnx_uint64 tmp = i, base = 0;
-          for (cytnx_uint64 x = 0; x < out_shape.size(); ++x) {
-            base += (tmp / accu[x]) * strides[remain_rank_id[x]];
-            tmp %= accu[x];
-          }
           out_data[i] = PairwiseSum(std::span<const T>(data + base, extent) | stride(diag_stride));
+          for (cytnx_uint64 x = out_shape.size(); x-- > 0;) {
+            if (++index[x] < static_cast<cytnx_uint64>(out_shape[x])) {
+              base += out_strides[x];
+              break;
+            }
+            index[x] = 0;
+            base -= (static_cast<cytnx_uint64>(out_shape[x]) - 1) * out_strides[x];
+          }
         }
         out.reshape_(out_shape);
         return out;

From 954e3668fe193d1c8fa25d7e5abe524ee36e24dd Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Tue, 2 Jun 2026 02:28:38 +0000
Subject: [PATCH 3/3] refactor(trace): compose the CPU trace output via
 Tensor::from_storage

TraceImpl<T> allocated the result as a Tensor and then reached through
out.storage() to fill it, mixing the container and storage layers. Fill a flat
result Storage directly and compose the Tensor with Tensor::from_storage once
the diagonal sums are written, so the storage is populated at the storage layer
and only promoted to a Tensor (and reshaped to the reduced rank) at the end.

Behavior is unchanged: the 2D trace yields a single-element result, the ND trace
one element per remaining-rank multi-index, and the Ndiag == 0 / Nelem == 0
guard still returns a zeroed result of the reduced shape.

Co-authored-by: Claude <noreply@anthropic.com>
---
 .../linalg_internal_cpu/Trace_internal.cpp     | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/backend/linalg_internal_cpu/Trace_internal.cpp b/src/backend/linalg_internal_cpu/Trace_internal.cpp
index 219df2be7..409c454a7 100644
--- a/src/backend/linalg_internal_cpu/Trace_internal.cpp
+++ b/src/backend/linalg_internal_cpu/Trace_internal.cpp
@@ -1,4 +1,6 @@
 #include "Trace_internal.hpp"
+#include "Tensor.hpp"
+#include "backend/Storage.hpp"
 #include "cytnx_error.hpp"
 
 #include "backend/linalg_internal_cpu/pairwise_sum.hpp"
@@ -31,9 +33,13 @@ namespace cytnx {
         for (auto d : out_shape) Nelem *= static_cast<cytnx_uint64>(d);
         const bool is_2d = out_shape.empty();
 
-        Tensor out = Tensor({is_2d ? cytnx_uint64{1} : Nelem}, Tn.dtype(), Tn.device());
-        out.storage().set_zeros();
+        // Fill a flat result Storage, then compose the output Tensor from it; the
+        // 2D trace produces a single element, the ND trace one element per
+        // remaining-rank multi-index.
+        Storage out_storage(is_2d ? cytnx_uint64{1} : Nelem, Tn.dtype(), Tn.device());
         if (Ndiag == 0 || Nelem == 0) {
+          out_storage.set_zeros();
+          Tensor out = Tensor::from_storage(out_storage);
           if (!is_2d) out.reshape_(out_shape);
           return out;
         }
@@ -42,11 +48,11 @@ namespace cytnx {
         const cytnx_uint64 diag_stride = strides[ax1] + strides[ax2];
         const cytnx_uint64 extent = (Ndiag - 1) * diag_stride + 1;
         const T *data = Tn.storage().data<T>();
+        T *out_data = out_storage.data<T>();
 
         if (is_2d) {
-          out.storage().at<T>(0) =
-            PairwiseSum(std::span<const T>(data, extent) | stride(diag_stride));
-          return out;
+          out_data[0] = PairwiseSum(std::span<const T>(data, extent) | stride(diag_stride));
+          return Tensor::from_storage(out_storage);
         }
 
         // Input stride for each surviving (output) axis, so the hot loop indexes a
@@ -60,7 +66,6 @@ namespace cytnx {
         // earlier axes on wrap) and adjusts base by the affected axes' strides. This
         // avoids the per-element division and modulo of decoding the flat index, and
         // needs no precomputed row-major accumulators.
-        T *out_data = out.storage().data<T>();
         std::vector<cytnx_uint64> index(out_shape.size(), 0);
         cytnx_uint64 base = 0;
         for (cytnx_uint64 i = 0; i < Nelem; ++i) {
@@ -74,6 +79,7 @@ namespace cytnx {
             base -= (static_cast<cytnx_uint64>(out_shape[x]) - 1) * out_strides[x];
           }
         }
+        Tensor out = Tensor::from_storage(out_storage);
         out.reshape_(out_shape);
         return out;
       }