Cytnx-dev · IvanaGyro · May 30, 2026 · May 31, 2026 · Jun 2, 2026 · IvanaGyro
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(
   linalg/Directsum_bm.cpp
   linalg/Svd_bm.cpp
   linalg/Svd_truncate_bm.cpp
+  linalg/Trace_bm.cpp
   linalg/Vectordot_bm.cpp
   linalg/Lanczos_bm.cpp
   linalg/linalg_basic_bm.cpp

diff --git a/benchmarks/linalg/Trace_bm.cpp b/benchmarks/linalg/Trace_bm.cpp
diff --git a/include/Tensor.hpp b/include/Tensor.hpp
@@ -596,6 +596,17 @@ namespace cytnx {
     */
     const std::vector<cytnx_uint64> &shape() const { return this->_impl->shape(); }
 
+    /**
+    @brief the storage strides of the Tensor
+    @return [std::vector<cytnx_uint64>] for each logical axis, the distance in the
+    underlying storage between consecutive elements along that axis
+    @details cytnx tensors store a dense permutation of their logical axes, so the
+    stride of every axis is well defined (this is the layout Tensor::at indexes
+    through). For a contiguous tensor these are the row-major strides; for a
+    permuted (non-contiguous) view they reflect the permuted memory order.
+    */
+    std::vector<cytnx_uint64> strides() const;
+
     /**
         @brief the rank of the Tensor
         @return [cytnx_uint64] the rank of the Tensor

diff --git a/src/Tensor.cpp b/src/Tensor.cpp
@@ -907,6 +907,22 @@ namespace cytnx {
     return is(this->_impl->storage(), rhs.storage());
   }
 
+  std::vector<cytnx_uint64> Tensor::strides() const {
+    // The storage is laid out contiguously in memory order; _invmapper[i] gives
+    // the logical axis sitting at memory position i (innermost last). The stride
+    // of a logical axis is the product of the memory-order extents inside it.
+    const std::vector<cytnx_uint64> &shape = this->_impl->shape();
+    const std::vector<cytnx_uint64> &invmapper = this->_impl->invmapper();
+    const cytnx_uint64 rank = shape.size();
+    std::vector<cytnx_uint64> out(rank);
+    cytnx_uint64 step = 1;
+    for (cytnx_int64 i = static_cast<cytnx_int64>(rank) - 1; i >= 0; i--) {
+      out[invmapper[i]] = step;
+      step *= shape[invmapper[i]];
+    }
+    return out;
+  }
+
   //===========================
   // Tensor am Tproxy
   Tensor operator+(const Tensor &lhs, const Tensor::Tproxy &rhs) {

diff --git a/src/backend/linalg_internal_cpu/Trace_internal.cpp b/src/backend/linalg_internal_cpu/Trace_internal.cpp
@@ -1,192 +1,125 @@
 #include "Trace_internal.hpp"
+#include "Tensor.hpp"
+#include "backend/Storage.hpp"
 #include "cytnx_error.hpp"
-#include "backend/lapack_wrapper.hpp"
 
-#include "Generator.hpp"
-#include "utils/utils.hpp"
+#include "backend/linalg_internal_cpu/pairwise_sum.hpp"
+#include "backend/linalg_internal_cpu/stride_view.hpp"
 
-#include "UniTensor.hpp"
+#include <algorithm>
+#include <span>
 #include <vector>
 
 namespace cytnx {
   namespace linalg_internal {
+    namespace {
+
+      template <class T>
+      Tensor TraceImpl(const Tensor &Tn, cytnx_uint64 a1, cytnx_uint64 a2) {
+        const cytnx_uint64 ax1 = std::min(a1, a2);
+        const cytnx_uint64 ax2 = std::max(a1, a2);
+        const auto &shape_in = Tn.shape();
+        const cytnx_uint64 Ndiag = shape_in[ax1];
+
+        std::vector<cytnx_int64> out_shape;
+        std::vector<cytnx_uint64> remain_rank_id;
+        for (cytnx_uint64 i = 0; i < shape_in.size(); ++i) {
+          if (i != ax1 && i != ax2) {
+            out_shape.push_back(static_cast<cytnx_int64>(shape_in[i]));
+            remain_rank_id.push_back(i);
+          }
+        }
+        cytnx_uint64 Nelem = 1;
+        for (auto d : out_shape) Nelem *= static_cast<cytnx_uint64>(d);
+        const bool is_2d = out_shape.empty();
+
+        // Fill a flat result Storage, then compose the output Tensor from it; the
+        // 2D trace produces a single element, the ND trace one element per
+        // remaining-rank multi-index.
+        Storage out_storage(is_2d ? cytnx_uint64{1} : Nelem, Tn.dtype(), Tn.device());
+        if (Ndiag == 0 || Nelem == 0) {
+          out_storage.set_zeros();
+          Tensor out = Tensor::from_storage(out_storage);
+          if (!is_2d) out.reshape_(out_shape);
+          return out;
+        }
+
+        const std::vector<cytnx_uint64> strides = Tn.strides();
+        const cytnx_uint64 diag_stride = strides[ax1] + strides[ax2];
+        const cytnx_uint64 extent = (Ndiag - 1) * diag_stride + 1;
+        const T *data = Tn.storage().data<T>();
+        T *out_data = out_storage.data<T>();
+
+        if (is_2d) {
+          out_data[0] = PairwiseSum(std::span<const T>(data, extent) | stride(diag_stride));
+          return Tensor::from_storage(out_storage);
+        }
+
+        // Input stride for each surviving (output) axis, so the hot loop indexes a
+        // flat array instead of going through remain_rank_id on every step.
+        std::vector<cytnx_uint64> out_strides(out_shape.size());
+        for (cytnx_uint64 x = 0; x < out_shape.size(); ++x)
+          out_strides[x] = strides[remain_rank_id[x]];
+
+        // Walk the output elements in row-major order, carrying the input base
+        // offset on an odometer: each step bumps the last axis index (carrying into
+        // earlier axes on wrap) and adjusts base by the affected axes' strides. This
+        // avoids the per-element division and modulo of decoding the flat index, and
+        // needs no precomputed row-major accumulators.
+        std::vector<cytnx_uint64> index(out_shape.size(), 0);
+        cytnx_uint64 base = 0;
+        for (cytnx_uint64 i = 0; i < Nelem; ++i) {
+          out_data[i] = PairwiseSum(std::span<const T>(data + base, extent) | stride(diag_stride));
+          for (cytnx_uint64 x = out_shape.size(); x-- > 0;) {
+            if (++index[x] < static_cast<cytnx_uint64>(out_shape[x])) {
+              base += out_strides[x];
+              break;
+            }
+            index[x] = 0;
+            base -= (static_cast<cytnx_uint64>(out_shape[x]) - 1) * out_strides[x];
+          }
+        }
+        Tensor out = Tensor::from_storage(out_storage);
+        out.reshape_(out_shape);
+        return out;
+      }
 
-    template <class T>
-    void _trace_2d(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag) {
-      T a = 0;
-      T *rawdata = Tn.storage().data<T>();
-      cytnx_uint64 Ldim = Tn.shape()[1];
-      for (cytnx_uint64 i = 0; i < Ndiag; i++) a += rawdata[i * Ldim + i];
-      out.storage().at<T>(0) = a;
-    }
-
-    template <class T>
-    void _trace_nd(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag,
-                   const cytnx_uint64 &Nelem, const std::vector<cytnx_uint64> &accu,
-                   const std::vector<cytnx_uint64> &remain_rank_id,
-                   const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                   const cytnx_uint64 &ax2) {
-      cytnx::UniTensor I_UT = cytnx::UniTensor::eye(Ndiag, {}, true, Tn.dtype(), Tn.device());
-
-      UniTensor UTn = UniTensor(Tn, false, 2);
-      I_UT.relabel_({UTn._impl->_labels[ax1], UTn._impl->_labels[ax2]});
-
-      out = Contract(I_UT, UTn).get_block_();
-
-      // std::vector<cytnx_uint64> indexer(Tn.shape().size(), 0);
-      // cytnx_uint64 tmp;
-      // for (cytnx_uint64 i = 0; i < Nelem; i++) {
-      // tmp = i;
-      // // calculate indexer
-      // for (int x = 0; x < shape.size(); x++) {
-      // indexer[remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]);
-      // tmp %= accu[x];
-      // }
+    }  // namespace
 
-      // for (cytnx_uint64 d = 0; d < Ndiag; d++) {
-      // indexer[ax1] = indexer[ax2] = d;
-      // out.storage().at<T>(i) += Tn.at<T>(indexer);
-      // }
-      // }
+    Tensor Trace_internal_cd(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_complex128>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_cd(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                           const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                           const std::vector<cytnx_uint64> &accu,
-                           const std::vector<cytnx_uint64> &remain_rank_id,
-                           const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                           const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_complex128>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_complex128>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_cf(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_complex64>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_cf(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                           const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                           const std::vector<cytnx_uint64> &accu,
-                           const std::vector<cytnx_uint64> &remain_rank_id,
-                           const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                           const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_complex64>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_complex64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_d(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_double>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_d(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_double>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_double>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_f(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_float>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_f(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                          const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_float>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_float>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_u64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_uint64>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_u64(const bool &is_2d, Tensor &out, const Tensor &Tn,
-                            const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_uint64>(out, Tn, Ndiag);
-      } else {
-        _trace_nd<cytnx_uint64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_i64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_int64>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_i64(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_int64>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_int64>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_u32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_uint32>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_u32(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_uint32>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_uint32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_i32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_int32>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_i32(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_int32>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_int32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
-    }
-
-    void Trace_internal_u16(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_uint16>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_uint16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_u16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_uint16>(Tn, ax1, ax2);
     }
-
-    void Trace_internal_i16(const bool &is_2d, Tensor &out, const Tensor &tn,
-                            const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                            const std::vector<cytnx_uint64> &accu,
-                            const std::vector<cytnx_uint64> &remain_rank_id,
-                            const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                            const cytnx_uint64 &ax2) {
-      if (is_2d) {
-        _trace_2d<cytnx_int16>(out, tn, ndiag);
-      } else {
-        _trace_nd<cytnx_int16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
-      }
+    Tensor Trace_internal_i16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
+      return TraceImpl<cytnx_int16>(Tn, ax1, ax2);
     }
 
-    void Trace_internal_b(const bool &is_2d, Tensor &out, const Tensor &tn,
-                          const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
-                          const std::vector<cytnx_uint64> &accu,
-                          const std::vector<cytnx_uint64> &remain_rank_id,
-                          const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
-                          const cytnx_uint64 &ax2) {
+    Tensor Trace_internal_b(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
       cytnx_error_msg(true, "[internal][Trace] bool is not available. %s", "\n");
+      return Tensor();
     }
 
   }  // namespace linalg_internal