Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ add_executable(
linalg/Directsum_bm.cpp
linalg/Svd_bm.cpp
linalg/Svd_truncate_bm.cpp
linalg/Trace_bm.cpp
linalg/Vectordot_bm.cpp
linalg/Lanczos_bm.cpp
linalg/linalg_basic_bm.cpp
Expand Down
301 changes: 301 additions & 0 deletions benchmarks/linalg/Trace_bm.cpp

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions include/Tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,17 @@ namespace cytnx {
*/
const std::vector<cytnx_uint64> &shape() const { return this->_impl->shape(); }

/**
@brief the storage strides of the Tensor
@return [std::vector<cytnx_uint64>] for each logical axis, the distance in the
underlying storage between consecutive elements along that axis
@details cytnx tensors store a dense permutation of their logical axes, so the
stride of every axis is well defined (this is the layout Tensor::at indexes
through). For a contiguous tensor these are the row-major strides; for a
permuted (non-contiguous) view they reflect the permuted memory order.
*/
std::vector<cytnx_uint64> strides() const;

/**
@brief the rank of the Tensor
@return [cytnx_uint64] the rank of the Tensor
Expand Down
16 changes: 16 additions & 0 deletions src/Tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,22 @@ namespace cytnx {
return is(this->_impl->storage(), rhs.storage());
}

std::vector<cytnx_uint64> Tensor::strides() const {
// The storage is laid out contiguously in memory order; _invmapper[i] gives
// the logical axis sitting at memory position i (innermost last). The stride
// of a logical axis is the product of the memory-order extents inside it.
const std::vector<cytnx_uint64> &shape = this->_impl->shape();
const std::vector<cytnx_uint64> &invmapper = this->_impl->invmapper();
const cytnx_uint64 rank = shape.size();
std::vector<cytnx_uint64> out(rank);
cytnx_uint64 step = 1;
for (cytnx_int64 i = static_cast<cytnx_int64>(rank) - 1; i >= 0; i--) {
out[invmapper[i]] = step;
step *= shape[invmapper[i]];
}
return out;
}

//===========================
// Tensor am Tproxy
Tensor operator+(const Tensor &lhs, const Tensor::Tproxy &rhs) {
Expand Down
267 changes: 100 additions & 167 deletions src/backend/linalg_internal_cpu/Trace_internal.cpp
Original file line number Diff line number Diff line change
@@ -1,192 +1,125 @@
#include "Trace_internal.hpp"
#include "Tensor.hpp"
#include "backend/Storage.hpp"
#include "cytnx_error.hpp"
#include "backend/lapack_wrapper.hpp"

#include "Generator.hpp"
#include "utils/utils.hpp"
#include "backend/linalg_internal_cpu/pairwise_sum.hpp"
#include "backend/linalg_internal_cpu/stride_view.hpp"

#include "UniTensor.hpp"
#include <algorithm>
#include <span>
#include <vector>

namespace cytnx {
namespace linalg_internal {
namespace {

template <class T>
Tensor TraceImpl(const Tensor &Tn, cytnx_uint64 a1, cytnx_uint64 a2) {
const cytnx_uint64 ax1 = std::min(a1, a2);
const cytnx_uint64 ax2 = std::max(a1, a2);
Comment on lines +19 to +20
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two lines are inherited from the old code. They are not needed now.

const auto &shape_in = Tn.shape();
const cytnx_uint64 Ndiag = shape_in[ax1];

std::vector<cytnx_int64> out_shape;
std::vector<cytnx_uint64> remain_rank_id;
for (cytnx_uint64 i = 0; i < shape_in.size(); ++i) {
if (i != ax1 && i != ax2) {
out_shape.push_back(static_cast<cytnx_int64>(shape_in[i]));
remain_rank_id.push_back(i);
}
}
cytnx_uint64 Nelem = 1;
for (auto d : out_shape) Nelem *= static_cast<cytnx_uint64>(d);
const bool is_2d = out_shape.empty();

// Fill a flat result Storage, then compose the output Tensor from it; the
// 2D trace produces a single element, the ND trace one element per
// remaining-rank multi-index.
Storage out_storage(is_2d ? cytnx_uint64{1} : Nelem, Tn.dtype(), Tn.device());
if (Ndiag == 0 || Nelem == 0) {
out_storage.set_zeros();
Tensor out = Tensor::from_storage(out_storage);
if (!is_2d) out.reshape_(out_shape);
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should throw error here instead.

return out;
}

const std::vector<cytnx_uint64> strides = Tn.strides();
const cytnx_uint64 diag_stride = strides[ax1] + strides[ax2];
const cytnx_uint64 extent = (Ndiag - 1) * diag_stride + 1;
const T *data = Tn.storage().data<T>();
T *out_data = out_storage.data<T>();

if (is_2d) {
out_data[0] = PairwiseSum(std::span<const T>(data, extent) | stride(diag_stride));
return Tensor::from_storage(out_storage);
}

// Input stride for each surviving (output) axis, so the hot loop indexes a
// flat array instead of going through remain_rank_id on every step.
std::vector<cytnx_uint64> out_strides(out_shape.size());
for (cytnx_uint64 x = 0; x < out_shape.size(); ++x)
out_strides[x] = strides[remain_rank_id[x]];

// Walk the output elements in row-major order, carrying the input base
// offset on an odometer: each step bumps the last axis index (carrying into
// earlier axes on wrap) and adjusts base by the affected axes' strides. This
// avoids the per-element division and modulo of decoding the flat index, and
// needs no precomputed row-major accumulators.
std::vector<cytnx_uint64> index(out_shape.size(), 0);
cytnx_uint64 base = 0;
for (cytnx_uint64 i = 0; i < Nelem; ++i) {
out_data[i] = PairwiseSum(std::span<const T>(data + base, extent) | stride(diag_stride));
for (cytnx_uint64 x = out_shape.size(); x-- > 0;) {
if (++index[x] < static_cast<cytnx_uint64>(out_shape[x])) {
base += out_strides[x];
break;
}
index[x] = 0;
base -= (static_cast<cytnx_uint64>(out_shape[x]) - 1) * out_strides[x];
}
}
Tensor out = Tensor::from_storage(out_storage);
out.reshape_(out_shape);
return out;
}

template <class T>
void _trace_2d(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag) {
T a = 0;
T *rawdata = Tn.storage().data<T>();
cytnx_uint64 Ldim = Tn.shape()[1];
for (cytnx_uint64 i = 0; i < Ndiag; i++) a += rawdata[i * Ldim + i];
out.storage().at<T>(0) = a;
}

template <class T>
void _trace_nd(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag,
const cytnx_uint64 &Nelem, const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
cytnx::UniTensor I_UT = cytnx::UniTensor::eye(Ndiag, {}, true, Tn.dtype(), Tn.device());

UniTensor UTn = UniTensor(Tn, false, 2);
I_UT.relabel_({UTn._impl->_labels[ax1], UTn._impl->_labels[ax2]});

out = Contract(I_UT, UTn).get_block_();

// std::vector<cytnx_uint64> indexer(Tn.shape().size(), 0);
// cytnx_uint64 tmp;
// for (cytnx_uint64 i = 0; i < Nelem; i++) {
// tmp = i;
// // calculate indexer
// for (int x = 0; x < shape.size(); x++) {
// indexer[remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]);
// tmp %= accu[x];
// }
} // namespace

// for (cytnx_uint64 d = 0; d < Ndiag; d++) {
// indexer[ax1] = indexer[ax2] = d;
// out.storage().at<T>(i) += Tn.at<T>(indexer);
// }
// }
Tensor Trace_internal_cd(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_complex128>(Tn, ax1, ax2);
}

void Trace_internal_cd(const bool &is_2d, Tensor &out, const Tensor &Tn,
const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_complex128>(out, Tn, Ndiag);
} else {
_trace_nd<cytnx_complex128>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_cf(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_complex64>(Tn, ax1, ax2);
}

void Trace_internal_cf(const bool &is_2d, Tensor &out, const Tensor &Tn,
const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_complex64>(out, Tn, Ndiag);
} else {
_trace_nd<cytnx_complex64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_d(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_double>(Tn, ax1, ax2);
}

void Trace_internal_d(const bool &is_2d, Tensor &out, const Tensor &Tn,
const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_double>(out, Tn, Ndiag);
} else {
_trace_nd<cytnx_double>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_f(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_float>(Tn, ax1, ax2);
}

void Trace_internal_f(const bool &is_2d, Tensor &out, const Tensor &Tn,
const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_float>(out, Tn, Ndiag);
} else {
_trace_nd<cytnx_float>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_u64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_uint64>(Tn, ax1, ax2);
}

void Trace_internal_u64(const bool &is_2d, Tensor &out, const Tensor &Tn,
const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_uint64>(out, Tn, Ndiag);
} else {
_trace_nd<cytnx_uint64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_i64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_int64>(Tn, ax1, ax2);
}

void Trace_internal_i64(const bool &is_2d, Tensor &out, const Tensor &tn,
const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_int64>(out, tn, ndiag);
} else {
_trace_nd<cytnx_int64>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_u32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_uint32>(Tn, ax1, ax2);
}

void Trace_internal_u32(const bool &is_2d, Tensor &out, const Tensor &tn,
const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_uint32>(out, tn, ndiag);
} else {
_trace_nd<cytnx_uint32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_i32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_int32>(Tn, ax1, ax2);
}

void Trace_internal_i32(const bool &is_2d, Tensor &out, const Tensor &tn,
const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_int32>(out, tn, ndiag);
} else {
_trace_nd<cytnx_int32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
}
}

void Trace_internal_u16(const bool &is_2d, Tensor &out, const Tensor &tn,
const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_uint16>(out, tn, ndiag);
} else {
_trace_nd<cytnx_uint16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_u16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_uint16>(Tn, ax1, ax2);
}

void Trace_internal_i16(const bool &is_2d, Tensor &out, const Tensor &tn,
const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
if (is_2d) {
_trace_2d<cytnx_int16>(out, tn, ndiag);
} else {
_trace_nd<cytnx_int16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2);
}
Tensor Trace_internal_i16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
return TraceImpl<cytnx_int16>(Tn, ax1, ax2);
}

void Trace_internal_b(const bool &is_2d, Tensor &out, const Tensor &tn,
const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem,
const std::vector<cytnx_uint64> &accu,
const std::vector<cytnx_uint64> &remain_rank_id,
const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1,
const cytnx_uint64 &ax2) {
Tensor Trace_internal_b(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) {
cytnx_error_msg(true, "[internal][Trace] bool is not available. %s", "\n");
return Tensor();
}

} // namespace linalg_internal
Expand Down
Loading