-
Notifications
You must be signed in to change notification settings - Fork 22
refactor(trace): compute the CPU Tensor trace in the container layer #862
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
IvanaGyro
wants to merge
3
commits into
claude/stride-view
Choose a base branch
from
claude/trace-cpu
base: claude/stride-view
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
db40735
refactor(trace): compute the CPU Tensor trace in the container layer
IvanaGyro 3ceb1db
perf(trace): walk the ND trace output on an odometer instead of div/mod
IvanaGyro 954e366
refactor(trace): compose the CPU trace output via Tensor::from_storage
IvanaGyro File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,192 +1,125 @@ | ||
| #include "Trace_internal.hpp" | ||
| #include "Tensor.hpp" | ||
| #include "backend/Storage.hpp" | ||
| #include "cytnx_error.hpp" | ||
| #include "backend/lapack_wrapper.hpp" | ||
|
|
||
| #include "Generator.hpp" | ||
| #include "utils/utils.hpp" | ||
| #include "backend/linalg_internal_cpu/pairwise_sum.hpp" | ||
| #include "backend/linalg_internal_cpu/stride_view.hpp" | ||
|
|
||
| #include "UniTensor.hpp" | ||
| #include <algorithm> | ||
| #include <span> | ||
| #include <vector> | ||
|
|
||
| namespace cytnx { | ||
| namespace linalg_internal { | ||
| namespace { | ||
|
|
||
// Computes the trace of Tn over the axis pair (a1, a2) for element type T and
// returns the result as a new Tensor built from a freshly filled Storage.
//   Tn     : input tensor; its data is read through Tn.storage().data<T>()
//            and addressed with Tn.strides().
//   a1, a2 : the two axes to trace over; their order does not matter because
//            they are sorted into ax1 <= ax2 first.
// When no axes survive, the result is a one-element Tensor as produced by
// Tensor::from_storage; otherwise it is reshaped to the surviving extents.
// NOTE(review): nothing here checks that a1 != a2, that both are in range, or
// that the two traced axes have equal extent -- presumably the caller
// validates this; confirm.
| template <class T> | ||
| Tensor TraceImpl(const Tensor &Tn, cytnx_uint64 a1, cytnx_uint64 a2) { | ||
| const cytnx_uint64 ax1 = std::min(a1, a2); | ||
| const cytnx_uint64 ax2 = std::max(a1, a2); | ||
| const auto &shape_in = Tn.shape(); | ||
// The diagonal length is taken from the lower-numbered traced axis only.
| const cytnx_uint64 Ndiag = shape_in[ax1]; | ||
|
|
||
// Collect the extents and the original axis ids of every axis that is not
// traced; these define the output shape.
| std::vector<cytnx_int64> out_shape; | ||
| std::vector<cytnx_uint64> remain_rank_id; | ||
| for (cytnx_uint64 i = 0; i < shape_in.size(); ++i) { | ||
| if (i != ax1 && i != ax2) { | ||
| out_shape.push_back(static_cast<cytnx_int64>(shape_in[i])); | ||
| remain_rank_id.push_back(i); | ||
| } | ||
| } | ||
// Number of output elements: product of the surviving extents (stays 1 when
// no axis survives).
| cytnx_uint64 Nelem = 1; | ||
| for (auto d : out_shape) Nelem *= static_cast<cytnx_uint64>(d); | ||
| const bool is_2d = out_shape.empty(); | ||
|
|
||
| // Fill a flat result Storage, then compose the output Tensor from it; the | ||
| // 2D trace produces a single element, the ND trace one element per | ||
| // remaining-rank multi-index. | ||
| Storage out_storage(is_2d ? cytnx_uint64{1} : Nelem, Tn.dtype(), Tn.device()); | ||
// Degenerate input (empty diagonal or empty output): skip the summation and
// return the storage after set_zeros().
| if (Ndiag == 0 || Nelem == 0) { | ||
| out_storage.set_zeros(); | ||
| Tensor out = Tensor::from_storage(out_storage); | ||
| if (!is_2d) out.reshape_(out_shape); | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should throw error here instead. |
||
| return out; | ||
| } | ||
|
|
||
| const std::vector<cytnx_uint64> strides = Tn.strides(); | ||
// Advancing both traced indices by one moves the flat offset by the sum of
// their strides. extent is the span length that just reaches the last
// diagonal element: (Ndiag - 1) steps plus one.
// NOTE(review): offsets start at element 0 of the storage -- assumes the
// tensor's data begins there; confirm.
| const cytnx_uint64 diag_stride = strides[ax1] + strides[ax2]; | ||
| const cytnx_uint64 extent = (Ndiag - 1) * diag_stride + 1; | ||
| const T *data = Tn.storage().data<T>(); | ||
| T *out_data = out_storage.data<T>(); | ||
|
|
||
// No surviving axes: a single PairwiseSum over the strided diagonal view.
| if (is_2d) { | ||
| out_data[0] = PairwiseSum(std::span<const T>(data, extent) | stride(diag_stride)); | ||
| return Tensor::from_storage(out_storage); | ||
| } | ||
|
|
||
| // Input stride for each surviving (output) axis, so the hot loop indexes a | ||
| // flat array instead of going through remain_rank_id on every step. | ||
| std::vector<cytnx_uint64> out_strides(out_shape.size()); | ||
| for (cytnx_uint64 x = 0; x < out_shape.size(); ++x) | ||
| out_strides[x] = strides[remain_rank_id[x]]; | ||
|
|
||
| // Walk the output elements in row-major order, carrying the input base | ||
| // offset on an odometer: each step bumps the last axis index (carrying into | ||
| // earlier axes on wrap) and adjusts base by the affected axes' strides. This | ||
| // avoids the per-element division and modulo of decoding the flat index, and | ||
| // needs no precomputed row-major accumulators. | ||
| std::vector<cytnx_uint64> index(out_shape.size(), 0); | ||
| cytnx_uint64 base = 0; | ||
| for (cytnx_uint64 i = 0; i < Nelem; ++i) { | ||
| out_data[i] = PairwiseSum(std::span<const T>(data + base, extent) | stride(diag_stride)); | ||
// Advance the odometer, starting from the last surviving axis.
| for (cytnx_uint64 x = out_shape.size(); x-- > 0;) { | ||
| if (++index[x] < static_cast<cytnx_uint64>(out_shape[x])) { | ||
| base += out_strides[x]; | ||
| break; | ||
| } | ||
// This axis wrapped: rewind it to index 0 (undoing the out_shape[x] - 1 steps
// taken along it) and let the loop carry into the next earlier axis.
| index[x] = 0; | ||
| base -= (static_cast<cytnx_uint64>(out_shape[x]) - 1) * out_strides[x]; | ||
| } | ||
| } | ||
| Tensor out = Tensor::from_storage(out_storage); | ||
| out.reshape_(out_shape); | ||
| return out; | ||
| } | ||
|
|
||
// Sums the elements rawdata[i * Ldim + i] for i in [0, Ndiag) of Tn's storage
// and writes the total into element 0 of out's storage.
// NOTE(review): the index arithmetic uses Tn.shape()[1] as the row pitch, so
// it assumes Tn is a rank-2 tensor stored contiguously in row-major order --
// confirm against the caller.
| template <class T> | ||
| void _trace_2d(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag) { | ||
| T a = 0; | ||
| T *rawdata = Tn.storage().data<T>(); | ||
| cytnx_uint64 Ldim = Tn.shape()[1]; | ||
| for (cytnx_uint64 i = 0; i < Ndiag; i++) a += rawdata[i * Ldim + i]; | ||
| out.storage().at<T>(0) = a; | ||
| } | ||
|
|
||
| template <class T> | ||
| void _trace_nd(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag, | ||
| const cytnx_uint64 &Nelem, const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| cytnx::UniTensor I_UT = cytnx::UniTensor::eye(Ndiag, {}, true, Tn.dtype(), Tn.device()); | ||
|
|
||
| UniTensor UTn = UniTensor(Tn, false, 2); | ||
| I_UT.relabel_({UTn._impl->_labels[ax1], UTn._impl->_labels[ax2]}); | ||
|
|
||
| out = Contract(I_UT, UTn).get_block_(); | ||
|
|
||
| // std::vector<cytnx_uint64> indexer(Tn.shape().size(), 0); | ||
| // cytnx_uint64 tmp; | ||
| // for (cytnx_uint64 i = 0; i < Nelem; i++) { | ||
| // tmp = i; | ||
| // // calculate indexer | ||
| // for (int x = 0; x < shape.size(); x++) { | ||
| // indexer[remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]); | ||
| // tmp %= accu[x]; | ||
| // } | ||
| } // namespace | ||
|
|
||
| // for (cytnx_uint64 d = 0; d < Ndiag; d++) { | ||
| // indexer[ax1] = indexer[ax2] = d; | ||
| // out.storage().at<T>(i) += Tn.at<T>(indexer); | ||
| // } | ||
| // } | ||
// Trace entry point for cytnx_complex128 data: forwards Tn and the axis pair
// to TraceImpl<cytnx_complex128> and returns its result.
| Tensor Trace_internal_cd(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_complex128>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_cd(const bool &is_2d, Tensor &out, const Tensor &Tn, | ||
| const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_complex128>(out, Tn, Ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_complex128>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_complex64 data: forwards Tn and the axis pair
// to TraceImpl<cytnx_complex64> and returns its result.
| Tensor Trace_internal_cf(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_complex64>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_cf(const bool &is_2d, Tensor &out, const Tensor &Tn, | ||
| const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_complex64>(out, Tn, Ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_complex64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_double data: forwards Tn and the axis pair to
// TraceImpl<cytnx_double> and returns its result.
| Tensor Trace_internal_d(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_double>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_d(const bool &is_2d, Tensor &out, const Tensor &Tn, | ||
| const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_double>(out, Tn, Ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_double>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_float data: forwards Tn and the axis pair to
// TraceImpl<cytnx_float> and returns its result.
| Tensor Trace_internal_f(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_float>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_f(const bool &is_2d, Tensor &out, const Tensor &Tn, | ||
| const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_float>(out, Tn, Ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_float>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_uint64 data: forwards Tn and the axis pair to
// TraceImpl<cytnx_uint64> and returns its result.
| Tensor Trace_internal_u64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_uint64>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_u64(const bool &is_2d, Tensor &out, const Tensor &Tn, | ||
| const cytnx_uint64 &Ndiag, const cytnx_uint64 &Nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_uint64>(out, Tn, Ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_uint64>(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_int64 data: forwards Tn and the axis pair to
// TraceImpl<cytnx_int64> and returns its result.
| Tensor Trace_internal_i64(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_int64>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_i64(const bool &is_2d, Tensor &out, const Tensor &tn, | ||
| const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_int64>(out, tn, ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_int64>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_uint32 data: forwards Tn and the axis pair to
// TraceImpl<cytnx_uint32> and returns its result.
| Tensor Trace_internal_u32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_uint32>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_u32(const bool &is_2d, Tensor &out, const Tensor &tn, | ||
| const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_uint32>(out, tn, ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_uint32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_int32 data: forwards Tn and the axis pair to
// TraceImpl<cytnx_int32> and returns its result.
| Tensor Trace_internal_i32(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_int32>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_i32(const bool &is_2d, Tensor &out, const Tensor &tn, | ||
| const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_int32>(out, tn, ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_int32>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
| } | ||
|
|
||
| void Trace_internal_u16(const bool &is_2d, Tensor &out, const Tensor &tn, | ||
| const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_uint16>(out, tn, ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_uint16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_uint16 data: forwards Tn and the axis pair to
// TraceImpl<cytnx_uint16> and returns its result.
| Tensor Trace_internal_u16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_uint16>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_i16(const bool &is_2d, Tensor &out, const Tensor &tn, | ||
| const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
| if (is_2d) { | ||
| _trace_2d<cytnx_int16>(out, tn, ndiag); | ||
| } else { | ||
| _trace_nd<cytnx_int16>(out, tn, ndiag, nelem, accu, remain_rank_id, shape, ax1, ax2); | ||
| } | ||
// Trace entry point for cytnx_int16 data: forwards Tn and the axis pair to
// TraceImpl<cytnx_int16> and returns its result.
| Tensor Trace_internal_i16(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| return TraceImpl<cytnx_int16>(Tn, ax1, ax2); | ||
| } | ||
|
|
||
| void Trace_internal_b(const bool &is_2d, Tensor &out, const Tensor &tn, | ||
| const cytnx_uint64 &ndiag, const cytnx_uint64 &nelem, | ||
| const std::vector<cytnx_uint64> &accu, | ||
| const std::vector<cytnx_uint64> &remain_rank_id, | ||
| const std::vector<cytnx_int64> &shape, const cytnx_uint64 &ax1, | ||
| const cytnx_uint64 &ax2) { | ||
// Trace entry point for bool data. It performs no computation: it invokes
// cytnx_error_msg with a condition of true and the "bool is not available"
// message, ignoring Tn, ax1 and ax2.
// NOTE(review): the trailing `return Tensor();` is presumably unreachable if
// cytnx_error_msg throws or aborts on a true condition -- confirm.
| Tensor Trace_internal_b(const Tensor &Tn, cytnx_uint64 ax1, cytnx_uint64 ax2) { | ||
| cytnx_error_msg(true, "[internal][Trace] bool is not available. %s", "\n"); | ||
| return Tensor(); | ||
| } | ||
|
|
||
| } // namespace linalg_internal | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These two lines are inherited from the old code. They are not needed now.