From a19b1480276857200dfdd6301e83a9e4545fc1d7 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Fri, 13 Mar 2026 17:28:43 +0800 Subject: [PATCH 01/13] introduce turbo --- cmake/option.cmake | 7 +- src/CMakeLists.txt | 1 + src/core/CMakeLists.txt | 2 +- src/core/metric/CMakeLists.txt | 2 +- src/core/metric/quantized_integer_metric.cc | 19 + src/include/zvec/turbo/turbo.h | 53 +++ src/turbo/CMakeLists.txt | 21 ++ src/turbo/euclidean/avx2_impl.cc | 0 src/turbo/euclidean/avx2_impl.h | 0 src/turbo/euclidean/avx512_impl.cc | 364 ++++++++++++++++++++ src/turbo/euclidean/avx512_impl.h | 32 ++ src/turbo/turbo.cc | 75 ++++ 12 files changed, 572 insertions(+), 4 deletions(-) create mode 100644 src/include/zvec/turbo/turbo.h create mode 100644 src/turbo/CMakeLists.txt create mode 100644 src/turbo/euclidean/avx2_impl.cc create mode 100644 src/turbo/euclidean/avx2_impl.h create mode 100644 src/turbo/euclidean/avx512_impl.cc create mode 100644 src/turbo/euclidean/avx512_impl.h create mode 100644 src/turbo/turbo.cc diff --git a/cmake/option.cmake b/cmake/option.cmake index 3c0424221..b9b7361d5 100644 --- a/cmake/option.cmake +++ b/cmake/option.cmake @@ -226,11 +226,14 @@ if(NOT AUTO_DETECT_ARCH) else() # AUTO DETECT - # Heuristic: detect host architecture and probe appropriate flags + # Heuristic: detect host architecture and probe appropriate flags. + # For x86, per-file march flags are managed via setup_compiler_march_for_x86() + # in each library's CMakeLists.txt, so no global -march is set here to avoid + # conflicting with per-file flags (e.g. -march=icelake, -march=core-avx2). if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") _setup_armv8_march() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") - _setup_x86_march() + # intentionally no global -march for x86 in AUTO_DETECT_ARCH mode else() message(WARNING "Unknown host architecture: ${CMAKE_SYSTEM_PROCESSOR}; no -march= set.") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c516187c7..c516bf4a8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,6 +8,7 @@ git_version(ZVEC_VERSION ${CMAKE_CURRENT_SOURCE_DIR}) cc_directory(ailego) cc_directory(core) cc_directory(db) +cc_directory(turbo) if(BUILD_PYTHON_BINDINGS) cc_directory(binding) endif() diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 7742db594..5f696c085 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -15,7 +15,7 @@ file(GLOB_RECURSE ALL_CORE_SRCS *.cc *.c *.h) cc_library( NAME zvec_core STATIC STRICT PACKED SRCS ${ALL_CORE_SRCS} - LIBS zvec_ailego sparsehash magic_enum + LIBS zvec_ailego zvec_turbo sparsehash magic_enum INCS . ${PROJECT_ROOT_DIR}/src/core VERSION "${GIT_SRCS_VER}" ) \ No newline at end of file diff --git a/src/core/metric/CMakeLists.txt b/src/core/metric/CMakeLists.txt index cbc1049f1..55dfc901e 100644 --- a/src/core/metric/CMakeLists.txt +++ b/src/core/metric/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library( NAME core_metric STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc - LIBS zvec_ailego core_framework + LIBS zvec_ailego zvec_turbo core_framework INCS . ${PROJECT_ROOT_DIR}/src/core VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index 2b4e757a2..4b7d6ed31 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "metric_params.h" #include "quantized_integer_metric_batch.h" #include "quantized_integer_metric_matrix.h" @@ -95,6 +96,12 @@ class QuantizedIntegerMetric : public IndexMetric { switch (origin_metric_type_) { case MetricType::kSquaredEuclidean: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { + auto turbo_ret = turbo::get_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault); + if (turbo_ret) { + return turbo_ret; + } return DistanceMatrixCompute(m, n); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { @@ -146,6 +153,12 @@ class QuantizedIntegerMetric : public IndexMetric { switch (origin_metric_type_) { case MetricType::kSquaredEuclidean: if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { + auto turbo_ret = turbo::get_batch_distance_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault); + if (turbo_ret) { + return turbo_ret; + } return reinterpret_cast( BaseDistanceBatchWithScoreUnquantized::ComputeBatch); @@ -268,6 +281,12 @@ class QuantizedIntegerMetric : public IndexMetric { int8_t, 1, 1>::GetQueryPreprocessFunc(); } else if (origin_metric_type_ == MetricType::kSquaredEuclidean && meta_.data_type() == IndexMeta::DataType::DT_INT8) { + auto turbo_ret = turbo::get_query_preprocess_func( + turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8, + turbo::QuantizeType::kDefault); + if (turbo_ret) { + return turbo_ret; + } return SquaredEuclideanDistanceBatchWithScoreUnquantized< int8_t, 1, 1>::GetQueryPreprocessFunc(); } diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h new file mode 100644 index 000000000..d852611a7 --- /dev/null +++ b/src/include/zvec/turbo/turbo.h @@ -0,0 +1,53 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include + +namespace zvec::turbo { + +using DistanceFunc = + std::function; +using BatchDistanceFunc = std::function; +using QueryPreprocessFunc = + zvec::ailego::DistanceBatch::DistanceBatchQueryPreprocessFunc; + +enum class MetricType { + kSquaredEuclidean, + kUnknown, +}; + +enum class DataType { + kInt8, + kUnknown, +}; + +enum class QuantizeType { + kDefault, +}; + +DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, + QuantizeType quantize_type); + +BatchDistanceFunc get_batch_distance_func(MetricType metric_type, + DataType data_type, + QuantizeType quantize_type); + +QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, + DataType data_type, + QuantizeType quantize_type); + +} // namespace zvec::turbo diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt new file mode 100644 index 000000000..3f795e9ae --- /dev/null +++ b/src/turbo/CMakeLists.txt @@ -0,0 +1,21 @@ +include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) +include(${PROJECT_ROOT_DIR}/cmake/option.cmake) + +if(NOT ANDROID) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") + setup_compiler_march_for_x86(TURBO_MARCH_FLAG_SSE TURBO_MARCH_FLAG_AVX2 TURBO_MARCH_FLAG_AVX512) + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.cc + PROPERTIES + COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" + ) + endif() +endif() + +file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) + +cc_library( + NAME zvec_turbo STATIC STRICT PACKED + SRCS ${ALL_SRCS} + INCS ${PROJECT_ROOT_DIR}/src/include/zvec +) diff --git a/src/turbo/euclidean/avx2_impl.cc b/src/turbo/euclidean/avx2_impl.cc new file mode 100644 index 000000000..e69de29bb diff --git a/src/turbo/euclidean/avx2_impl.h b/src/turbo/euclidean/avx2_impl.h new file mode 100644 index 000000000..e69de29bb diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc new file mode 100644 index 000000000..6f6ecce26 --- /dev/null +++ b/src/turbo/euclidean/avx512_impl.cc @@ -0,0 +1,364 @@ +#include "turbo/euclidean/avx512_impl.h" +#include +#include + +namespace zvec::turbo { + +#if defined(__AVX512VNNI__) +static __attribute__((always_inline)) int32_t HorizontalAdd_INT32_V256( + __m256i v) { + __m256i x1 = _mm256_hadd_epi32(v, v); + __m256i x2 = _mm256_hadd_epi32(x1, x1); + __m128i x3 = _mm256_extractf128_si256(x2, 1); + __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3); + return _mm_cvtsi128_si32(x4); +} + +#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); + +static __attribute__((always_inline)) void ip_int8_distance_avx512_vnni( + const void *a, const void *b, int size, float *distance) { + const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); + const __m128i ONES_INT16_SSE = _mm_set1_epi32(0x00010001); + + const int8_t *lhs = reinterpret_cast(a); + const int8_t *rhs = reinterpret_cast(b); + + const int8_t *last = lhs + size; + const int8_t *last_aligned = lhs + ((size >> 6) << 6); + + float result = 0.0f; + + __m256i ymm_sum_0 = _mm256_setzero_si256(); + __m256i ymm_sum_1 = _mm256_setzero_si256(); + + if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_load_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } else { + for (; lhs != last_aligned; lhs += 64, rhs += 64) { + __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0)); + __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32)); + __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0)); + __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32)); + + ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0); + ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1); + ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0); + ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1); + + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0), + ONES_INT16_AVX), + ymm_sum_0); + ymm_sum_1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1), + ONES_INT16_AVX), + ymm_sum_1); + } + + if (last >= last_aligned + 32) { + __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs); + __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs); + ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs); + ymm_rhs = _mm256_abs_epi8(ymm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs), + ONES_INT16_AVX), + ymm_sum_0); + lhs += 32; + rhs += 32; + } + + if (last >= lhs + 16) { + __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs); + __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs); + xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs); + xmm_rhs = _mm_abs_epi8(xmm_rhs); + ymm_sum_0 = _mm256_add_epi32( + _mm256_set_m128i(_mm_setzero_si128(), + _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs), + ONES_INT16_SSE)), + ymm_sum_0); + lhs += 16; + rhs += 16; + } + } + result = static_cast( + HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1))); + + switch (last - lhs) { + case 15: + FMA_INT8_GENERAL(lhs[14], rhs[14], result) + /* FALLTHRU */ + case 14: + FMA_INT8_GENERAL(lhs[13], rhs[13], result) + /* FALLTHRU */ + case 13: + FMA_INT8_GENERAL(lhs[12], rhs[12], result) + /* FALLTHRU */ + case 12: + FMA_INT8_GENERAL(lhs[11], rhs[11], result) + /* FALLTHRU */ + case 11: + FMA_INT8_GENERAL(lhs[10], rhs[10], result) + /* FALLTHRU */ + case 10: + FMA_INT8_GENERAL(lhs[9], rhs[9], result) + /* FALLTHRU */ + case 9: + FMA_INT8_GENERAL(lhs[8], rhs[8], result) + /* FALLTHRU */ + case 8: + FMA_INT8_GENERAL(lhs[7], rhs[7], result) + /* FALLTHRU */ + case 7: + FMA_INT8_GENERAL(lhs[6], rhs[6], result) + /* FALLTHRU */ + case 6: + FMA_INT8_GENERAL(lhs[5], rhs[5], result) + /* FALLTHRU */ + case 5: + FMA_INT8_GENERAL(lhs[4], rhs[4], result) + /* FALLTHRU */ + case 4: + FMA_INT8_GENERAL(lhs[3], rhs[3], result) + /* FALLTHRU */ + case 3: + FMA_INT8_GENERAL(lhs[2], rhs[2], result) + /* FALLTHRU */ + case 2: + FMA_INT8_GENERAL(lhs[1], rhs[1], result) + /* FALLTHRU */ + case 1: + FMA_INT8_GENERAL(lhs[0], rhs[0], result) + } + *distance = result; +} +#endif + +void l2_int8_distance_avx512_vnni(const void *a, const void *b, int dim, + float *distance) { +#if defined(__AVX512VNNI__) + const int d = dim - 20; + ip_int8_distance_avx512_vnni(a, b, d, distance); + + const float *a_tail = + reinterpret_cast(reinterpret_cast(a) + d); + const float *b_tail = + reinterpret_cast(reinterpret_cast(b) + d); + + float qa = b_tail[0]; + float qb = b_tail[1]; + float qs = b_tail[2]; + float qs2 = b_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + + float ma = a_tail[0]; + float mb = a_tail[1]; + float ms = a_tail[2]; + float ms2 = a_tail[3]; + + *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance + + (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum); +#else + (void)a; + (void)b; + (void)dim; + (void)distance; +#endif +} + +#if defined(__AVX512VNNI__) +template +__attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni_impl( + const void *query, const void *const *vectors, + const std::array &prefetch_ptrs, + int dimensionality, float *distances) { + __m512i accs[batch_size]; + for (int i = 0; i < batch_size; ++i) { + accs[i] = _mm512_setzero_si512(); + } + int dim = 0; + for (; dim + 64 <= dimensionality; dim += 64) { + __m512i q = _mm512_loadu_si512(reinterpret_cast( + reinterpret_cast(query) + dim)); + __m512i data_regs[batch_size]; + for (int i = 0; i < batch_size; ++i) { + data_regs[i] = _mm512_loadu_si512(reinterpret_cast( + reinterpret_cast(vectors[i]) + dim)); + } + if (prefetch_ptrs[0]) { + for (int i = 0; i < batch_size; ++i) { + _mm_prefetch( + reinterpret_cast( + reinterpret_cast(prefetch_ptrs[i]) + dim), + _MM_HINT_T0); + } + } + for (int i = 0; i < batch_size; ++i) { + accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]); + } + } + std::array temp_results{}; + for (int i = 0; i < batch_size; ++i) { + temp_results[i] = _mm512_reduce_add_epi32(accs[i]); + } + for (; dim < dimensionality; ++dim) { + uint q = reinterpret_cast(query)[dim]; + for (int i = 0; i < batch_size; ++i) { + temp_results[i] += + q * + static_cast(reinterpret_cast(vectors[i])[dim]); + } + } + for (int i = 0; i < batch_size; ++i) { + distances[i] = static_cast(temp_results[i]); + } +} + +static __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni( + const void *const *vectors, const void *query, int n, int dim, + float *distances) { + static constexpr int batch_size = 2; + static constexpr int prefetch_step = 2; + int i = 0; + for (; i + batch_size <= n; i += batch_size) { + std::array prefetch_ptrs; + for (int j = 0; j < batch_size; ++j) { + if (i + j + batch_size * prefetch_step <= n) { + prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + ip_int8_batch_distance_avx512_vnni_impl( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } + for (; i < n; i++) { + std::array prefetch_ptrs{nullptr}; + ip_int8_batch_distance_avx512_vnni_impl<1>( + query, &vectors[i], prefetch_ptrs, dim, distances + i); + } +} +#endif + +void l2_int8_batch_distance_avx512_vnni(const void *const *vectors, + const void *query, int n, int dim, + float *distances) { +#if defined(__AVX512VNNI__) + int original_dim = dim - 20; + + ip_int8_batch_distance_avx512_vnni(vectors, query, n, original_dim, + distances); + const float *q_tail = reinterpret_cast( + reinterpret_cast(query) + original_dim); + float qa = q_tail[0]; + float qb = q_tail[1]; + float qs = q_tail[2]; + float qs2 = q_tail[3]; + + const float sum = qa * qs; + const float sum2 = qa * qa * qs2; + for (int i = 0; i < n; ++i) { + const float *m_tail = reinterpret_cast( + reinterpret_cast(vectors[i]) + original_dim); + float ma = m_tail[0]; + float mb = m_tail[1]; + float ms = m_tail[2]; + float ms2 = m_tail[3]; + int int8_sum = reinterpret_cast(m_tail)[4]; + float &result = distances[i]; + result -= 128 * int8_sum; + result = ma * ma * ms2 + sum2 - 2 * ma * qa * result + + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); + } +#else + (void)vectors; + (void)query; + (void)n; + (void)dim; + (void)distances; +#endif +} + +void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim) { +#if defined(__AVX512VNNI__) + int d = dim - 20; + + const int8_t *input = reinterpret_cast(query); + uint8_t *output = reinterpret_cast(query); + + // // AVX512 constant: 128 in each byte (cast to int8_t, which becomes -128 + // // in signed representation, but addition works correctly due to two's + // // complement arithmetic) + const __m512i offset = _mm512_set1_epi8(static_cast(128)); + // + int i = 0; + // // Process 64 bytes at a time using AVX512 + for (; i + 64 <= d; i += 64) { + __m512i data = + _mm512_loadu_si512(reinterpret_cast(input + i)); + __m512i result = _mm512_add_epi8(data, offset); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output + i), result); + } + + // Handle remaining elements with scalar loop + for (; i < d; ++i) { + output[i] = static_cast(static_cast(input[i]) + 128); + } +#else + (void)query; + (void)dim; +#endif +} + +} // namespace zvec::turbo diff --git a/src/turbo/euclidean/avx512_impl.h b/src/turbo/euclidean/avx512_impl.h new file mode 100644 index 000000000..8f1510de3 --- /dev/null +++ b/src/turbo/euclidean/avx512_impl.h @@ -0,0 +1,32 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace zvec::turbo { + +void l2_int8_distance_avx512_vnni(const void *a, const void *b, int dim, + float *distance); + +void l2_int8_batch_distance_avx512_vnni(const void *const *vectors, + const void *query, int n, int dim, + float *distances); + +void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim); + + +} // namespace zvec::turbo \ No newline at end of file diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc new file mode 100644 index 000000000..c59041df9 --- /dev/null +++ b/src/turbo/turbo.cc @@ -0,0 +1,75 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "euclidean/avx512_impl.h" + +namespace zvec::turbo { + +DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, + QuantizeType quantize_type) { + if (metric_type == MetricType::kSquaredEuclidean) { + if (data_type == DataType::kInt8) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + return l2_int8_distance_avx512_vnni; + } + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return nullptr; + } + } + } + } + return nullptr; +} + +BatchDistanceFunc get_batch_distance_func(MetricType metric_type, + DataType data_type, + QuantizeType quantize_type) { + if (metric_type == MetricType::kSquaredEuclidean) { + if (data_type == DataType::kInt8) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + return l2_int8_batch_distance_avx512_vnni; + } + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return nullptr; + } + } + } + } + return nullptr; +} + +QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, + DataType data_type, + QuantizeType quantize_type) { + if (metric_type == MetricType::kSquaredEuclidean) { + if (data_type == DataType::kInt8) { + if (quantize_type == QuantizeType::kDefault) { + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { + return l2_int8_query_preprocess_avx512_vnni; + } + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return nullptr; + } + } + } + } + return nullptr; +} + +} // namespace zvec::turbo From 8974bcf3f0267573ffb22344719caca1d5b3f436 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Fri, 13 Mar 2026 17:45:11 +0800 Subject: [PATCH 02/13] format --- src/turbo/euclidean/avx512_impl.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc index 6f6ecce26..bf19779f3 100644 --- a/src/turbo/euclidean/avx512_impl.cc +++ b/src/turbo/euclidean/avx512_impl.cc @@ -5,8 +5,7 @@ namespace zvec::turbo { #if defined(__AVX512VNNI__) -static __attribute__((always_inline)) int32_t HorizontalAdd_INT32_V256( - __m256i v) { +static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { __m256i x1 = _mm256_hadd_epi32(v, v); __m256i x2 = _mm256_hadd_epi32(x1, x1); __m128i x3 = _mm256_extractf128_si256(x2, 1); From f332f2f0b609a77b8009248f2c722b63a60b3b43 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Fri, 13 Mar 2026 17:56:22 +0800 Subject: [PATCH 03/13] fix --- src/turbo/euclidean/avx512_impl.cc | 14 +++++++------- src/turbo/euclidean/avx512_impl.h | 2 +- src/turbo/turbo.cc | 1 - 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc index bf19779f3..06de318cf 100644 --- a/src/turbo/euclidean/avx512_impl.cc +++ b/src/turbo/euclidean/avx512_impl.cc @@ -251,11 +251,11 @@ __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni_impl( temp_results[i] = _mm512_reduce_add_epi32(accs[i]); } for (; dim < dimensionality; ++dim) { - uint q = reinterpret_cast(query)[dim]; + int q = static_cast(reinterpret_cast(query)[dim]); for (int i = 0; i < batch_size; ++i) { temp_results[i] += q * - static_cast(reinterpret_cast(vectors[i])[dim]); + static_cast(reinterpret_cast(vectors[i])[dim]); } } for (int i = 0; i < batch_size; ++i) { @@ -336,13 +336,13 @@ void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim) { const int8_t *input = reinterpret_cast(query); uint8_t *output = reinterpret_cast(query); - // // AVX512 constant: 128 in each byte (cast to int8_t, which becomes -128 - // // in signed representation, but addition works correctly due to two's - // // complement arithmetic) + // AVX512 constant: 128 in each byte (cast to int8_t, which becomes -128 + // in signed representation, but addition works correctly due to two's + // complement arithmetic) const __m512i offset = _mm512_set1_epi8(static_cast(128)); - // + int i = 0; - // // Process 64 bytes at a time using AVX512 + // Process 64 bytes at a time using AVX512 for (; i + 64 <= d; i += 64) { __m512i data = _mm512_loadu_si512(reinterpret_cast(input + i)); diff --git a/src/turbo/euclidean/avx512_impl.h b/src/turbo/euclidean/avx512_impl.h index 8f1510de3..cfcf0bf4d 100644 --- a/src/turbo/euclidean/avx512_impl.h +++ b/src/turbo/euclidean/avx512_impl.h @@ -29,4 +29,4 @@ void l2_int8_batch_distance_avx512_vnni(const void *const *vectors, void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim); -} // namespace zvec::turbo \ No newline at end of file +} // namespace zvec::turbo diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index c59041df9..12b385333 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include #include "euclidean/avx512_impl.h" From cffaa75b1eff0864c769fdc61f15768f59dced11 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Fri, 13 Mar 2026 17:58:28 +0800 Subject: [PATCH 04/13] comment --- src/turbo/euclidean/avx512_impl.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc index 06de318cf..97b7dee70 100644 --- a/src/turbo/euclidean/avx512_impl.cc +++ b/src/turbo/euclidean/avx512_impl.cc @@ -15,6 +15,10 @@ static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { #define FMA_INT8_GENERAL(m, q, sum) sum += static_cast(m * q); +// This is done to align with the previous behavior +// (DistanceMatrixCompute), where SquaredEuclidean +// assumes no preprocessing on the query, and both the query and data are of +// type int8_t. static __attribute__((always_inline)) void ip_int8_distance_avx512_vnni( const void *a, const void *b, int size, float *distance) { const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); From a1001ef233814d7ec867e60a7448cb70265bb288 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Fri, 13 Mar 2026 18:47:50 +0800 Subject: [PATCH 05/13] add guard --- src/turbo/euclidean/avx512_impl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc index 97b7dee70..b891563ec 100644 --- a/src/turbo/euclidean/avx512_impl.cc +++ b/src/turbo/euclidean/avx512_impl.cc @@ -1,5 +1,7 @@ #include "turbo/euclidean/avx512_impl.h" +#if defined(__AVX512VNNI__) #include +#endif #include namespace zvec::turbo { From d08d17c1391d7a97f109016e37d93c6e41871762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=93=E7=AE=80?= Date: Tue, 17 Mar 2026 17:55:33 +0800 Subject: [PATCH 06/13] fix --- src/turbo/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 3f795e9ae..69dd80787 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -9,6 +9,10 @@ if(NOT ANDROID) PROPERTIES COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}" ) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") + # ARM64 architecture - no special march flags needed for now + # NEON implementations can be added here if needed + message(STATUS "turbo: ARM64 detected, skipping x86-specific optimizations") endif() endif() From 7a8e2a4204f3a0c5a128884242f8d905e2619905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=93=E7=AE=80?= Date: Tue, 17 Mar 2026 18:25:53 +0800 Subject: [PATCH 07/13] fix --- src/turbo/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 69dd80787..79a3c139d 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -1,7 +1,7 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) -if(NOT ANDROID) +if(NOT ANDROID AND AUTO_DETECT_ARCH) if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") setup_compiler_march_for_x86(TURBO_MARCH_FLAG_SSE TURBO_MARCH_FLAG_AVX2 TURBO_MARCH_FLAG_AVX512) set_source_files_properties( From cb842fb639ea4f2eff18cfa3fa40a0d4e2fe9e18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=93=E7=AE=80?= Date: Tue, 17 Mar 2026 19:02:03 +0800 Subject: [PATCH 08/13] fix --- src/turbo/CMakeLists.txt | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 79a3c139d..8b068d7f7 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -16,7 +16,19 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) endif() endif() -file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) +# Collect source files explicitly +set(TURBO_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/turbo.cc + ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.cc +) + +set(TURBO_HDRS + ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.h + ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.h +) + +set(ALL_SRCS ${TURBO_SRCS} ${TURBO_HDRS}) cc_library( NAME zvec_turbo STATIC STRICT PACKED From 8ec213fd3d14bc0f3b84a36dd4d2be85fcb228b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=93=E7=AE=80?= Date: Tue, 17 Mar 2026 19:29:55 +0800 Subject: [PATCH 09/13] fix --- examples/c++/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c++/CMakeLists.txt b/examples/c++/CMakeLists.txt index d0dbf8b60..d17315792 100644 --- a/examples/c++/CMakeLists.txt +++ b/examples/c++/CMakeLists.txt @@ -43,7 +43,7 @@ set(zvec_ailego_deps ) set(zvec_core_deps - # empty + zvec_turbo ) set(zvec_db_deps From 2ef289f7a3721694a8cc3cc81e941018df62bcd8 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Tue, 17 Mar 2026 20:16:16 +0800 Subject: [PATCH 10/13] revert some modifications --- cmake/option.cmake | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cmake/option.cmake b/cmake/option.cmake index b9b7361d5..3c0424221 100644 --- a/cmake/option.cmake +++ b/cmake/option.cmake @@ -226,14 +226,11 @@ if(NOT AUTO_DETECT_ARCH) else() # AUTO DETECT - # Heuristic: detect host architecture and probe appropriate flags. - # For x86, per-file march flags are managed via setup_compiler_march_for_x86() - # in each library's CMakeLists.txt, so no global -march is set here to avoid - # conflicting with per-file flags (e.g. -march=icelake, -march=core-avx2). + # Heuristic: detect host architecture and probe appropriate flags if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") _setup_armv8_march() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64") - # intentionally no global -march for x86 in AUTO_DETECT_ARCH mode + _setup_x86_march() else() message(WARNING "Unknown host architecture: ${CMAKE_SYSTEM_PROCESSOR}; no -march= set.") endif() From 642deff02b1e748d9057fa777f61def7decb6a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=93=E7=AE=80?= Date: Tue, 17 Mar 2026 20:44:05 +0800 Subject: [PATCH 11/13] refine --- src/turbo/CMakeLists.txt | 14 +------------- src/turbo/euclidean/avx2_impl.cc | 0 src/turbo/euclidean/avx2_impl.h | 0 src/turbo/turbo.cc | 9 --------- 4 files changed, 1 insertion(+), 22 deletions(-) delete mode 100644 src/turbo/euclidean/avx2_impl.cc delete mode 100644 src/turbo/euclidean/avx2_impl.h diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt index 8b068d7f7..79a3c139d 100644 --- a/src/turbo/CMakeLists.txt +++ b/src/turbo/CMakeLists.txt @@ -16,19 +16,7 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) endif() endif() -# Collect source files explicitly -set(TURBO_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/turbo.cc - ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.cc - ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.cc -) - -set(TURBO_HDRS - ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.h - ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.h -) - -set(ALL_SRCS ${TURBO_SRCS} ${TURBO_HDRS}) +file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h) cc_library( NAME zvec_turbo STATIC STRICT PACKED diff --git a/src/turbo/euclidean/avx2_impl.cc b/src/turbo/euclidean/avx2_impl.cc deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/turbo/euclidean/avx2_impl.h b/src/turbo/euclidean/avx2_impl.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc index 12b385333..c0331e28a 100644 --- a/src/turbo/turbo.cc +++ b/src/turbo/turbo.cc @@ -26,9 +26,6 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type, if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { return l2_int8_distance_avx512_vnni; } - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - return nullptr; - } } } } @@ -44,9 +41,6 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type, if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { return l2_int8_batch_distance_avx512_vnni; } - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - return nullptr; - } } } } @@ -62,9 +56,6 @@ QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type, if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { return l2_int8_query_preprocess_avx512_vnni; } - if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - return nullptr; - } } } } From ee3af4ef744d23cabd460bf60e403ae89e2c68d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=93=E7=AE=80?= Date: Tue, 17 Mar 2026 20:48:05 +0800 Subject: [PATCH 12/13] fix bound check --- src/turbo/euclidean/avx512_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc index b891563ec..bee702dd0 100644 --- a/src/turbo/euclidean/avx512_impl.cc +++ b/src/turbo/euclidean/avx512_impl.cc @@ -278,7 +278,7 @@ static __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni( for (; i + batch_size <= n; i += batch_size) { std::array prefetch_ptrs; for (int j = 0; j < batch_size; ++j) { - if (i + j + batch_size * prefetch_step <= n) { + if (i + j + batch_size * prefetch_step < n) { prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step]; } else { prefetch_ptrs[j] = nullptr; From 5fd4a0f605f8e5fb5b9e0d443d0fb8a42fbe0d69 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Tue, 17 Mar 2026 21:06:29 +0800 Subject: [PATCH 13/13] fix prefetch --- src/turbo/euclidean/avx512_impl.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc index bee702dd0..844b25acf 100644 --- a/src/turbo/euclidean/avx512_impl.cc +++ b/src/turbo/euclidean/avx512_impl.cc @@ -240,15 +240,13 @@ __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni_impl( data_regs[i] = _mm512_loadu_si512(reinterpret_cast( reinterpret_cast(vectors[i]) + dim)); } - if (prefetch_ptrs[0]) { - for (int i = 0; i < batch_size; ++i) { + for (int i = 0; i < batch_size; ++i) { + if (prefetch_ptrs[i]) { _mm_prefetch( reinterpret_cast( reinterpret_cast(prefetch_ptrs[i]) + dim), _MM_HINT_T0); } - } - for (int i = 0; i < batch_size; ++i) { accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]); } }