From a19b1480276857200dfdd6301e83a9e4545fc1d7 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Fri, 13 Mar 2026 17:28:43 +0800
Subject: [PATCH 01/13] introduce turbo

---
 cmake/option.cmake                          |   7 +-
 src/CMakeLists.txt                          |   1 +
 src/core/CMakeLists.txt                     |   2 +-
 src/core/metric/CMakeLists.txt              |   2 +-
 src/core/metric/quantized_integer_metric.cc |  19 +
 src/include/zvec/turbo/turbo.h              |  53 +++
 src/turbo/CMakeLists.txt                    |  21 ++
 src/turbo/euclidean/avx2_impl.cc            |   0
 src/turbo/euclidean/avx2_impl.h             |   0
 src/turbo/euclidean/avx512_impl.cc          | 364 ++++++++++++++++++++
 src/turbo/euclidean/avx512_impl.h           |  32 ++
 src/turbo/turbo.cc                          |  75 ++++
 12 files changed, 572 insertions(+), 4 deletions(-)
 create mode 100644 src/include/zvec/turbo/turbo.h
 create mode 100644 src/turbo/CMakeLists.txt
 create mode 100644 src/turbo/euclidean/avx2_impl.cc
 create mode 100644 src/turbo/euclidean/avx2_impl.h
 create mode 100644 src/turbo/euclidean/avx512_impl.cc
 create mode 100644 src/turbo/euclidean/avx512_impl.h
 create mode 100644 src/turbo/turbo.cc

diff --git a/cmake/option.cmake b/cmake/option.cmake
index 3c0424221..b9b7361d5 100644
--- a/cmake/option.cmake
+++ b/cmake/option.cmake
@@ -226,11 +226,14 @@ if(NOT AUTO_DETECT_ARCH)
 
 else()
   # AUTO DETECT
-  # Heuristic: detect host architecture and probe appropriate flags
+  # Heuristic: detect host architecture and probe appropriate flags.
+  # For x86, per-file march flags are managed via setup_compiler_march_for_x86()
+  # in each library's CMakeLists.txt, so no global -march is set here to avoid
+  # conflicting with per-file flags (e.g. -march=icelake, -march=core-avx2).
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
     _setup_armv8_march()
   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64")
-    _setup_x86_march()
+    # intentionally no global -march for x86 in AUTO_DETECT_ARCH mode
   else()
     message(WARNING "Unknown host architecture: ${CMAKE_SYSTEM_PROCESSOR}; no -march= set.")
   endif()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c516187c7..c516bf4a8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,6 +8,7 @@ git_version(ZVEC_VERSION ${CMAKE_CURRENT_SOURCE_DIR})
 cc_directory(ailego)
 cc_directory(core)
 cc_directory(db)
+cc_directory(turbo)
 if(BUILD_PYTHON_BINDINGS)
     cc_directory(binding)
 endif()
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 7742db594..5f696c085 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -15,7 +15,7 @@ file(GLOB_RECURSE ALL_CORE_SRCS *.cc *.c *.h)
 cc_library(
     NAME zvec_core STATIC STRICT PACKED
     SRCS ${ALL_CORE_SRCS}
-    LIBS zvec_ailego sparsehash magic_enum
+    LIBS zvec_ailego zvec_turbo sparsehash magic_enum
     INCS . ${PROJECT_ROOT_DIR}/src/core
     VERSION "${GIT_SRCS_VER}"
 )
\ No newline at end of file
diff --git a/src/core/metric/CMakeLists.txt b/src/core/metric/CMakeLists.txt
index cbc1049f1..55dfc901e 100644
--- a/src/core/metric/CMakeLists.txt
+++ b/src/core/metric/CMakeLists.txt
@@ -5,7 +5,7 @@ cc_library(
     NAME core_metric 
     STATIC SHARED STRICT ALWAYS_LINK
     SRCS *.cc
-    LIBS zvec_ailego core_framework 
+    LIBS zvec_ailego zvec_turbo core_framework 
     INCS . ${PROJECT_ROOT_DIR}/src/core
     VERSION "${PROXIMA_ZVEC_VERSION}"
   )
diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc
index 2b4e757a2..4b7d6ed31 100644
--- a/src/core/metric/quantized_integer_metric.cc
+++ b/src/core/metric/quantized_integer_metric.cc
@@ -18,6 +18,7 @@
 #include <ailego/math_batch/distance_batch.h>
 #include <zvec/core/framework/index_error.h>
 #include <zvec/core/framework/index_factory.h>
+#include <zvec/turbo/turbo.h>
 #include "metric_params.h"
 #include "quantized_integer_metric_batch.h"
 #include "quantized_integer_metric_matrix.h"
@@ -95,6 +96,12 @@ class QuantizedIntegerMetric : public IndexMetric {
     switch (origin_metric_type_) {
       case MetricType::kSquaredEuclidean:
         if (meta_.data_type() == IndexMeta::DataType::DT_INT8) {
+          auto turbo_ret = turbo::get_distance_func(
+              turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8,
+              turbo::QuantizeType::kDefault);
+          if (turbo_ret) {
+            return turbo_ret;
+          }
           return DistanceMatrixCompute<SquaredEuclidean, int8_t>(m, n);
         }
         if (meta_.data_type() == IndexMeta::DataType::DT_INT4) {
@@ -146,6 +153,12 @@ class QuantizedIntegerMetric : public IndexMetric {
     switch (origin_metric_type_) {
       case MetricType::kSquaredEuclidean:
         if (meta_.data_type() == IndexMeta::DataType::DT_INT8) {
+          auto turbo_ret = turbo::get_batch_distance_func(
+              turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8,
+              turbo::QuantizeType::kDefault);
+          if (turbo_ret) {
+            return turbo_ret;
+          }
           return reinterpret_cast<IndexMetric::MatrixBatchDistanceHandle>(
               BaseDistanceBatchWithScoreUnquantized<SquaredEuclidean, int8_t,
                                                     12, 2>::ComputeBatch);
@@ -268,6 +281,12 @@ class QuantizedIntegerMetric : public IndexMetric {
           int8_t, 1, 1>::GetQueryPreprocessFunc();
     } else if (origin_metric_type_ == MetricType::kSquaredEuclidean &&
                meta_.data_type() == IndexMeta::DataType::DT_INT8) {
+      auto turbo_ret = turbo::get_query_preprocess_func(
+          turbo::MetricType::kSquaredEuclidean, turbo::DataType::kInt8,
+          turbo::QuantizeType::kDefault);
+      if (turbo_ret) {
+        return turbo_ret;
+      }
       return SquaredEuclideanDistanceBatchWithScoreUnquantized<
           int8_t, 1, 1>::GetQueryPreprocessFunc();
     }
diff --git a/src/include/zvec/turbo/turbo.h b/src/include/zvec/turbo/turbo.h
new file mode 100644
index 000000000..d852611a7
--- /dev/null
+++ b/src/include/zvec/turbo/turbo.h
@@ -0,0 +1,53 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <functional>
+#include <zvec/ailego/math_batch/utils.h>
+
+namespace zvec::turbo {
+
+using DistanceFunc =
+    std::function<void(const void *m, const void *q, size_t dim, float *out)>;
+using BatchDistanceFunc = std::function<void(
+    const void **m, const void *q, size_t num, size_t dim, float *out)>;
+using QueryPreprocessFunc =
+    zvec::ailego::DistanceBatch::DistanceBatchQueryPreprocessFunc;
+
+enum class MetricType {
+  kSquaredEuclidean,
+  kUnknown,
+};
+
+enum class DataType {
+  kInt8,
+  kUnknown,
+};
+
+enum class QuantizeType {
+  kDefault,
+};
+
+DistanceFunc get_distance_func(MetricType metric_type, DataType data_type,
+                               QuantizeType quantize_type);
+
+BatchDistanceFunc get_batch_distance_func(MetricType metric_type,
+                                          DataType data_type,
+                                          QuantizeType quantize_type);
+
+QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type,
+                                              DataType data_type,
+                                              QuantizeType quantize_type);
+
+}  // namespace zvec::turbo
diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt
new file mode 100644
index 000000000..3f795e9ae
--- /dev/null
+++ b/src/turbo/CMakeLists.txt
@@ -0,0 +1,21 @@
+include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake)
+include(${PROJECT_ROOT_DIR}/cmake/option.cmake)
+
+if(NOT ANDROID)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64")
+        setup_compiler_march_for_x86(TURBO_MARCH_FLAG_SSE TURBO_MARCH_FLAG_AVX2 TURBO_MARCH_FLAG_AVX512)
+        set_source_files_properties(
+            ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.cc
+            PROPERTIES
+            COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}"
+        )
+    endif()
+endif()
+
+file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h)
+
+cc_library(
+    NAME zvec_turbo STATIC STRICT PACKED
+    SRCS ${ALL_SRCS}
+    INCS ${PROJECT_ROOT_DIR}/src/include/zvec
+)
diff --git a/src/turbo/euclidean/avx2_impl.cc b/src/turbo/euclidean/avx2_impl.cc
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/turbo/euclidean/avx2_impl.h b/src/turbo/euclidean/avx2_impl.h
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
new file mode 100644
index 000000000..6f6ecce26
--- /dev/null
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -0,0 +1,364 @@
+#include "turbo/euclidean/avx512_impl.h"
+#include <immintrin.h>
+#include <array>
+
+namespace zvec::turbo {
+
+#if defined(__AVX512VNNI__)
+static __attribute__((always_inline)) int32_t HorizontalAdd_INT32_V256(
+    __m256i v) {
+  __m256i x1 = _mm256_hadd_epi32(v, v);
+  __m256i x2 = _mm256_hadd_epi32(x1, x1);
+  __m128i x3 = _mm256_extractf128_si256(x2, 1);
+  __m128i x4 = _mm_add_epi32(_mm256_castsi256_si128(x2), x3);
+  return _mm_cvtsi128_si32(x4);
+}
+
+#define FMA_INT8_GENERAL(m, q, sum) sum += static_cast<float>(m * q);
+
+static __attribute__((always_inline)) void ip_int8_distance_avx512_vnni(
+    const void *a, const void *b, int size, float *distance) {
+  const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001);
+  const __m128i ONES_INT16_SSE = _mm_set1_epi32(0x00010001);
+
+  const int8_t *lhs = reinterpret_cast<const int8_t *>(a);
+  const int8_t *rhs = reinterpret_cast<const int8_t *>(b);
+
+  const int8_t *last = lhs + size;
+  const int8_t *last_aligned = lhs + ((size >> 6) << 6);
+
+  float result = 0.0f;
+
+  __m256i ymm_sum_0 = _mm256_setzero_si256();
+  __m256i ymm_sum_1 = _mm256_setzero_si256();
+
+  if (((uintptr_t)lhs & 0x1f) == 0 && ((uintptr_t)rhs & 0x1f) == 0) {
+    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
+      __m256i ymm_lhs_0 = _mm256_load_si256((const __m256i *)(lhs + 0));
+      __m256i ymm_lhs_1 = _mm256_load_si256((const __m256i *)(lhs + 32));
+      __m256i ymm_rhs_0 = _mm256_load_si256((const __m256i *)(rhs + 0));
+      __m256i ymm_rhs_1 = _mm256_load_si256((const __m256i *)(rhs + 32));
+
+      ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0);
+      ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1);
+      ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0);
+      ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1);
+
+      ymm_sum_0 = _mm256_add_epi32(
+          _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0),
+                            ONES_INT16_AVX),
+          ymm_sum_0);
+      ymm_sum_1 = _mm256_add_epi32(
+          _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1),
+                            ONES_INT16_AVX),
+          ymm_sum_1);
+    }
+
+    if (last >= last_aligned + 32) {
+      __m256i ymm_lhs = _mm256_load_si256((const __m256i *)lhs);
+      __m256i ymm_rhs = _mm256_load_si256((const __m256i *)rhs);
+      ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs);
+      ymm_rhs = _mm256_abs_epi8(ymm_rhs);
+      ymm_sum_0 = _mm256_add_epi32(
+          _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs),
+                            ONES_INT16_AVX),
+          ymm_sum_0);
+      lhs += 32;
+      rhs += 32;
+    }
+
+    if (last >= lhs + 16) {
+      __m128i xmm_lhs = _mm_load_si128((const __m128i *)lhs);
+      __m128i xmm_rhs = _mm_load_si128((const __m128i *)rhs);
+      xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs);
+      xmm_rhs = _mm_abs_epi8(xmm_rhs);
+      ymm_sum_0 = _mm256_add_epi32(
+          _mm256_set_m128i(_mm_setzero_si128(),
+                           _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs),
+                                          ONES_INT16_SSE)),
+          ymm_sum_0);
+      lhs += 16;
+      rhs += 16;
+    }
+  } else {
+    for (; lhs != last_aligned; lhs += 64, rhs += 64) {
+      __m256i ymm_lhs_0 = _mm256_loadu_si256((const __m256i *)(lhs + 0));
+      __m256i ymm_lhs_1 = _mm256_loadu_si256((const __m256i *)(lhs + 32));
+      __m256i ymm_rhs_0 = _mm256_loadu_si256((const __m256i *)(rhs + 0));
+      __m256i ymm_rhs_1 = _mm256_loadu_si256((const __m256i *)(rhs + 32));
+
+      ymm_lhs_0 = _mm256_sign_epi8(ymm_lhs_0, ymm_rhs_0);
+      ymm_lhs_1 = _mm256_sign_epi8(ymm_lhs_1, ymm_rhs_1);
+      ymm_rhs_0 = _mm256_abs_epi8(ymm_rhs_0);
+      ymm_rhs_1 = _mm256_abs_epi8(ymm_rhs_1);
+
+      ymm_sum_0 = _mm256_add_epi32(
+          _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_0, ymm_lhs_0),
+                            ONES_INT16_AVX),
+          ymm_sum_0);
+      ymm_sum_1 = _mm256_add_epi32(
+          _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs_1, ymm_lhs_1),
+                            ONES_INT16_AVX),
+          ymm_sum_1);
+    }
+
+    if (last >= last_aligned + 32) {
+      __m256i ymm_lhs = _mm256_loadu_si256((const __m256i *)lhs);
+      __m256i ymm_rhs = _mm256_loadu_si256((const __m256i *)rhs);
+      ymm_lhs = _mm256_sign_epi8(ymm_lhs, ymm_rhs);
+      ymm_rhs = _mm256_abs_epi8(ymm_rhs);
+      ymm_sum_0 = _mm256_add_epi32(
+          _mm256_madd_epi16(_mm256_maddubs_epi16(ymm_rhs, ymm_lhs),
+                            ONES_INT16_AVX),
+          ymm_sum_0);
+      lhs += 32;
+      rhs += 32;
+    }
+
+    if (last >= lhs + 16) {
+      __m128i xmm_lhs = _mm_loadu_si128((const __m128i *)lhs);
+      __m128i xmm_rhs = _mm_loadu_si128((const __m128i *)rhs);
+      xmm_lhs = _mm_sign_epi8(xmm_lhs, xmm_rhs);
+      xmm_rhs = _mm_abs_epi8(xmm_rhs);
+      ymm_sum_0 = _mm256_add_epi32(
+          _mm256_set_m128i(_mm_setzero_si128(),
+                           _mm_madd_epi16(_mm_maddubs_epi16(xmm_rhs, xmm_lhs),
+                                          ONES_INT16_SSE)),
+          ymm_sum_0);
+      lhs += 16;
+      rhs += 16;
+    }
+  }
+  result = static_cast<float>(
+      HorizontalAdd_INT32_V256(_mm256_add_epi32(ymm_sum_0, ymm_sum_1)));
+
+  switch (last - lhs) {
+    case 15:
+      FMA_INT8_GENERAL(lhs[14], rhs[14], result)
+      /* FALLTHRU */
+    case 14:
+      FMA_INT8_GENERAL(lhs[13], rhs[13], result)
+      /* FALLTHRU */
+    case 13:
+      FMA_INT8_GENERAL(lhs[12], rhs[12], result)
+      /* FALLTHRU */
+    case 12:
+      FMA_INT8_GENERAL(lhs[11], rhs[11], result)
+      /* FALLTHRU */
+    case 11:
+      FMA_INT8_GENERAL(lhs[10], rhs[10], result)
+      /* FALLTHRU */
+    case 10:
+      FMA_INT8_GENERAL(lhs[9], rhs[9], result)
+      /* FALLTHRU */
+    case 9:
+      FMA_INT8_GENERAL(lhs[8], rhs[8], result)
+      /* FALLTHRU */
+    case 8:
+      FMA_INT8_GENERAL(lhs[7], rhs[7], result)
+      /* FALLTHRU */
+    case 7:
+      FMA_INT8_GENERAL(lhs[6], rhs[6], result)
+      /* FALLTHRU */
+    case 6:
+      FMA_INT8_GENERAL(lhs[5], rhs[5], result)
+      /* FALLTHRU */
+    case 5:
+      FMA_INT8_GENERAL(lhs[4], rhs[4], result)
+      /* FALLTHRU */
+    case 4:
+      FMA_INT8_GENERAL(lhs[3], rhs[3], result)
+      /* FALLTHRU */
+    case 3:
+      FMA_INT8_GENERAL(lhs[2], rhs[2], result)
+      /* FALLTHRU */
+    case 2:
+      FMA_INT8_GENERAL(lhs[1], rhs[1], result)
+      /* FALLTHRU */
+    case 1:
+      FMA_INT8_GENERAL(lhs[0], rhs[0], result)
+  }
+  *distance = result;
+}
+#endif
+
+void l2_int8_distance_avx512_vnni(const void *a, const void *b, int dim,
+                                  float *distance) {
+#if defined(__AVX512VNNI__)
+  const int d = dim - 20;
+  ip_int8_distance_avx512_vnni(a, b, d, distance);
+
+  const float *a_tail =
+      reinterpret_cast<const float *>(reinterpret_cast<const int8_t *>(a) + d);
+  const float *b_tail =
+      reinterpret_cast<const float *>(reinterpret_cast<const int8_t *>(b) + d);
+
+  float qa = b_tail[0];
+  float qb = b_tail[1];
+  float qs = b_tail[2];
+  float qs2 = b_tail[3];
+
+  const float sum = qa * qs;
+  const float sum2 = qa * qa * qs2;
+
+  float ma = a_tail[0];
+  float mb = a_tail[1];
+  float ms = a_tail[2];
+  float ms2 = a_tail[3];
+
+  *distance = ma * ma * ms2 + sum2 - 2 * ma * qa * *distance +
+              (mb - qb) * (mb - qb) * d + 2 * (mb - qb) * (ms * ma - sum);
+#else
+  (void)a;
+  (void)b;
+  (void)dim;
+  (void)distance;
+#endif
+}
+
+#if defined(__AVX512VNNI__)
+template <int batch_size>
+__attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni_impl(
+    const void *query, const void *const *vectors,
+    const std::array<const void *, batch_size> &prefetch_ptrs,
+    int dimensionality, float *distances) {
+  __m512i accs[batch_size];
+  for (int i = 0; i < batch_size; ++i) {
+    accs[i] = _mm512_setzero_si512();
+  }
+  int dim = 0;
+  for (; dim + 64 <= dimensionality; dim += 64) {
+    __m512i q = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(
+        reinterpret_cast<const int8_t *>(query) + dim));
+    __m512i data_regs[batch_size];
+    for (int i = 0; i < batch_size; ++i) {
+      data_regs[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(
+          reinterpret_cast<const int8_t *>(vectors[i]) + dim));
+    }
+    if (prefetch_ptrs[0]) {
+      for (int i = 0; i < batch_size; ++i) {
+        _mm_prefetch(
+            reinterpret_cast<const char *>(
+                reinterpret_cast<const int8_t *>(prefetch_ptrs[i]) + dim),
+            _MM_HINT_T0);
+      }
+    }
+    for (int i = 0; i < batch_size; ++i) {
+      accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]);
+    }
+  }
+  std::array<int, batch_size> temp_results{};
+  for (int i = 0; i < batch_size; ++i) {
+    temp_results[i] = _mm512_reduce_add_epi32(accs[i]);
+  }
+  for (; dim < dimensionality; ++dim) {
+    uint q = reinterpret_cast<const uint8_t *>(query)[dim];
+    for (int i = 0; i < batch_size; ++i) {
+      temp_results[i] +=
+          q *
+          static_cast<int>(reinterpret_cast<const uint8_t *>(vectors[i])[dim]);
+    }
+  }
+  for (int i = 0; i < batch_size; ++i) {
+    distances[i] = static_cast<float>(temp_results[i]);
+  }
+}
+
+static __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni(
+    const void *const *vectors, const void *query, int n, int dim,
+    float *distances) {
+  static constexpr int batch_size = 2;
+  static constexpr int prefetch_step = 2;
+  int i = 0;
+  for (; i + batch_size <= n; i += batch_size) {
+    std::array<const void *, batch_size> prefetch_ptrs;
+    for (int j = 0; j < batch_size; ++j) {
+      if (i + j + batch_size * prefetch_step <= n) {
+        prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step];
+      } else {
+        prefetch_ptrs[j] = nullptr;
+      }
+    }
+    ip_int8_batch_distance_avx512_vnni_impl<batch_size>(
+        query, &vectors[i], prefetch_ptrs, dim, distances + i);
+  }
+  for (; i < n; i++) {
+    std::array<const void *, 1> prefetch_ptrs{nullptr};
+    ip_int8_batch_distance_avx512_vnni_impl<1>(
+        query, &vectors[i], prefetch_ptrs, dim, distances + i);
+  }
+}
+#endif
+
+void l2_int8_batch_distance_avx512_vnni(const void *const *vectors,
+                                        const void *query, int n, int dim,
+                                        float *distances) {
+#if defined(__AVX512VNNI__)
+  int original_dim = dim - 20;
+
+  ip_int8_batch_distance_avx512_vnni(vectors, query, n, original_dim,
+                                     distances);
+  const float *q_tail = reinterpret_cast<const float *>(
+      reinterpret_cast<const int8_t *>(query) + original_dim);
+  float qa = q_tail[0];
+  float qb = q_tail[1];
+  float qs = q_tail[2];
+  float qs2 = q_tail[3];
+
+  const float sum = qa * qs;
+  const float sum2 = qa * qa * qs2;
+  for (int i = 0; i < n; ++i) {
+    const float *m_tail = reinterpret_cast<const float *>(
+        reinterpret_cast<const int8_t *>(vectors[i]) + original_dim);
+    float ma = m_tail[0];
+    float mb = m_tail[1];
+    float ms = m_tail[2];
+    float ms2 = m_tail[3];
+    int int8_sum = reinterpret_cast<const int *>(m_tail)[4];
+    float &result = distances[i];
+    result -= 128 * int8_sum;
+    result = ma * ma * ms2 + sum2 - 2 * ma * qa * result +
+             (mb - qb) * (mb - qb) * original_dim +
+             2 * (mb - qb) * (ms * ma - sum);
+  }
+#else
+  (void)vectors;
+  (void)query;
+  (void)n;
+  (void)dim;
+  (void)distances;
+#endif
+}
+
+void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim) {
+#if defined(__AVX512VNNI__)
+  int d = dim - 20;
+
+  const int8_t *input = reinterpret_cast<const int8_t *>(query);
+  uint8_t *output = reinterpret_cast<uint8_t *>(query);
+
+  // // AVX512 constant: 128 in each byte (cast to int8_t, which becomes -128
+  // // in signed representation, but addition works correctly due to two's
+  // // complement arithmetic)
+  const __m512i offset = _mm512_set1_epi8(static_cast<int8_t>(128));
+  //
+  int i = 0;
+  // // Process 64 bytes at a time using AVX512
+  for (; i + 64 <= d; i += 64) {
+    __m512i data =
+        _mm512_loadu_si512(reinterpret_cast<const __m512i *>(input + i));
+    __m512i result = _mm512_add_epi8(data, offset);
+    _mm512_storeu_si512(reinterpret_cast<__m512i *>(output + i), result);
+  }
+
+  // Handle remaining elements with scalar loop
+  for (; i < d; ++i) {
+    output[i] = static_cast<uint8_t>(static_cast<int>(input[i]) + 128);
+  }
+#else
+  (void)query;
+  (void)dim;
+#endif
+}
+
+}  // namespace zvec::turbo
diff --git a/src/turbo/euclidean/avx512_impl.h b/src/turbo/euclidean/avx512_impl.h
new file mode 100644
index 000000000..8f1510de3
--- /dev/null
+++ b/src/turbo/euclidean/avx512_impl.h
@@ -0,0 +1,32 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace zvec::turbo {
+
+void l2_int8_distance_avx512_vnni(const void *a, const void *b, int dim,
+                                  float *distance);
+
+void l2_int8_batch_distance_avx512_vnni(const void *const *vectors,
+                                        const void *query, int n, int dim,
+                                        float *distances);
+
+void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim);
+
+
+}  // namespace zvec::turbo
\ No newline at end of file
diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc
new file mode 100644
index 000000000..c59041df9
--- /dev/null
+++ b/src/turbo/turbo.cc
@@ -0,0 +1,75 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <ailego/internal/cpu_features.h>
+#include <zvec/turbo/turbo.h>
+#include "euclidean/avx512_impl.h"
+
+namespace zvec::turbo {
+
+DistanceFunc get_distance_func(MetricType metric_type, DataType data_type,
+                               QuantizeType quantize_type) {
+  if (metric_type == MetricType::kSquaredEuclidean) {
+    if (data_type == DataType::kInt8) {
+      if (quantize_type == QuantizeType::kDefault) {
+        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) {
+          return l2_int8_distance_avx512_vnni;
+        }
+        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
+          return nullptr;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+BatchDistanceFunc get_batch_distance_func(MetricType metric_type,
+                                          DataType data_type,
+                                          QuantizeType quantize_type) {
+  if (metric_type == MetricType::kSquaredEuclidean) {
+    if (data_type == DataType::kInt8) {
+      if (quantize_type == QuantizeType::kDefault) {
+        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) {
+          return l2_int8_batch_distance_avx512_vnni;
+        }
+        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
+          return nullptr;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type,
+                                              DataType data_type,
+                                              QuantizeType quantize_type) {
+  if (metric_type == MetricType::kSquaredEuclidean) {
+    if (data_type == DataType::kInt8) {
+      if (quantize_type == QuantizeType::kDefault) {
+        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) {
+          return l2_int8_query_preprocess_avx512_vnni;
+        }
+        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
+          return nullptr;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+}  // namespace zvec::turbo

From 8974bcf3f0267573ffb22344719caca1d5b3f436 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Fri, 13 Mar 2026 17:45:11 +0800
Subject: [PATCH 02/13] format

---
 src/turbo/euclidean/avx512_impl.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
index 6f6ecce26..bf19779f3 100644
--- a/src/turbo/euclidean/avx512_impl.cc
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -5,8 +5,7 @@
 namespace zvec::turbo {
 
 #if defined(__AVX512VNNI__)
-static __attribute__((always_inline)) int32_t HorizontalAdd_INT32_V256(
-    __m256i v) {
+static inline int32_t HorizontalAdd_INT32_V256(__m256i v) {
   __m256i x1 = _mm256_hadd_epi32(v, v);
   __m256i x2 = _mm256_hadd_epi32(x1, x1);
   __m128i x3 = _mm256_extractf128_si256(x2, 1);

From f332f2f0b609a77b8009248f2c722b63a60b3b43 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Fri, 13 Mar 2026 17:56:22 +0800
Subject: [PATCH 03/13] fix

---
 src/turbo/euclidean/avx512_impl.cc | 14 +++++++-------
 src/turbo/euclidean/avx512_impl.h  |  2 +-
 src/turbo/turbo.cc                 |  1 -
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
index bf19779f3..06de318cf 100644
--- a/src/turbo/euclidean/avx512_impl.cc
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -251,11 +251,11 @@ __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni_impl(
     temp_results[i] = _mm512_reduce_add_epi32(accs[i]);
   }
   for (; dim < dimensionality; ++dim) {
-    uint q = reinterpret_cast<const uint8_t *>(query)[dim];
+    int q = static_cast<int>(reinterpret_cast<const uint8_t *>(query)[dim]);
     for (int i = 0; i < batch_size; ++i) {
       temp_results[i] +=
           q *
-          static_cast<int>(reinterpret_cast<const uint8_t *>(vectors[i])[dim]);
+          static_cast<int>(reinterpret_cast<const int8_t *>(vectors[i])[dim]);
     }
   }
   for (int i = 0; i < batch_size; ++i) {
@@ -336,13 +336,13 @@ void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim) {
   const int8_t *input = reinterpret_cast<const int8_t *>(query);
   uint8_t *output = reinterpret_cast<uint8_t *>(query);
 
-  // // AVX512 constant: 128 in each byte (cast to int8_t, which becomes -128
-  // // in signed representation, but addition works correctly due to two's
-  // // complement arithmetic)
+  // AVX512 constant: 128 in each byte (cast to int8_t, which becomes -128
+  // in signed representation, but addition works correctly due to two's
+  // complement arithmetic)
   const __m512i offset = _mm512_set1_epi8(static_cast<int8_t>(128));
-  //
+
   int i = 0;
-  // // Process 64 bytes at a time using AVX512
+  // Process 64 bytes at a time using AVX512
   for (; i + 64 <= d; i += 64) {
     __m512i data =
         _mm512_loadu_si512(reinterpret_cast<const __m512i *>(input + i));
diff --git a/src/turbo/euclidean/avx512_impl.h b/src/turbo/euclidean/avx512_impl.h
index 8f1510de3..cfcf0bf4d 100644
--- a/src/turbo/euclidean/avx512_impl.h
+++ b/src/turbo/euclidean/avx512_impl.h
@@ -29,4 +29,4 @@ void l2_int8_batch_distance_avx512_vnni(const void *const *vectors,
 void l2_int8_query_preprocess_avx512_vnni(void *query, size_t dim);
 
 
-}  // namespace zvec::turbo
\ No newline at end of file
+}  // namespace zvec::turbo
diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc
index c59041df9..12b385333 100644
--- a/src/turbo/turbo.cc
+++ b/src/turbo/turbo.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <iostream>
 #include <ailego/internal/cpu_features.h>
 #include <zvec/turbo/turbo.h>
 #include "euclidean/avx512_impl.h"

From cffaa75b1eff0864c769fdc61f15768f59dced11 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Fri, 13 Mar 2026 17:58:28 +0800
Subject: [PATCH 04/13] comment

---
 src/turbo/euclidean/avx512_impl.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
index 06de318cf..97b7dee70 100644
--- a/src/turbo/euclidean/avx512_impl.cc
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -15,6 +15,10 @@ static inline int32_t HorizontalAdd_INT32_V256(__m256i v) {
 
 #define FMA_INT8_GENERAL(m, q, sum) sum += static_cast<float>(m * q);
 
+// This is done to align with the previous behavior
+// (DistanceMatrixCompute<SquaredEuclidean, int8_t>), where SquaredEuclidean
+// assumes no preprocessing on the query, and both the query and data are of
+// type int8_t.
 static __attribute__((always_inline)) void ip_int8_distance_avx512_vnni(
     const void *a, const void *b, int size, float *distance) {
   const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001);

From a1001ef233814d7ec867e60a7448cb70265bb288 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Fri, 13 Mar 2026 18:47:50 +0800
Subject: [PATCH 05/13] add guard

---
 src/turbo/euclidean/avx512_impl.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
index 97b7dee70..b891563ec 100644
--- a/src/turbo/euclidean/avx512_impl.cc
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -1,5 +1,7 @@
 #include "turbo/euclidean/avx512_impl.h"
+#if defined(__AVX512VNNI__)
 #include <immintrin.h>
+#endif
 #include <array>
 
 namespace zvec::turbo {

From d08d17c1391d7a97f109016e37d93c6e41871762 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=93=E7=AE=80?= <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 17:55:33 +0800
Subject: [PATCH 06/13] fix

---
 src/turbo/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt
index 3f795e9ae..69dd80787 100644
--- a/src/turbo/CMakeLists.txt
+++ b/src/turbo/CMakeLists.txt
@@ -9,6 +9,10 @@ if(NOT ANDROID)
             PROPERTIES
             COMPILE_FLAGS "${TURBO_MARCH_FLAG_AVX512}"
         )
+    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
+        # ARM64 architecture - no special march flags needed for now
+        # NEON implementations can be added here if needed
+        message(STATUS "turbo: ARM64 detected, skipping x86-specific optimizations")
     endif()
 endif()
 

From 7a8e2a4204f3a0c5a128884242f8d905e2619905 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=93=E7=AE=80?= <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 18:25:53 +0800
Subject: [PATCH 07/13] fix

---
 src/turbo/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt
index 69dd80787..79a3c139d 100644
--- a/src/turbo/CMakeLists.txt
+++ b/src/turbo/CMakeLists.txt
@@ -1,7 +1,7 @@
 include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake)
 include(${PROJECT_ROOT_DIR}/cmake/option.cmake)
 
-if(NOT ANDROID)
+if(NOT ANDROID AND AUTO_DETECT_ARCH)
     if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64")
         setup_compiler_march_for_x86(TURBO_MARCH_FLAG_SSE TURBO_MARCH_FLAG_AVX2 TURBO_MARCH_FLAG_AVX512)
         set_source_files_properties(

From cb842fb639ea4f2eff18cfa3fa40a0d4e2fe9e18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=93=E7=AE=80?= <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 19:02:03 +0800
Subject: [PATCH 08/13] fix

---
 src/turbo/CMakeLists.txt | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt
index 79a3c139d..8b068d7f7 100644
--- a/src/turbo/CMakeLists.txt
+++ b/src/turbo/CMakeLists.txt
@@ -16,7 +16,19 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH)
     endif()
 endif()
 
-file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h)
+# Collect source files explicitly
+set(TURBO_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/turbo.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.cc
+)
+
+set(TURBO_HDRS
+    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.h
+)
+
+set(ALL_SRCS ${TURBO_SRCS} ${TURBO_HDRS})
 
 cc_library(
     NAME zvec_turbo STATIC STRICT PACKED

From 8ec213fd3d14bc0f3b84a36dd4d2be85fcb228b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=93=E7=AE=80?= <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 19:29:55 +0800
Subject: [PATCH 09/13] fix

---
 examples/c++/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/c++/CMakeLists.txt b/examples/c++/CMakeLists.txt
index d0dbf8b60..d17315792 100644
--- a/examples/c++/CMakeLists.txt
+++ b/examples/c++/CMakeLists.txt
@@ -43,7 +43,7 @@ set(zvec_ailego_deps
 )
 
 set(zvec_core_deps
-    # empty
+    zvec_turbo
 )
 
 set(zvec_db_deps

From 2ef289f7a3721694a8cc3cc81e941018df62bcd8 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 20:16:16 +0800
Subject: [PATCH 10/13] revert some modifications

---
 cmake/option.cmake | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cmake/option.cmake b/cmake/option.cmake
index b9b7361d5..3c0424221 100644
--- a/cmake/option.cmake
+++ b/cmake/option.cmake
@@ -226,14 +226,11 @@ if(NOT AUTO_DETECT_ARCH)
 
 else()
   # AUTO DETECT
-  # Heuristic: detect host architecture and probe appropriate flags.
-  # For x86, per-file march flags are managed via setup_compiler_march_for_x86()
-  # in each library's CMakeLists.txt, so no global -march is set here to avoid
-  # conflicting with per-file flags (e.g. -march=icelake, -march=core-avx2).
+  # Heuristic: detect host architecture and probe appropriate flags
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
     _setup_armv8_march()
   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i686|i386|x64")
-    # intentionally no global -march for x86 in AUTO_DETECT_ARCH mode
+    _setup_x86_march()
   else()
     message(WARNING "Unknown host architecture: ${CMAKE_SYSTEM_PROCESSOR}; no -march= set.")
   endif()

From 642deff02b1e748d9057fa777f61def7decb6a74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=93=E7=AE=80?= <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 20:44:05 +0800
Subject: [PATCH 11/13] refine

---
 src/turbo/CMakeLists.txt         | 14 +-------------
 src/turbo/euclidean/avx2_impl.cc |  0
 src/turbo/euclidean/avx2_impl.h  |  0
 src/turbo/turbo.cc               |  9 ---------
 4 files changed, 1 insertion(+), 22 deletions(-)
 delete mode 100644 src/turbo/euclidean/avx2_impl.cc
 delete mode 100644 src/turbo/euclidean/avx2_impl.h

diff --git a/src/turbo/CMakeLists.txt b/src/turbo/CMakeLists.txt
index 8b068d7f7..79a3c139d 100644
--- a/src/turbo/CMakeLists.txt
+++ b/src/turbo/CMakeLists.txt
@@ -16,19 +16,7 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH)
     endif()
 endif()
 
-# Collect source files explicitly
-set(TURBO_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/turbo.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.cc
-)
-
-set(TURBO_HDRS
-    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx2_impl.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/euclidean/avx512_impl.h
-)
-
-set(ALL_SRCS ${TURBO_SRCS} ${TURBO_HDRS})
+file(GLOB_RECURSE ALL_SRCS *.cc *.c *.h)
 
 cc_library(
     NAME zvec_turbo STATIC STRICT PACKED
diff --git a/src/turbo/euclidean/avx2_impl.cc b/src/turbo/euclidean/avx2_impl.cc
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/turbo/euclidean/avx2_impl.h b/src/turbo/euclidean/avx2_impl.h
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/turbo/turbo.cc b/src/turbo/turbo.cc
index 12b385333..c0331e28a 100644
--- a/src/turbo/turbo.cc
+++ b/src/turbo/turbo.cc
@@ -26,9 +26,6 @@ DistanceFunc get_distance_func(MetricType metric_type, DataType data_type,
         if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) {
           return l2_int8_distance_avx512_vnni;
         }
-        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
-          return nullptr;
-        }
       }
     }
   }
@@ -44,9 +41,6 @@ BatchDistanceFunc get_batch_distance_func(MetricType metric_type,
         if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) {
           return l2_int8_batch_distance_avx512_vnni;
         }
-        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
-          return nullptr;
-        }
       }
     }
   }
@@ -62,9 +56,6 @@ QueryPreprocessFunc get_query_preprocess_func(MetricType metric_type,
         if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) {
           return l2_int8_query_preprocess_avx512_vnni;
         }
-        if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) {
-          return nullptr;
-        }
       }
     }
   }

From ee3af4ef744d23cabd460bf60e403ae89e2c68d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=93=E7=AE=80?= <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 20:48:05 +0800
Subject: [PATCH 12/13] fix bound check

---
 src/turbo/euclidean/avx512_impl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
index b891563ec..bee702dd0 100644
--- a/src/turbo/euclidean/avx512_impl.cc
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -278,7 +278,7 @@ static __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni(
   for (; i + batch_size <= n; i += batch_size) {
     std::array<const void *, batch_size> prefetch_ptrs;
     for (int j = 0; j < batch_size; ++j) {
-      if (i + j + batch_size * prefetch_step <= n) {
+      if (i + j + batch_size * prefetch_step < n) {
         prefetch_ptrs[j] = vectors[i + j + batch_size * prefetch_step];
       } else {
         prefetch_ptrs[j] = nullptr;

From 5fd4a0f605f8e5fb5b9e0d443d0fb8a42fbe0d69 Mon Sep 17 00:00:00 2001
From: luoxiaojian <lxj193371@alibaba-inc.com>
Date: Tue, 17 Mar 2026 21:06:29 +0800
Subject: [PATCH 13/13] fix prefetch

---
 src/turbo/euclidean/avx512_impl.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/turbo/euclidean/avx512_impl.cc b/src/turbo/euclidean/avx512_impl.cc
index bee702dd0..844b25acf 100644
--- a/src/turbo/euclidean/avx512_impl.cc
+++ b/src/turbo/euclidean/avx512_impl.cc
@@ -240,15 +240,13 @@ __attribute__((always_inline)) void ip_int8_batch_distance_avx512_vnni_impl(
       data_regs[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(
           reinterpret_cast<const int8_t *>(vectors[i]) + dim));
     }
-    if (prefetch_ptrs[0]) {
-      for (int i = 0; i < batch_size; ++i) {
+    for (int i = 0; i < batch_size; ++i) {
+      if (prefetch_ptrs[i]) {
         _mm_prefetch(
             reinterpret_cast<const char *>(
                 reinterpret_cast<const int8_t *>(prefetch_ptrs[i]) + dim),
             _MM_HINT_T0);
       }
-    }
-    for (int i = 0; i < batch_size; ++i) {
       accs[i] = _mm512_dpbusd_epi32(accs[i], q, data_regs[i]);
     }
   }