From 6753daf52f6f94b69ea0cd7cfc8db11afade099a Mon Sep 17 00:00:00 2001 From: jinwei14 Date: Mon, 26 Jan 2026 22:36:24 -0800 Subject: [PATCH 1/4] modify without exec_env_ols --- include/pq_flash_index.h | 6 +++ src/pq_flash_index.cpp | 113 ++++++++++++++++++++++++++++++++++----- 2 files changed, 105 insertions(+), 14 deletions(-) diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index befde1a1a..cf504efdd 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -254,6 +254,12 @@ template class PQFlashIndex tsl::robin_map> _real_to_dummy_map; std::unordered_map _label_map; + // Validate if data type is correct by check each data's vector/neighbor section. + // disk_nnodes, total node count in index, used to validate neighbor index of each node. + // contains_disk_pq_file, flag if indices contains disk pq file(suffix : _disk.index_pq_pivots.bin); + // If it's true, max_node_len use uint8 to calculate data size instead.(check aux_utils.cpp:1399) + bool validate_vector_data_type(uint64_t disk_nnodes, bool contains_disk_pq_file); + #ifdef EXEC_ENV_OLS // Set to a larger value than the actual header to accommodate // any additions we make to the header. This is an outer limit diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 179e9a71e..4203eec5d 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -687,7 +687,6 @@ bool PQFlashIndex::point_has_any_label(uint32_t point_id, const std:: return ret_val; } - template void PQFlashIndex::parse_label_file(std::basic_istream &infile, size_t &num_points_labels) { @@ -980,7 +979,6 @@ template void PQFlashIndex::load_labels ss << "Note: Filter support is enabled but " << dummy_map_file << " file cannot be opened" << std::endl; diskann::cerr << ss.str(); } - } else { @@ -1180,11 +1178,11 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons READ_U64(index_metadata, this->_nvecs_per_sector); } - #ifdef EXEC_ENV_OLS - load_labels(files, _disk_index_file); - #else - load_labels(_disk_index_file); - #endif +#ifdef EXEC_ENV_OLS + load_labels(files, _disk_index_file); +#else + load_labels(_disk_index_file); +#endif diskann::cout << "Disk-Index File Meta-data: "; diskann::cout << "# nodes per sector: " << _nnodes_per_sector; @@ -1209,7 +1207,13 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons this->_max_nthreads = num_threads; #endif - + // Validate data type (called once for both EXEC_ENV_OLS and non-OLS paths) + if (!validate_vector_data_type(disk_nnodes, _use_disk_index_pq)) + { + throw diskann::ANNException("Data type validation failed. Please ensure --data_type matches " + "the type used when building the index.", + -1, __FUNCSIG__, __FILE__, __LINE__); + } #ifdef EXEC_ENV_OLS if (files.fileExists(medoids_file)) { @@ -1469,13 +1473,12 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t if (use_filters) { uint64_t size_to_reserve = std::max(l_search, (std::min((uint64_t)filter_label_count, this->_max_degree) + 1)); retset.reserve(size_to_reserve); - full_retset.reserve(4096); + full_retset.reserve(4096); full_retset_ids.reserve(4096); } else { retset.reserve(l_search + 1); } - uint32_t best_medoid = 0; uint32_t cur_list_size = 0; float best_dist = (std::numeric_limits::max)(); @@ -1538,7 +1541,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t //if we are doing multi-filter search we don't want to restrict the number of IOs //at present. Must revisit this decision later. uint32_t max_ios_for_query = use_filters || (io_limit == 0) ? std::numeric_limits::max() : io_limit; - const std::vector& label_ids = filter_labels; //avoid renaming. + const std::vector& label_ids = filter_labels; //avoid renaming. std::vector lbl_vec; retset.sort(); @@ -1554,7 +1557,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // find new beam uint32_t num_seen = 0; - for (const auto &lbl : label_ids) { // assuming that number of OR labels is // less than max frontier size allowed @@ -1686,7 +1688,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t full_retset.push_back(Neighbor((unsigned)cached_nhood.first, cur_expanded_dist)); } - uint64_t nnbrs = cached_nhood.second.first; uint32_t *node_nbrs = cached_nhood.second.second; @@ -1768,7 +1769,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t { full_retset.push_back(Neighbor(frontier_nhood.first, cur_expanded_dist)); } - + uint32_t *node_nbrs = (node_buf + 1); // compute node_nbrs <-> query dist in PQ space cpu_timer.reset(); @@ -1971,6 +1972,90 @@ template char *PQFlashIndex::getHeaderB } #endif +template +bool PQFlashIndex::validate_vector_data_type(uint64_t disk_nnodes, bool contains_disk_pq_file) +{ + size_t vector_length = contains_disk_pq_file ? _data_dim * sizeof(uint8_t) : _data_dim * sizeof(T); + if (vector_length + sizeof(uint32_t) >= _max_node_len) + { + diskann::cerr << "Vector length : " << vector_length << " and neighbor count size : " << sizeof(uint32_t) + << ", expected less than max node length : " << _max_node_len << std::endl; + diskann::cerr << "Please check if wrong data type with larger size is " + "specified, like use float type to load byte index!" + << std::endl; + return false; + } + + // Borrow thread data for the read + ScratchStoreManager> manager(this->_thread_data); + auto this_thread_data = manager.scratch_space(); + IOContext &ctx = this_thread_data->ctx; + + // Allocate sector-aligned buffer (required for direct I/O) + char *buf = nullptr; + alloc_aligned((void **)&buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); + + // Read first node (located at sector 1, after header in sector 0) + AlignedRead read_request(defaults::SECTOR_LEN, defaults::SECTOR_LEN, buf); + std::vector read_requests; + read_requests.emplace_back(read_request); + reader->read(read_requests, ctx); + +#if defined(_WINDOWS) && defined(USE_BING_INFRA) + if ((*ctx.m_pRequestsStatus)[0] != IOContext::READ_SUCCESS) + { + aligned_free(buf); + diskann::cerr << "Read disk index file " << _disk_index_file << " failed, can't validate data type!" + << std::endl; + return false; + } +#endif + + uint32_t max_degree = static_cast((_max_node_len - vector_length - sizeof(uint32_t)) / sizeof(uint32_t)); + char *first_node = buf; + uint32_t *neighbors = reinterpret_cast(first_node + vector_length); + uint32_t neighbor_count = *neighbors; + + if (neighbor_count > max_degree) + { + aligned_free(buf); + diskann::cerr << "Calculated max neighbor count : " << max_degree + << " and first node neighbor count : " << neighbor_count << ", load data type is not correct!" + << std::endl; + return false; + } + + if (contains_disk_pq_file) + { + size_t real_node_len = _data_dim * sizeof(T) + sizeof(uint32_t) * (max_degree + 1); + if (real_node_len <= defaults::SECTOR_LEN) + { + aligned_free(buf); + diskann::cerr << "Index files contains disk pq file, which means real node length : " << real_node_len + << " should be greater than disk sector length : " << defaults::SECTOR_LEN << std::endl; + diskann::cerr << "Please check if wrong data type with smaller size is " + "specified, like use byte type to load float index!" + << std::endl; + return false; + } + } + + for (uint32_t i = 1; i <= neighbor_count; ++i) + { + if (neighbors[i] >= disk_nnodes) + { + aligned_free(buf); + diskann::cerr << ":Neighbor[" << i - 1 << "], index : " << neighbors[i] + << ", greater than total node count : " << disk_nnodes << ", load data type is not correct!" + << std::endl; + return false; + } + } + + aligned_free(buf); + return true; +} + template std::vector PQFlashIndex::get_pq_vector(std::uint64_t vid) { From 93d0ba20598f09c9220c129d8e10c12fce306d94 Mon Sep 17 00:00:00 2001 From: jinwei14 Date: Mon, 26 Jan 2026 22:43:08 -0800 Subject: [PATCH 2/4] reverting bad formating for reviewing --- src/pq_flash_index.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 4203eec5d..a4cfc140c 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -687,6 +687,7 @@ bool PQFlashIndex::point_has_any_label(uint32_t point_id, const std:: return ret_val; } + template void PQFlashIndex::parse_label_file(std::basic_istream &infile, size_t &num_points_labels) { @@ -979,6 +980,7 @@ template void PQFlashIndex::load_labels ss << "Note: Filter support is enabled but " << dummy_map_file << " file cannot be opened" << std::endl; diskann::cerr << ss.str(); } + } else { @@ -1178,11 +1180,11 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons READ_U64(index_metadata, this->_nvecs_per_sector); } -#ifdef EXEC_ENV_OLS - load_labels(files, _disk_index_file); -#else - load_labels(_disk_index_file); -#endif + #ifdef EXEC_ENV_OLS + load_labels(files, _disk_index_file); + #else + load_labels(_disk_index_file); + #endif diskann::cout << "Disk-Index File Meta-data: "; diskann::cout << "# nodes per sector: " << _nnodes_per_sector; @@ -1473,7 +1475,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t if (use_filters) { uint64_t size_to_reserve = std::max(l_search, (std::min((uint64_t)filter_label_count, this->_max_degree) + 1)); retset.reserve(size_to_reserve); - full_retset.reserve(4096); + full_retset.reserve(4096); full_retset_ids.reserve(4096); } else { retset.reserve(l_search + 1); @@ -1541,7 +1543,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t //if we are doing multi-filter search we don't want to restrict the number of IOs //at present. Must revisit this decision later. uint32_t max_ios_for_query = use_filters || (io_limit == 0) ? std::numeric_limits::max() : io_limit; - const std::vector& label_ids = filter_labels; //avoid renaming. + const std::vector& label_ids = filter_labels; //avoid renaming. std::vector lbl_vec; retset.sort(); @@ -1557,6 +1559,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t // find new beam uint32_t num_seen = 0; + for (const auto &lbl : label_ids) { // assuming that number of OR labels is // less than max frontier size allowed @@ -1770,6 +1773,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t full_retset.push_back(Neighbor(frontier_nhood.first, cur_expanded_dist)); } + uint32_t *node_nbrs = (node_buf + 1); // compute node_nbrs <-> query dist in PQ space cpu_timer.reset(); From 92a17ac398c4a4e26088aaaf2a8bf076bc14e3d5 Mon Sep 17 00:00:00 2001 From: jinwei14 Date: Mon, 26 Jan 2026 22:44:28 -0800 Subject: [PATCH 3/4] another one --- src/pq_flash_index.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index a4cfc140c..b428df5ba 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -1481,6 +1481,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t retset.reserve(l_search + 1); } + uint32_t best_medoid = 0; uint32_t cur_list_size = 0; float best_dist = (std::numeric_limits::max)(); @@ -1691,6 +1692,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t full_retset.push_back(Neighbor((unsigned)cached_nhood.first, cur_expanded_dist)); } + uint64_t nnbrs = cached_nhood.second.first; uint32_t *node_nbrs = cached_nhood.second.second; @@ -1773,7 +1775,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t full_retset.push_back(Neighbor(frontier_nhood.first, cur_expanded_dist)); } - uint32_t *node_nbrs = (node_buf + 1); // compute node_nbrs <-> query dist in PQ space cpu_timer.reset(); From 463b88846ee5f03d92fe8324f8109a4a0f3509b2 Mon Sep 17 00:00:00 2001 From: jinwei14 Date: Wed, 28 Jan 2026 21:50:34 -0800 Subject: [PATCH 4/4] fix: improve data type validation in validate_vector_data_type --- src/pq_flash_index.cpp | 47 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index b428df5ba..88f71de5c 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -1980,7 +1980,12 @@ template char *PQFlashIndex::getHeaderB template bool PQFlashIndex::validate_vector_data_type(uint64_t disk_nnodes, bool contains_disk_pq_file) { - size_t vector_length = contains_disk_pq_file ? _data_dim * sizeof(uint8_t) : _data_dim * sizeof(T); + (void)contains_disk_pq_file; // Suppress unused parameter warning + + // Use _disk_bytes_per_point which is already set correctly: + // - For disk PQ: _disk_pq_n_chunks * sizeof(uint8_t) + // - For regular: _data_dim * sizeof(T) + size_t vector_length = _disk_bytes_per_point; if (vector_length + sizeof(uint32_t) >= _max_node_len) { diskann::cerr << "Vector length : " << vector_length << " and neighbor count size : " << sizeof(uint32_t) @@ -1996,12 +2001,16 @@ bool PQFlashIndex::validate_vector_data_type(uint64_t disk_nnodes, bo auto this_thread_data = manager.scratch_space(); IOContext &ctx = this_thread_data->ctx; + // Calculate the number of sectors needed per node + uint64_t num_sectors_per_node = _nnodes_per_sector > 0 ? 1 : DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); + // Allocate sector-aligned buffer (required for direct I/O) char *buf = nullptr; - alloc_aligned((void **)&buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN); + alloc_aligned((void **)&buf, num_sectors_per_node * defaults::SECTOR_LEN, defaults::SECTOR_LEN); - // Read first node (located at sector 1, after header in sector 0) - AlignedRead read_request(defaults::SECTOR_LEN, defaults::SECTOR_LEN, buf); + // Read sector(s) containing node 0 + uint64_t node_sector = get_node_sector(0); + AlignedRead read_request(node_sector * defaults::SECTOR_LEN, num_sectors_per_node * defaults::SECTOR_LEN, buf); std::vector read_requests; read_requests.emplace_back(read_request); reader->read(read_requests, ctx); @@ -2016,11 +2025,18 @@ bool PQFlashIndex::validate_vector_data_type(uint64_t disk_nnodes, bo } #endif - uint32_t max_degree = static_cast((_max_node_len - vector_length - sizeof(uint32_t)) / sizeof(uint32_t)); - char *first_node = buf; - uint32_t *neighbors = reinterpret_cast(first_node + vector_length); + // Use offset_to_node to get correct position within the sector buffer for node 0 + char *first_node = offset_to_node(buf, 0); + + // Use offset_to_node_nhood which correctly uses _disk_bytes_per_point + uint32_t *neighbors = offset_to_node_nhood(first_node); uint32_t neighbor_count = *neighbors; + // Calculate max degree based on the assumed vector length + // max_node_len = vector_length + sizeof(uint32_t) + max_degree * sizeof(uint32_t) + // So: max_degree = (max_node_len - vector_length - sizeof(uint32_t)) / sizeof(uint32_t) + uint32_t max_degree = static_cast((_max_node_len - vector_length - sizeof(uint32_t)) / sizeof(uint32_t)); + if (neighbor_count > max_degree) { aligned_free(buf); @@ -2030,27 +2046,12 @@ bool PQFlashIndex::validate_vector_data_type(uint64_t disk_nnodes, bo return false; } - if (contains_disk_pq_file) - { - size_t real_node_len = _data_dim * sizeof(T) + sizeof(uint32_t) * (max_degree + 1); - if (real_node_len <= defaults::SECTOR_LEN) - { - aligned_free(buf); - diskann::cerr << "Index files contains disk pq file, which means real node length : " << real_node_len - << " should be greater than disk sector length : " << defaults::SECTOR_LEN << std::endl; - diskann::cerr << "Please check if wrong data type with smaller size is " - "specified, like use byte type to load float index!" - << std::endl; - return false; - } - } - for (uint32_t i = 1; i <= neighbor_count; ++i) { if (neighbors[i] >= disk_nnodes) { aligned_free(buf); - diskann::cerr << ":Neighbor[" << i - 1 << "], index : " << neighbors[i] + diskann::cerr << "Neighbor[" << i - 1 << "], index : " << neighbors[i] << ", greater than total node count : " << disk_nnodes << ", load data type is not correct!" << std::endl; return false;