diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 23c90be85..3f6ae5ed2 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -53,3 +53,4 @@ benchmark_test(benchmark_float_range hdf5/benchmark_float_range.cpp) benchmark_test(benchmark_float_range_bitset hdf5/benchmark_float_range_bitset.cpp) benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp) +benchmark_test(gen_fbin_file hdf5/gen_fbin_file.cpp) diff --git a/benchmark/benchmark_base.h b/benchmark/benchmark_base.h index acc7de64b..81eb4c315 100644 --- a/benchmark/benchmark_base.h +++ b/benchmark/benchmark_base.h @@ -209,6 +209,7 @@ class Benchmark_base { protected: double T0_; + std::string metric_type_; int32_t dim_; void* xb_ = nullptr; void* xq_ = nullptr; diff --git a/benchmark/hdf5/benchmark_binary.cpp b/benchmark/hdf5/benchmark_binary.cpp index 8ead1b3f0..a13dea18e 100644 --- a/benchmark/hdf5/benchmark_binary.cpp +++ b/benchmark/hdf5/benchmark_binary.cpp @@ -104,8 +104,6 @@ class Benchmark_binary : public Benchmark_knowhere, public ::testing::Test { parse_ann_test_name(); load_hdf5_data(); - assert(metric_str_ == METRIC_HAM_STR || metric_str_ == METRIC_JAC_STR); - metric_type_ = (metric_str_ == METRIC_HAM_STR) ? knowhere::metric::HAMMING : knowhere::metric::JACCARD; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AVX2); printf("faiss::distance_compute_blas_threshold: %ld\n", knowhere::KnowhereConfig::GetBlasThreshold()); diff --git a/benchmark/hdf5/benchmark_binary_range.cpp b/benchmark/hdf5/benchmark_binary_range.cpp index edaca74b4..fd74a073d 100644 --- a/benchmark/hdf5/benchmark_binary_range.cpp +++ b/benchmark/hdf5/benchmark_binary_range.cpp @@ -112,8 +112,6 @@ class Benchmark_binary_range : public Benchmark_knowhere, public ::testing::Test load_hdf5_data_range(); #endif - assert(metric_str_ == METRIC_HAM_STR || metric_str_ == METRIC_JAC_STR); - metric_type_ = (metric_str_ == METRIC_HAM_STR) ? 
knowhere::metric::HAMMING : knowhere::metric::JACCARD; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; cfg_[knowhere::meta::RADIUS] = *gt_radius_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AVX2); diff --git a/benchmark/hdf5/benchmark_float.cpp b/benchmark/hdf5/benchmark_float.cpp index 2f54bd6be..5f42d7e24 100644 --- a/benchmark/hdf5/benchmark_float.cpp +++ b/benchmark/hdf5/benchmark_float.cpp @@ -151,8 +151,6 @@ class Benchmark_float : public Benchmark_knowhere, public ::testing::Test { parse_ann_test_name(); load_hdf5_data(); - assert(metric_str_ == METRIC_IP_STR || metric_str_ == METRIC_L2_STR); - metric_type_ = (metric_str_ == METRIC_IP_STR) ? "IP" : "L2"; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AVX2); printf("faiss::distance_compute_blas_threshold: %ld\n", knowhere::KnowhereConfig::GetBlasThreshold()); diff --git a/benchmark/hdf5/benchmark_float_bitset.cpp b/benchmark/hdf5/benchmark_float_bitset.cpp index 2b7ffb8b2..beb34eff3 100644 --- a/benchmark/hdf5/benchmark_float_bitset.cpp +++ b/benchmark/hdf5/benchmark_float_bitset.cpp @@ -139,8 +139,6 @@ class Benchmark_float_bitset : public Benchmark_knowhere, public ::testing::Test parse_ann_test_name(); load_hdf5_data(); - assert(metric_str_ == METRIC_IP_STR || metric_str_ == METRIC_L2_STR); - metric_type_ = (metric_str_ == METRIC_IP_STR) ? 
"IP" : "L2"; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AVX2); printf("faiss::distance_compute_blas_threshold: %ld\n", knowhere::KnowhereConfig::GetBlasThreshold()); diff --git a/benchmark/hdf5/benchmark_float_qps.cpp b/benchmark/hdf5/benchmark_float_qps.cpp index 2022f4468..ba0c7f203 100644 --- a/benchmark/hdf5/benchmark_float_qps.cpp +++ b/benchmark/hdf5/benchmark_float_qps.cpp @@ -214,8 +214,6 @@ class Benchmark_float_qps : public Benchmark_knowhere, public ::testing::Test { parse_ann_test_name(); load_hdf5_data(); - assert(metric_str_ == METRIC_IP_STR || metric_str_ == METRIC_L2_STR); - metric_type_ = (metric_str_ == METRIC_IP_STR) ? knowhere::metric::IP : knowhere::metric::L2; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AUTO); #ifdef KNOWHERE_WITH_GPU diff --git a/benchmark/hdf5/benchmark_float_range.cpp b/benchmark/hdf5/benchmark_float_range.cpp index 591e742f0..1ad9b8afe 100644 --- a/benchmark/hdf5/benchmark_float_range.cpp +++ b/benchmark/hdf5/benchmark_float_range.cpp @@ -112,8 +112,6 @@ class Benchmark_float_range : public Benchmark_knowhere, public ::testing::Test load_hdf5_data_range(); #endif - assert(metric_str_ == METRIC_IP_STR || metric_str_ == METRIC_L2_STR); - metric_type_ = (metric_str_ == METRIC_IP_STR) ? 
knowhere::metric::IP : knowhere::metric::L2; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; cfg_[knowhere::meta::RADIUS] = *gt_radius_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AVX2); diff --git a/benchmark/hdf5/benchmark_float_range_bitset.cpp b/benchmark/hdf5/benchmark_float_range_bitset.cpp index fd9203af1..f5c406ce7 100644 --- a/benchmark/hdf5/benchmark_float_range_bitset.cpp +++ b/benchmark/hdf5/benchmark_float_range_bitset.cpp @@ -147,8 +147,6 @@ class Benchmark_float_range_bitset : public Benchmark_knowhere, public ::testing parse_ann_test_name_with_range(); load_hdf5_data_range(); - assert(metric_str_ == METRIC_IP_STR || metric_str_ == METRIC_L2_STR); - metric_type_ = (metric_str_ == METRIC_IP_STR) ? "IP" : "L2"; cfg_[knowhere::meta::METRIC_TYPE] = metric_type_; cfg_[knowhere::meta::RADIUS] = *gt_radius_; knowhere::KnowhereConfig::SetSimdType(knowhere::KnowhereConfig::SimdType::AVX2); diff --git a/benchmark/hdf5/benchmark_hdf5.h b/benchmark/hdf5/benchmark_hdf5.h index ddc8ffe8b..8a7f707cd 100644 --- a/benchmark/hdf5/benchmark_hdf5.h +++ b/benchmark/hdf5/benchmark_hdf5.h @@ -34,6 +34,7 @@ static const char* HDF5_DATASET_RADIUS = "radius"; static const char* METRIC_IP_STR = "angular"; static const char* METRIC_L2_STR = "euclidean"; +static const char* METRIC_COS_STR = "cosine"; static const char* METRIC_HAM_STR = "hamming"; static const char* METRIC_JAC_STR = "jaccard"; @@ -61,43 +62,58 @@ class Benchmark_hdf5 : public Benchmark_base { int32_t parse_name_and_dim() { size_t pos1, pos2; - assert(!ann_test_name_.empty() || !"ann_test_name not set"); + pos1 = ann_test_name_.find_first_of('-', 0); assert(pos1 != std::string::npos); + dataset_name_ = ann_test_name_.substr(0, pos1); pos2 = ann_test_name_.find_first_of('-', pos1 + 1); assert(pos2 != std::string::npos); - dim_ = std::stoi(ann_test_name_.substr(pos1 + 1, pos2 - pos1 - 1)); return (pos2 + 1); } + void + set_metric_type(const std::string& str) { + if (str == 
METRIC_L2_STR || str == "l2") { + metric_type_ = "L2"; + } else if (str == METRIC_IP_STR || str == "ip") { + metric_type_ = "IP"; + } else if (str == METRIC_COS_STR) { + metric_type_ = "COSINE"; + } else if (str == METRIC_HAM_STR) { + metric_type_ = "HAMMING"; + } else if (str == METRIC_JAC_STR) { + metric_type_ = "JACCARD"; + } else { + assert(false); + } + } void parse_ann_test_name() { auto pos = parse_name_and_dim(); metric_str_ = ann_test_name_.substr(pos); + set_metric_type(metric_str_); } void parse_ann_test_name_with_range() { auto pos1 = parse_name_and_dim(); - auto pos2 = ann_test_name_.find_first_of('-', pos1); assert(pos2 != std::string::npos); metric_str_ = ann_test_name_.substr(pos1, pos2 - pos1); - + set_metric_type(metric_str_); assert("range" == ann_test_name_.substr(pos2 + 1)); } void parse_ann_test_name_with_range_multi() { auto pos1 = parse_name_and_dim(); - auto pos2 = ann_test_name_.find_first_of('-', pos1); assert(pos2 != std::string::npos); metric_str_ = ann_test_name_.substr(pos1, pos2 - pos1); - + set_metric_type(metric_str_); assert("range-multi" == ann_test_name_.substr(pos2 + 1)); } @@ -119,10 +135,10 @@ class Benchmark_hdf5 : public Benchmark_base { assert(dim * 32 == dim_ || !"train dataset has incorrect dimension"); } - if (metric_str_ == METRIC_IP_STR) { - printf("[%.3f s] Normalizing train dataset \n", get_time_diff()); - normalize((float*)xb_, nb_, dim_); - } + // if (metric_str_ == METRIC_IP_STR) { + // printf("[%.3f s] Normalizing train dataset \n", get_time_diff()); + // normalize((float*)xb_, nb_, dim_); + // } /* load test data */ printf("[%.3f s] Loading test data\n", get_time_diff()); @@ -134,10 +150,10 @@ class Benchmark_hdf5 : public Benchmark_base { assert(dim * 32 == dim_ || !"test dataset has incorrect dimension"); } - if (metric_str_ == METRIC_IP_STR) { - printf("[%.3f s] Normalizing test dataset \n", get_time_diff()); - normalize((float*)xq_, nq_, dim_); - } + // if (metric_str_ == METRIC_IP_STR) { + // printf("[%.3f 
s] Normalizing test dataset \n", get_time_diff()); + // normalize((float*)xq_, nq_, dim_); + // } /* load ground-truth data */ int32_t gt_nq; @@ -167,10 +183,10 @@ class Benchmark_hdf5 : public Benchmark_base { assert(dim * 32 == dim_ || !"train dataset has incorrect dimension"); } - if (metric_str_ == METRIC_IP_STR) { - printf("[%.3f s] Normalizing train dataset \n", get_time_diff()); - normalize((float*)xb_, nb_, dim_); - } + // if (metric_str_ == METRIC_IP_STR) { + // printf("[%.3f s] Normalizing train dataset \n", get_time_diff()); + // normalize((float*)xb_, nb_, dim_); + // } /* load test data */ printf("[%.3f s] Loading test data\n", get_time_diff()); @@ -182,10 +198,10 @@ class Benchmark_hdf5 : public Benchmark_base { assert(dim * 32 == dim_ || !"test dataset has incorrect dimension"); } - if (metric_str_ == METRIC_IP_STR) { - printf("[%.3f s] Normalizing test dataset \n", get_time_diff()); - normalize((float*)xq_, nq_, dim_); - } + // if (metric_str_ == METRIC_IP_STR) { + // printf("[%.3f s] Normalizing test dataset \n", get_time_diff()); + // normalize((float*)xq_, nq_, dim_); + // } /* load ground-truth data */ int32_t cols, rows; @@ -221,10 +237,10 @@ class Benchmark_hdf5 : public Benchmark_base { assert(dim * 32 == dim_ || !"train dataset has incorrect dimension"); } - if (metric_str_ == METRIC_IP_STR) { - printf("[%.3f s] Normalizing train dataset \n", get_time_diff()); - normalize((float*)xb_, nb_, dim_); - } + // if (metric_str_ == METRIC_IP_STR) { + // printf("[%.3f s] Normalizing train dataset \n", get_time_diff()); + // normalize((float*)xb_, nb_, dim_); + // } /* load test data */ printf("[%.3f s] Loading test data\n", get_time_diff()); @@ -236,10 +252,10 @@ class Benchmark_hdf5 : public Benchmark_base { assert(dim * 32 == dim_ || !"test dataset has incorrect dimension"); } - if (metric_str_ == METRIC_IP_STR) { - printf("[%.3f s] Normalizing test dataset \n", get_time_diff()); - normalize((float*)xq_, nq_, dim_); - } + // if (metric_str_ == 
METRIC_IP_STR) { + // printf("[%.3f s] Normalizing test dataset \n", get_time_diff()); + // normalize((float*)xq_, nq_, dim_); + // } /* load ground-truth data */ int32_t cols, rows; @@ -421,5 +437,6 @@ class Benchmark_hdf5 : public Benchmark_base { protected: std::string ann_test_name_ = ""; + std::string dataset_name_; std::string metric_str_; }; diff --git a/benchmark/hdf5/benchmark_knowhere.h b/benchmark/hdf5/benchmark_knowhere.h index 0505d87e2..1bd1ae1db 100644 --- a/benchmark/hdf5/benchmark_knowhere.h +++ b/benchmark/hdf5/benchmark_knowhere.h @@ -138,8 +138,6 @@ class Benchmark_knowhere : public Benchmark_hdf5 { } protected: - std::string metric_type_; - std::string index_type_; knowhere::Json cfg_; knowhere::Index index_; diff --git a/benchmark/hdf5/gen_fbin_file.cpp b/benchmark/hdf5/gen_fbin_file.cpp new file mode 100644 index 000000000..58227e08f --- /dev/null +++ b/benchmark/hdf5/gen_fbin_file.cpp @@ -0,0 +1,175 @@ +// Copyright (C) 2019-2023 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +#include <gtest/gtest.h> + +#include <random> +#include <string> +#include <vector> + +#include "benchmark/utils.h" +#include "benchmark_hdf5.h" +#include "knowhere/comp/brute_force.h" +#include "knowhere/comp/index_param.h" +#include "knowhere/dataset.h" + +knowhere::DataSetPtr +GenDataSet(int rows, int dim) { + std::mt19937 rng(42); + std::uniform_real_distribution<> distrib(-1.0, 1.0); + float* ts = new float[rows * dim]; + for (int i = 0; i < rows * dim; ++i) { + ts[i] = (float)distrib(rng); + } + auto ds = knowhere::GenDataSet(rows, dim, ts); + ds->SetIsOwner(true); + return ds; +} + +class Create_FBIN : public Benchmark_hdf5, public ::testing::Test { + protected: + void + SetUp() override { + } + + void + TearDown() override { + } + + void + fbin_write(const std::string& filename, const uint32_t rows, const uint32_t dim, const void* data) { + FileIOWriter writer(filename); + writer((void*)&rows, sizeof(rows)); + writer((void*)&dim, sizeof(dim)); + writer((void*)data, rows * dim * sizeof(float)); + } + + void + fbin_read(const std::string& filename, uint32_t& rows, uint32_t& dim, void* data) { + FileIOReader reader(filename); + reader((void*)&rows, sizeof(rows)); + reader((void*)&dim, sizeof(dim)); + reader((void*)data, rows * dim * sizeof(float)); + } + + void + fbin_result_write(const std::string& filename, const uint32_t rows, const uint32_t topk, const uint32_t* ids, + const float* dist) { + FileIOWriter writer(filename); + writer((void*)&rows, sizeof(rows)); + writer((void*)&topk, sizeof(topk)); + writer((void*)ids, rows * topk * sizeof(uint32_t)); + writer((void*)dist, rows * topk * sizeof(float)); + } + + void + fbin_range_result_write(const std::string& filename, const uint32_t rows, const float radius, const uint32_t* lims, + const uint32_t* ids, const float* dist) { + FileIOWriter writer(filename); + writer((void*)&rows, sizeof(rows)); + writer((void*)&radius, sizeof(radius)); + writer((void*)lims, (rows + 1) * sizeof(uint32_t)); + writer((void*)ids, lims[rows] * sizeof(uint32_t)); + 
writer((void*)dist, lims[rows] * sizeof(float)); + } + + void + create_fbin_files(const int64_t nb, const int64_t nq, const int64_t dim, const int64_t topk, + const std::vector<knowhere::MetricType>& metric_types) { + knowhere::DataSetPtr xb_ds, xq_ds; + xb_ds = GenDataSet(nb, dim); + xq_ds = GenDataSet(nq, dim); + + std::string prefix = "rand-" + std::to_string(dim) + "-"; + std::string postfix = ".fbin"; + std::string filename; + + filename = prefix + "base" + postfix; + fbin_write(filename, nb, dim, xb_ds->GetTensor()); + + filename = prefix + "query" + postfix; + fbin_write(filename, nq, dim, xq_ds->GetTensor()); + + for (knowhere::MetricType metric_type : metric_types) { + std::string metric_str = metric_type; + transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower); + + knowhere::Json json; + json[knowhere::meta::DIM] = dim; + json[knowhere::meta::METRIC_TYPE] = metric_type; + json[knowhere::meta::TOPK] = topk; + + auto result = knowhere::BruteForce::Search(xb_ds, xq_ds, json, nullptr); + assert(result.has_value()); + + // convert golden_ids to int32 + auto elem_cnt = nq * topk; + std::vector<uint32_t> gt_ids_int(elem_cnt); + for (int32_t i = 0; i < elem_cnt; i++) { + gt_ids_int[i] = result.value()->GetIds()[i]; + } + + filename = prefix + metric_str + "-gt" + postfix; + fbin_result_write(filename, nq, topk, gt_ids_int.data(), result.value()->GetDistance()); + } + } +}; + +TEST_F(Create_FBIN, CREATE_FLOAT) { + int64_t nb = 10000; + int64_t nq = 100; + int64_t dim = 128; + int64_t topk = 100; + + create_fbin_files(nb, nq, dim, topk, {knowhere::metric::L2, knowhere::metric::IP, knowhere::metric::COSINE}); +} + +TEST_F(Create_FBIN, HDF5_TO_FBIN) { + set_ann_test_name("rand-128-l2"); + parse_ann_test_name(); + load_hdf5_data(); + + std::string prefix = dataset_name_ + "-" + std::to_string(dim_) + "-"; + std::string postfix = ".fbin"; + std::string filename; + + filename = prefix + "base" + postfix; + fbin_write(filename, nb_, dim_, xb_); + + filename = prefix + "query" + 
postfix; + fbin_write(filename, nq_, dim_, xq_); + + filename = prefix + metric_str_ + "-gt" + postfix; + fbin_result_write(filename, nq_, gt_k_, (uint32_t*)gt_ids_, gt_dist_); + + free_all(); +} + +TEST_F(Create_FBIN, HDF5_RANGE_TO_FBIN) { + set_ann_test_name("rand-128-l2-range"); + parse_ann_test_name_with_range(); + load_hdf5_data_range(); + + std::string prefix = dataset_name_ + "-" + std::to_string(dim_) + "-range-"; + std::string postfix = ".fbin"; + std::string filename; + + filename = prefix + "base" + postfix; + fbin_write(filename, nb_, dim_, xb_); + + filename = prefix + "query" + postfix; + fbin_write(filename, nq_, dim_, xq_); + + filename = prefix + metric_str_ + "-gt" + postfix; + fbin_range_result_write(filename, nq_, *gt_radius_, (uint32_t*)gt_lims_, (uint32_t*)gt_ids_, gt_dist_); + + free_all(); +} diff --git a/benchmark/hdf5/gen_hdf5_file.cpp b/benchmark/hdf5/gen_hdf5_file.cpp index d8ddfaaef..b2a478b45 100644 --- a/benchmark/hdf5/gen_hdf5_file.cpp +++ b/benchmark/hdf5/gen_hdf5_file.cpp @@ -18,7 +18,6 @@ #include "benchmark_hdf5.h" #include "knowhere/comp/brute_force.h" #include "knowhere/comp/index_param.h" -#include "knowhere/comp/knowhere_config.h" #include "knowhere/dataset.h" knowhere::DataSetPtr