From 2967ddf65b4fdc781020c1d4abfe1d2f2d1214e4 Mon Sep 17 00:00:00 2001
From: Yudong Cai
Date: Wed, 25 Oct 2023 11:47:07 +0800
Subject: [PATCH] Add gen_hdf5_file.cpp to support creating random datasets

gen_hdf5_file generates random float and binary datasets, computes
brute-force top-k and range-search ground truth for them, and writes the
result to "rand-<dim>-<metric>[-range].hdf5" files.

Also:
- share a single Benchmark_hdf5::write_hdf5_dataset() member instead of a
  lambda copied into every writer
- remove the hdf5_write_range() variant that takes a per-query radius array
- store binary vectors with dim / 32 int32 columns
- add GenRandomVector() to benchmark/utils.h

Signed-off-by: Yudong Cai
---
 benchmark/CMakeLists.txt         |   2 +
 benchmark/hdf5/benchmark_hdf5.h  | 109 +++--------------
 benchmark/hdf5/gen_hdf5_file.cpp | 200 +++++++++++++++++++++++++++++++
 benchmark/utils.h                |  15 +++
 4 files changed, 239 insertions(+), 87 deletions(-)
 create mode 100644 benchmark/hdf5/gen_hdf5_file.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 86d908808..e2a5203b0 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -47,3 +47,5 @@ benchmark_test(benchmark_float_bitset hdf5/benchmark_float_bitset.cpp)
 benchmark_test(benchmark_float_qps hdf5/benchmark_float_qps.cpp)
 benchmark_test(benchmark_float_range hdf5/benchmark_float_range.cpp)
 benchmark_test(benchmark_float_range_bitset hdf5/benchmark_float_range_bitset.cpp)
+
+benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp)
diff --git a/benchmark/hdf5/benchmark_hdf5.h b/benchmark/hdf5/benchmark_hdf5.h
index 92602e578..ddc8ffe8b 100644
--- a/benchmark/hdf5/benchmark_hdf5.h
+++ b/benchmark/hdf5/benchmark_hdf5.h
@@ -330,6 +330,24 @@ class Benchmark_hdf5 : public Benchmark_base {
         return data_out;
     }
 
+    // Write a 2-D [rows, cols] dataset named `dataset_name` into the open HDF5 `file`.
+    // `type_id` is used both as the stored element type and as the in-memory type of `data`.
+    void
+    write_hdf5_dataset(hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
+                       const void* data) {
+        hsize_t dims[2];
+        dims[0] = rows;
+        dims[1] = cols;
+        auto dataspace = H5Screate_simple(2, dims, nullptr);
+        auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+        auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
+        // H5Dwrite reports success with any non-negative value and failure with a negative one.
+        assert(err >= 0);
+        (void)err;  // keeps NDEBUG builds free of an unused-variable warning
+        H5Dclose(dataset);
+        H5Sclose(dataspace);
+    }
+
     // For binary vector, dim should be divided by 32, since we use int32 to store binary vector data */
     template <bool is_binary>
     void
@@ -338,31 +356,18 @@ class Benchmark_hdf5 : public Benchmark_base {
         /* Open the file and the dataset. */
         hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
 
-        auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
-                                     const void* data) {
-            hsize_t dims[2];
-            dims[0] = rows;
-            dims[1] = cols;
-            auto dataspace = H5Screate_simple(2, dims, NULL);
-            auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-            auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
-            assert(err == 0);
-            H5Dclose(dataset);
-            H5Sclose(dataspace);
-        };
-
         /* write train dataset */
         if (!is_binary) {
             write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
         } else {
-            write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
+            write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
         }
 
         /* write test dataset */
         if (!is_binary) {
             write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
         } else {
-            write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
+            write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
         }
 
         /* write ground-truth labels dataset */
@@ -388,31 +393,18 @@ class Benchmark_hdf5 : public Benchmark_base {
         /* Open the file and the dataset. */
         hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
 
-        auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
-                                     const void* data) {
-            hsize_t dims[2];
-            dims[0] = rows;
-            dims[1] = cols;
-            auto dataspace = H5Screate_simple(2, dims, NULL);
-            auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-            auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
-            assert(err == 0);
-            H5Dclose(dataset);
-            H5Sclose(dataspace);
-        };
-
         /* write train dataset */
         if (!is_binary) {
             write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
         } else {
-            write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
+            write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
         }
 
         /* write test dataset */
         if (!is_binary) {
             write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
         } else {
-            write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
+            write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
         }
 
         /* write ground-truth radius */
@@ -431,63 +423,6 @@ class Benchmark_hdf5 : public Benchmark_base {
         H5Fclose(file);
     }
 
-    // For binary vector, dim should be divided by 32, since we use int32 to store binary vector data */
-    // Write HDF5 file with following dataset:
-    //   HDF5_DATASET_RADIUS    - H5T_NATIVE_FLOAT, [1, nq]
-    //   HDF5_DATASET_LIMS      - H5T_NATIVE_INT32, [1, nq+1]
-    //   HDF5_DATASET_NEIGHBORS - H5T_NATIVE_INT32, [1, lims[nq]]
-    //   HDF5_DATASET_DISTANCES - H5T_NATIVE_FLOAT, [1, lims[nq]]
-    template <bool is_binary>
-    void
-    hdf5_write_range(const char* file_name, const int32_t dim, const void* xb, const int32_t nb, const void* xq,
-                     const int32_t nq, const float* g_radius, const void* g_lims, const void* g_ids,
-                     const void* g_dist) {
-        /* Open the file and the dataset. */
-        hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
-
-        auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
-                                     const void* data) {
-            hsize_t dims[2];
-            dims[0] = rows;
-            dims[1] = cols;
-            auto dataspace = H5Screate_simple(2, dims, NULL);
-            auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-            auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
-            assert(err == 0);
-            H5Dclose(dataset);
-            H5Sclose(dataspace);
-        };
-
-        /* write train dataset */
-        if (!is_binary) {
-            write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
-        } else {
-            write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
-        }
-
-        /* write test dataset */
-        if (!is_binary) {
-            write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
-        } else {
-            write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
-        }
-
-        /* write ground-truth radius */
-        write_hdf5_dataset(file, HDF5_DATASET_RADIUS, H5T_NATIVE_FLOAT, 1, nq, g_radius);
-
-        /* write ground-truth lims dataset */
-        write_hdf5_dataset(file, HDF5_DATASET_LIMS, H5T_NATIVE_INT32, 1, nq + 1, g_lims);
-
-        /* write ground-truth labels dataset */
-        write_hdf5_dataset(file, HDF5_DATASET_NEIGHBORS, H5T_NATIVE_INT32, 1, ((int32_t*)g_lims)[nq], g_ids);
-
-        /* write ground-truth distance dataset */
-        write_hdf5_dataset(file, HDF5_DATASET_DISTANCES, H5T_NATIVE_FLOAT, 1, ((int32_t*)g_lims)[nq], g_dist);
-
-        /* Close/release resources. */
-        H5Fclose(file);
-    }
-
  protected:
     std::string ann_test_name_ = "";
     std::string metric_str_;
diff --git a/benchmark/hdf5/gen_hdf5_file.cpp b/benchmark/hdf5/gen_hdf5_file.cpp
new file mode 100644
index 000000000..b6a2dbd40
--- /dev/null
+++ b/benchmark/hdf5/gen_hdf5_file.cpp
@@ -0,0 +1,200 @@
+// Copyright (C) 2019-2023 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "benchmark_hdf5.h"
+#include "knowhere/comp/brute_force.h"
+#include "knowhere/comp/index_param.h"
+#include "knowhere/comp/knowhere_config.h"
+#include "knowhere/dataset.h"
+
+// Generate `rows` float vectors of dimension `dim`, each component drawn uniformly from [-1.0, 1.0).
+// The generator is seeded with `seed`, so identical arguments always produce identical data.
+// SetIsOwner(true) is called on the result; presumably the dataset then frees the buffer (confirm in dataset.h).
+knowhere::DataSetPtr
+GenDataSet(int rows, int dim, uint32_t seed = 42) {
+    std::mt19937 rng(seed);
+    std::uniform_real_distribution<> distrib(-1.0, 1.0);
+    // Count elements in size_t: `rows * dim` evaluated in int overflows for large datasets.
+    const size_t elem_cnt = static_cast<size_t>(rows) * static_cast<size_t>(dim);
+    float* ts = new float[elem_cnt];
+    for (size_t i = 0; i < elem_cnt; ++i) {
+        ts[i] = static_cast<float>(distrib(rng));
+    }
+    auto ds = knowhere::GenDataSet(rows, dim, ts);
+    ds->SetIsOwner(true);
+    return ds;
+}
+
+// Generate `rows` binary vectors of `dim` bits, stored as dim / 8 bytes per row with every byte drawn
+// uniformly from [0, 255]. Seeding and the SetIsOwner(true) call mirror the float generator.
+knowhere::DataSetPtr
+GenBinDataSet(int rows, int dim, uint32_t seed = 42) {
+    std::mt19937 rng(seed);
+    std::uniform_int_distribution<> distrib(0, 255);
+    const size_t byte_cnt = static_cast<size_t>(rows) * static_cast<size_t>(dim / 8);
+    uint8_t* ts = new uint8_t[byte_cnt];
+    for (size_t i = 0; i < byte_cnt; ++i) {
+        ts[i] = static_cast<uint8_t>(distrib(rng));
+    }
+    auto ds = knowhere::GenDataSet(rows, dim, ts);
+    ds->SetIsOwner(true);
+    return ds;
+}
+
+// Test fixture whose cases generate random datasets, compute brute-force ground truth and write
+// the result to HDF5 files named "rand-<dim>-<metric>[-range].hdf5".
+class Create_HDF5 : public Benchmark_hdf5, public ::testing::Test {
+ protected:
+    void
+    SetUp() override {
+    }
+
+    void
+    TearDown() override {
+    }
+
+    // Generate nb base and nq query vectors, run BruteForce::Search for the top-k ground truth and
+    // write data, ids and distances with hdf5_write(). `dim` is given in bits when is_binary is true.
+    // NOTE(review): base and query sets use the same default seed, so the queries repeat the first
+    // nq base vectors -- pass a different seed to the query generator if that is not intended.
+    template <bool is_binary>
+    void
+    create_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq, const int64_t dim,
+                     const int64_t topk) {
+        std::string metric_str = metric_type;
+        std::transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower);
+        std::string fn = "rand-" + std::to_string(dim) + "-" + metric_str + ".hdf5";
+
+        knowhere::Json json;
+        json[knowhere::meta::DIM] = dim;
+        json[knowhere::meta::METRIC_TYPE] = metric_type;
+        json[knowhere::meta::TOPK] = topk;
+
+        knowhere::DataSetPtr xb_ds, xq_ds;
+        if (is_binary) {
+            xb_ds = GenBinDataSet(nb, dim);
+            xq_ds = GenBinDataSet(nq, dim);
+        } else {
+            xb_ds = GenDataSet(nb, dim);
+            xq_ds = GenDataSet(nq, dim);
+        }
+
+        auto result = knowhere::BruteForce::Search(xb_ds, xq_ds, json, nullptr);
+        // ASSERT_TRUE is still evaluated when NDEBUG compiles assert() out.
+        ASSERT_TRUE(result.has_value());
+
+        // convert golden_ids to int32
+        const int64_t elem_cnt = nq * topk;
+        const auto* ids = result.value()->GetIds();
+        std::vector<int32_t> gt_ids_int(elem_cnt);
+        for (int64_t i = 0; i < elem_cnt; i++) {
+            gt_ids_int[i] = static_cast<int32_t>(ids[i]);
+        }
+
+        hdf5_write<is_binary>(fn.c_str(), dim, topk, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq,
+                              gt_ids_int.data(), result.value()->GetDistance());
+    }
+
+    // Same as create_hdf5_file(), but the ground truth comes from BruteForce::RangeSearch with `radius`
+    // and is written with hdf5_write_range() to "rand-<dim>-<metric>-range.hdf5".
+    template <bool is_binary>
+    void
+    create_range_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq,
+                           const int64_t dim, const float radius) {
+        std::string metric_str = metric_type;
+        std::transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower);
+        std::string fn = "rand-" + std::to_string(dim) + "-" + metric_str + "-range.hdf5";
+
+        knowhere::Json json;
+        json[knowhere::meta::DIM] = dim;
+        json[knowhere::meta::METRIC_TYPE] = metric_type;
+        json[knowhere::meta::RADIUS] = radius;
+
+        knowhere::DataSetPtr xb_ds, xq_ds;
+        if (is_binary) {
+            xb_ds = GenBinDataSet(nb, dim);
+            xq_ds = GenBinDataSet(nq, dim);
+        } else {
+            xb_ds = GenDataSet(nb, dim);
+            xq_ds = GenDataSet(nq, dim);
+        }
+
+        auto result = knowhere::BruteForce::RangeSearch(xb_ds, xq_ds, json, nullptr);
+        ASSERT_TRUE(result.has_value());
+
+        // convert golden_lims to int32
+        const auto* lims = result.value()->GetLims();
+        std::vector<int32_t> gt_lims_int(nq + 1);
+        for (int64_t i = 0; i <= nq; i++) {
+            gt_lims_int[i] = static_cast<int32_t>(lims[i]);
+        }
+
+        // convert golden_ids to int32
+        const int64_t elem_cnt = static_cast<int64_t>(lims[nq]);
+        const auto* ids = result.value()->GetIds();
+        std::vector<int32_t> gt_ids_int(elem_cnt);
+        for (int64_t i = 0; i < elem_cnt; i++) {
+            gt_ids_int[i] = static_cast<int32_t>(ids[i]);
+        }
+
+        hdf5_write_range<is_binary>(fn.c_str(), dim, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, radius,
+                                    gt_lims_int.data(), gt_ids_int.data(), result.value()->GetDistance());
+    }
+};
+
+TEST_F(Create_HDF5, CREATE_FLOAT) {
+    int64_t nb = 10000;
+    int64_t nq = 100;
+    int64_t dim = 128;
+    int64_t topk = 100;
+
+    create_hdf5_file<false>(knowhere::metric::L2, nb, nq, dim, topk);
+    create_hdf5_file<false>(knowhere::metric::IP, nb, nq, dim, topk);
+    create_hdf5_file<false>(knowhere::metric::COSINE, nb, nq, dim, topk);
+}
+
+TEST_F(Create_HDF5, CREATE_FLOAT_RANGE) {
+    int64_t nb = 10000;
+    int64_t nq = 100;
+    int64_t dim = 128;
+
+    create_range_hdf5_file<false>(knowhere::metric::L2, nb, nq, dim, 65.0);
+    create_range_hdf5_file<false>(knowhere::metric::IP, nb, nq, dim, 8.7);
+    create_range_hdf5_file<false>(knowhere::metric::COSINE, nb, nq, dim, 0.2);
+}
+
+TEST_F(Create_HDF5, CREATE_BINARY) {
+    int64_t nb = 10000;
+    int64_t nq = 100;
+    int64_t dim = 1024;
+    int64_t topk = 100;
+
+    create_hdf5_file<true>(knowhere::metric::HAMMING, nb, nq, dim, topk);
+    create_hdf5_file<true>(knowhere::metric::JACCARD, nb, nq, dim, topk);
+}
+
+TEST_F(Create_HDF5, CREATE_BINARY_RANGE) {
+    int64_t nb = 10000;
+    int64_t nq = 100;
+    int64_t dim = 1024;
+
+    create_range_hdf5_file<true>(knowhere::metric::HAMMING, nb, nq, dim, 476);
+    create_range_hdf5_file<true>(knowhere::metric::JACCARD, nb, nq, dim, 0.63);
+}
diff --git a/benchmark/utils.h b/benchmark/utils.h
index bb6cb066a..dd7f33bf4 100644
--- a/benchmark/utils.h
+++ b/benchmark/utils.h
@@ -82,3 +82,18 @@ GenRandomBitset(size_t n, size_t t) {
     }
     return data;
 }
+
+// Generate rows * dim floats drawn uniformly from [-1.0, 1.0) using a fixed seed (42).
+// The buffer is allocated with new[]; the caller is responsible for releasing it with delete[].
+inline float*
+GenRandomVector(size_t rows, size_t dim) {
+    std::mt19937 rng(42);
+    std::uniform_real_distribution<> distrib(-1.0, 1.0);
+    const size_t elem_cnt = rows * dim;
+    float* data = new float[elem_cnt];
+    // size_t index: an int counter compared against size_t overflows once rows * dim exceeds INT_MAX.
+    for (size_t i = 0; i < elem_cnt; ++i) {
+        data[i] = static_cast<float>(distrib(rng));
+    }
+    return data;
+}