Skip to content

Commit

Permalink
Add gen_hdf5_file.cpp to support creating random datasets
Browse files Browse the repository at this point in the history
Signed-off-by: Yudong Cai <[email protected]>
  • Loading branch information
cydrain committed Oct 30, 2023
1 parent 69613c0 commit 2967ddf
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 87 deletions.
2 changes: 2 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ benchmark_test(benchmark_float_bitset hdf5/benchmark_float_bitset.cpp)
benchmark_test(benchmark_float_qps hdf5/benchmark_float_qps.cpp)
benchmark_test(benchmark_float_range hdf5/benchmark_float_range.cpp)
benchmark_test(benchmark_float_range_bitset hdf5/benchmark_float_range_bitset.cpp)

benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp)
105 changes: 18 additions & 87 deletions benchmark/hdf5/benchmark_hdf5.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,20 @@ class Benchmark_hdf5 : public Benchmark_base {
return data_out;
}

// Create a two-dimensional dataset named `dataset_name` inside the open HDF5 file `file`
// and fill it from `data`.
//
//   file          - handle of an HDF5 file opened for writing
//   dataset_name  - name of the dataset to create
//   type_id       - HDF5 element type, used both for the stored dataset and for the memory buffer
//   rows, cols    - extent of the dataset; must be non-negative
//   data          - source buffer, read as rows * cols elements of `type_id`
//
// Failures are reported through assert(), matching the rest of this header.
void
write_hdf5_dataset(hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
                   const void* data) {
    // hsize_t is unsigned, so a negative extent would silently wrap to a huge value.
    assert(rows >= 0 && cols >= 0);
    const hsize_t dims[2] = {static_cast<hsize_t>(rows), static_cast<hsize_t>(cols)};

    const hid_t dataspace = H5Screate_simple(2, dims, nullptr);
    assert(dataspace >= 0);

    const hid_t dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    assert(dataset >= 0);

    // HDF5 reports failure with a negative value; every non-negative value means success.
    const herr_t err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
    assert(err >= 0);
    (void)err;  // silences the unused-variable warning when assert() is compiled out

    H5Dclose(dataset);
    H5Sclose(dataspace);
}

// For binary vectors, dim should be divided by 32, since binary vector data is stored as int32.
template <bool is_binary>
void
Expand All @@ -338,31 +352,18 @@ class Benchmark_hdf5 : public Benchmark_base {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
}

/* write ground-truth labels dataset */
Expand All @@ -388,31 +389,18 @@ class Benchmark_hdf5 : public Benchmark_base {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
}

/* write ground-truth radius */
Expand All @@ -431,63 +419,6 @@ class Benchmark_hdf5 : public Benchmark_base {
H5Fclose(file);
}

// For binary vector, dim should be divided by 32, since we use int32 to store binary vector data */
// Write HDF5 file with following dataset:
// HDF5_DATASET_RADIUS - H5T_NATIVE_FLOAT, [1, nq]
// HDF5_DATASET_LIMS - H5T_NATIVE_INT32, [1, nq+1]
// HDF5_DATASET_NEIGHBORS - H5T_NATIVE_INT32, [1, lims[nq]]
// HDF5_DATASET_DISTANCES - H5T_NATIVE_FLOAT, [1, lims[nq]]
template <bool is_binary>
void
hdf5_write_range(const char* file_name, const int32_t dim, const void* xb, const int32_t nb, const void* xq,
const int32_t nq, const float* g_radius, const void* g_lims, const void* g_ids,
const void* g_dist) {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
}

/* write ground-truth radius */
write_hdf5_dataset(file, HDF5_DATASET_RADIUS, H5T_NATIVE_FLOAT, 1, nq, g_radius);

/* write ground-truth lims dataset */
write_hdf5_dataset(file, HDF5_DATASET_LIMS, H5T_NATIVE_INT32, 1, nq + 1, g_lims);

/* write ground-truth labels dataset */
write_hdf5_dataset(file, HDF5_DATASET_NEIGHBORS, H5T_NATIVE_INT32, 1, ((int32_t*)g_lims)[nq], g_ids);

/* write ground-truth distance dataset */
write_hdf5_dataset(file, HDF5_DATASET_DISTANCES, H5T_NATIVE_FLOAT, 1, ((int32_t*)g_lims)[nq], g_dist);

/* Close/release resources. */
H5Fclose(file);
}

protected:
std::string ann_test_name_ = "";
std::string metric_str_;
Expand Down
177 changes: 177 additions & 0 deletions benchmark/hdf5/gen_hdf5_file.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
// Copyright (C) 2019-2023 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#include <gtest/gtest.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include <string>
#include <vector>

#include "benchmark_hdf5.h"
#include "knowhere/comp/brute_force.h"
#include "knowhere/comp/index_param.h"
#include "knowhere/comp/knowhere_config.h"
#include "knowhere/dataset.h"

// Build a dataset of rows * dim pseudo-random floats drawn from a uniform
// distribution over [-1.0, 1.0).
//
// The generator is seeded with the constant 42, so every call yields the same
// sequence: two calls with the same `dim` share their leading values.
//
// NOTE(review): SetIsOwner(true) presumably makes the returned dataset responsible
// for freeing the buffer allocated here - confirm against knowhere::DataSet.
knowhere::DataSetPtr
GenDataSet(int rows, int dim) {
    std::mt19937 rng(42);
    std::uniform_real_distribution<> distrib(-1.0, 1.0);

    // Count elements in size_t: the product rows * dim can exceed the range of int.
    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(dim);
    float* ts = new float[total];
    for (size_t i = 0; i < total; ++i) {
        ts[i] = static_cast<float>(distrib(rng));
    }

    auto ds = knowhere::GenDataSet(rows, dim, ts);
    ds->SetIsOwner(true);
    return ds;
}

// Build a dataset of `rows` pseudo-random binary vectors of `dim` bits each.
//
// Every vector occupies dim / 8 bytes (integer division, so bits beyond the last
// full byte are dropped when dim is not a multiple of 8); each byte is drawn
// uniformly from [0, 255].
//
// The generator is seeded with the constant 42, so every call yields the same
// sequence: two calls with the same `dim` share their leading bytes.
//
// NOTE(review): SetIsOwner(true) presumably makes the returned dataset responsible
// for freeing the buffer allocated here - confirm against knowhere::DataSet.
knowhere::DataSetPtr
GenBinDataSet(int rows, int dim) {
    std::mt19937 rng(42);
    std::uniform_int_distribution<> distrib(0, 255);

    // Count bytes in size_t: the product rows * bytes-per-vector can exceed the range of int.
    const size_t bytes_per_vec = static_cast<size_t>(dim / 8);
    const size_t total = static_cast<size_t>(rows) * bytes_per_vec;
    uint8_t* ts = new uint8_t[total];
    for (size_t i = 0; i < total; ++i) {
        ts[i] = static_cast<uint8_t>(distrib(rng));
    }

    auto ds = knowhere::GenDataSet(rows, dim, ts);
    ds->SetIsOwner(true);
    return ds;
}

// gtest fixture that generates random datasets, computes their ground truth with
// knowhere's brute-force search, and stores everything in HDF5 files in the
// current working directory.
//
// NOTE(review): GenDataSet / GenBinDataSet in this file seed their generator with
// the same constant, so the query set repeats the leading base vectors whenever
// nq <= nb - confirm this is intended for the generated benchmarks.
class Create_HDF5 : public Benchmark_hdf5, public ::testing::Test {
 protected:
    void
    SetUp() override {
    }

    void
    TearDown() override {
    }

    // Output file name: "rand-<dim>-<lower-cased metric><suffix>".
    static std::string
    make_file_name(const knowhere::MetricType& metric_type, const int64_t dim, const char* suffix) {
        std::string metric_str = metric_type;
        // Go through unsigned char: passing a negative char to tolower is undefined behaviour.
        std::transform(metric_str.begin(), metric_str.end(), metric_str.begin(),
                       [](unsigned char c) { return static_cast<char>(::tolower(c)); });
        return "rand-" + std::to_string(dim) + "-" + metric_str + suffix;
    }

    // Random vectors of the element kind selected by is_binary.
    template <bool is_binary>
    static knowhere::DataSetPtr
    gen_vectors(const int64_t rows, const int64_t dim) {
        return is_binary ? GenBinDataSet(static_cast<int>(rows), static_cast<int>(dim))
                         : GenDataSet(static_cast<int>(rows), static_cast<int>(dim));
    }

    // Generate nb base and nq query vectors of dimension dim, brute-force search the
    // top-k neighbours under metric_type, and write vectors, ids and distances to
    // "rand-<dim>-<metric>.hdf5".
    template <bool is_binary>
    void
    create_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq, const int64_t dim,
                     const int64_t topk) {
        const std::string fn = make_file_name(metric_type, dim, ".hdf5");

        knowhere::Json json;
        json[knowhere::meta::DIM] = dim;
        json[knowhere::meta::METRIC_TYPE] = metric_type;
        json[knowhere::meta::TOPK] = topk;

        knowhere::DataSetPtr xb_ds = gen_vectors<is_binary>(nb, dim);
        knowhere::DataSetPtr xq_ds = gen_vectors<is_binary>(nq, dim);

        auto result = knowhere::BruteForce::Search(xb_ds, xq_ds, json, nullptr);
        // A gtest assertion stays active in release builds, unlike assert().
        ASSERT_TRUE(result.has_value());

        // convert golden_ids to int32
        const int64_t elem_cnt = nq * topk;
        const auto ids = result.value()->GetIds();
        std::vector<int32_t> gt_ids_int(elem_cnt);
        for (int64_t i = 0; i < elem_cnt; i++) {
            gt_ids_int[i] = static_cast<int32_t>(ids[i]);
        }

        hdf5_write<is_binary>(fn.c_str(), dim, topk, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, gt_ids_int.data(),
                              result.value()->GetDistance());
    }

    // Generate nb base and nq query vectors of dimension dim, brute-force range-search
    // them with the given radius under metric_type, and write vectors, radius, lims,
    // ids and distances to "rand-<dim>-<metric>-range.hdf5".
    template <bool is_binary>
    void
    create_range_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq,
                           const int64_t dim, const float radius) {
        const std::string fn = make_file_name(metric_type, dim, "-range.hdf5");

        knowhere::Json json;
        json[knowhere::meta::DIM] = dim;
        json[knowhere::meta::METRIC_TYPE] = metric_type;
        json[knowhere::meta::RADIUS] = radius;

        knowhere::DataSetPtr xb_ds = gen_vectors<is_binary>(nb, dim);
        knowhere::DataSetPtr xq_ds = gen_vectors<is_binary>(nq, dim);

        auto result = knowhere::BruteForce::RangeSearch(xb_ds, xq_ds, json, nullptr);
        // A gtest assertion stays active in release builds, unlike assert().
        ASSERT_TRUE(result.has_value());

        // convert golden_lims to int32
        const auto lims = result.value()->GetLims();
        std::vector<int32_t> gt_lims_int(nq + 1);
        for (int64_t i = 0; i <= nq; i++) {
            gt_lims_int[i] = static_cast<int32_t>(lims[i]);
        }

        // convert golden_ids to int32; lims[nq] is the total number of hits
        const size_t elem_cnt = lims[nq];
        const auto ids = result.value()->GetIds();
        std::vector<int32_t> gt_ids_int(elem_cnt);
        for (size_t i = 0; i < elem_cnt; i++) {
            gt_ids_int[i] = static_cast<int32_t>(ids[i]);
        }

        hdf5_write_range<is_binary>(fn.c_str(), dim, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, radius,
                                    gt_lims_int.data(), gt_ids_int.data(), result.value()->GetDistance());
    }
};

// Writes top-k ground-truth files for random float vectors, one per metric (L2, IP, COSINE).
TEST_F(Create_HDF5, CREATE_FLOAT) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t vec_dim = 128;
    const int64_t k = 100;

    create_hdf5_file<false>(knowhere::metric::L2, base_rows, query_rows, vec_dim, k);
    create_hdf5_file<false>(knowhere::metric::IP, base_rows, query_rows, vec_dim, k);
    create_hdf5_file<false>(knowhere::metric::COSINE, base_rows, query_rows, vec_dim, k);
}

// Writes range-search ground-truth files for random float vectors; each metric
// (L2, IP, COSINE) gets its own radius.
TEST_F(Create_HDF5, CREATE_FLOAT_RANGE) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t vec_dim = 128;

    create_range_hdf5_file<false>(knowhere::metric::L2, base_rows, query_rows, vec_dim, 65.0);
    create_range_hdf5_file<false>(knowhere::metric::IP, base_rows, query_rows, vec_dim, 8.7);
    create_range_hdf5_file<false>(knowhere::metric::COSINE, base_rows, query_rows, vec_dim, 0.2);
}

// Writes top-k ground-truth files for random binary vectors, one per metric (HAMMING, JACCARD).
TEST_F(Create_HDF5, CREATE_BINARY) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t bit_dim = 1024;
    const int64_t k = 100;

    create_hdf5_file<true>(knowhere::metric::HAMMING, base_rows, query_rows, bit_dim, k);
    create_hdf5_file<true>(knowhere::metric::JACCARD, base_rows, query_rows, bit_dim, k);
}

// Writes range-search ground-truth files for random binary vectors; each metric
// (HAMMING, JACCARD) gets its own radius.
TEST_F(Create_HDF5, CREATE_BINARY_RANGE) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t bit_dim = 1024;

    create_range_hdf5_file<true>(knowhere::metric::HAMMING, base_rows, query_rows, bit_dim, 476);
    create_range_hdf5_file<true>(knowhere::metric::JACCARD, base_rows, query_rows, bit_dim, 0.63);
}
12 changes: 12 additions & 0 deletions benchmark/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,15 @@ GenRandomBitset(size_t n, size_t t) {
}
return data;
}

// Allocate rows * dim floats and fill them with pseudo-random values drawn from a
// uniform distribution over [-1.0, 1.0).
//
// The generator is seeded with the constant 42, so every call yields the same
// sequence. The buffer is allocated with new[]; the caller owns it and must
// release it with delete[].
inline float*
GenRandomVector(size_t rows, size_t dim) {
    std::mt19937 rng(42);
    std::uniform_real_distribution<> distrib(-1.0, 1.0);

    const size_t total = rows * dim;
    float* data = new float[total];
    // Index with size_t: an int counter mixes signedness with `total` and
    // overflows once the buffer holds more than INT_MAX elements.
    for (size_t i = 0; i < total; ++i) {
        data[i] = static_cast<float>(distrib(rng));
    }
    return data;
}

0 comments on commit 2967ddf

Please sign in to comment.