Skip to content

Commit

Permalink
C++ Improvements - API enhancement and increase testing (#85)
Browse files Browse the repository at this point in the history
* Add C++ tests and overloaded Index methods that accept 2D vector of floats instead of NDArray

* Use most recent version of clang-format

* Undo clang-format bump. Fix formatting

* clean up C++ test, increase number of vectors

* Fix comment

* Move code into reusable function

* Use quantized random input vectors for Float8 and E4M3 storage. Remove unused util methods

* Optimize vectorsToNDArray() and add validation for vector sizes, add tests
  • Loading branch information
stephen29xie authored Sep 10, 2024
1 parent a4902b8 commit 88cfc46
Show file tree
Hide file tree
Showing 7 changed files with 290 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ java/classpath.txt
java/linux-build/include/*
python/voyager-headers
.asv/
*.dSYM

# Cmake
CMakeLists.txt.user
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ class Index {

/**
 * Add a single vector to the index.
 *
 * @param vector the vector to add.
 * @param id     optional label to associate with the vector.
 * @return a label for the added item (presumably the ID it was stored under
 *         -- confirm against the implementation).
 */
virtual hnswlib::labeltype addItem(std::vector<float> vector,
std::optional<hnswlib::labeltype> id) = 0;

/**
 * Add several vectors to the index, supplied as a nested std::vector where
 * each inner vector is one item. This is the std::vector counterpart of the
 * NDArray overload below.
 *
 * @param input      the vectors to add, one per inner vector.
 * @param ids        labels for the vectors; defaults to an empty list.
 * @param numThreads defaults to -1 (presumably "let the implementation
 *                   choose" -- confirm against the implementation).
 * @return one label per added item.
 */
virtual std::vector<hnswlib::labeltype>
addItems(std::vector<std::vector<float>> input,
std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) = 0;

/**
 * Add several vectors to the index, supplied as a two-dimensional NDArray.
 * Parameters and return value mirror the nested std::vector overload above.
 */
virtual std::vector<hnswlib::labeltype>
addItems(NDArray<float, 2> input, std::vector<hnswlib::labeltype> ids = {},
int numThreads = -1) = 0;
Expand All @@ -86,6 +91,10 @@ class Index {
/**
 * Query the index with a single vector.
 *
 * @param queryVector the vector to search for.
 * @param k           number of results requested; defaults to 1.
 * @param queryEf     defaults to -1 (presumably "use the index's default
 *                    search ef" -- confirm against the implementation).
 * @return a tuple of (labels, distances) for the results.
 */
virtual std::tuple<std::vector<hnswlib::labeltype>, std::vector<float>>
query(std::vector<float> queryVector, int k = 1, long queryEf = -1) = 0;

/**
 * Query the index with several vectors at once, supplied as a nested
 * std::vector where each inner vector is one query. This is the std::vector
 * counterpart of the NDArray overload below.
 *
 * Note that the positional parameter order differs from the single-vector
 * overload: numThreads comes between k and queryEf here.
 *
 * @return a tuple of two-dimensional (labels, distances) arrays.
 */
virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
query(std::vector<std::vector<float>> queryVectors, int k = 1,
int numThreads = -1, long queryEf = -1) = 0;

/**
 * Query the index with several vectors at once, supplied as a
 * two-dimensional NDArray. Parameters and return value mirror the nested
 * std::vector overload above.
 */
virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
query(NDArray<float, 2> queryVectors, int k = 1, int numThreads = -1,
long queryEf = -1) = 0;
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/TypedIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,12 @@ class TypedIndex : public Index {
return addItems(NDArray<float, 2>(vector, {1, (int)vector.size()}), ids)[0];
}

/**
 * Convenience overload of addItems() that accepts the input as a nested
 * std::vector (one inner vector per item) instead of an NDArray.
 *
 * The vectors are flattened with vectorsToNDArray() and the call is
 * forwarded, with ids and numThreads unchanged, to the NDArray overload.
 * vectorsToNDArray() throws std::invalid_argument when the inner vectors do
 * not all have the same size.
 */
std::vector<hnswlib::labeltype>
addItems(const std::vector<std::vector<float>> vectors,
std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
return addItems(vectorsToNDArray(vectors), ids, numThreads);
}

std::vector<hnswlib::labeltype>
addItems(NDArray<float, 2> floatInput,
std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
Expand Down Expand Up @@ -502,6 +508,12 @@ class TypedIndex : public Index {
return algorithmImpl->label_lookup_;
}

/**
 * Convenience overload of query() that accepts the query vectors as a nested
 * std::vector (one inner vector per query) instead of an NDArray.
 *
 * The vectors are flattened with vectorsToNDArray() and the call is
 * forwarded, with k, numThreads and queryEf unchanged, to the NDArray
 * overload. vectorsToNDArray() throws std::invalid_argument when the inner
 * vectors do not all have the same size.
 */
std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
query(std::vector<std::vector<float>> floatQueryVectors, int k = 1,
int numThreads = -1, long queryEf = -1) {
return query(vectorsToNDArray(floatQueryVectors), k, numThreads, queryEf);
}

std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
query(NDArray<float, 2> floatQueryVectors, int k = 1, int numThreads = -1,
long queryEf = -1) {
Expand Down
27 changes: 27 additions & 0 deletions cpp/src/array_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,3 +309,30 @@ std::string toFloatVectorString(std::vector<data_t> vec) {
return toFloatVectorString<dist_t, data_t, scalefactor>(vec.data(),
vec.size());
}

/**
 * Convert a 2D vector of float to a row-major NDArray<float, 2>.
 *
 * The first vector is used as the reference for the number of dimensions;
 * every other vector must have exactly the same size. An empty input produces
 * an array of shape {0, 0}.
 *
 * The function is declared inline because it is defined in a header: without
 * it, including this header from more than one translation unit produces
 * multiple definitions at link time.
 *
 * @param vectors the rows to flatten. Taken by const reference so that the
 *                (potentially large) nested vector is not deep-copied.
 * @return an NDArray whose shape is {number of vectors, size of each vector}
 *         and whose data is the concatenation of the input vectors.
 * @throws std::invalid_argument if any vector's size differs from the size of
 *         the first vector.
 */
inline NDArray<float, 2>
vectorsToNDArray(const std::vector<std::vector<float>> &vectors) {
  // Keep sizes in the container's own unsigned size type so that the total
  // element count cannot overflow an int and the per-row size check does not
  // mix signed and unsigned operands.
  const auto numVectors = vectors.size();
  const auto dimensions = vectors.empty() ? 0 : vectors.front().size();

  // Flatten the 2D input into the NDArray's underlying 1D buffer. Reserving
  // up front gives a single allocation; a range insert of floats is a bulk
  // copy, and unlike memcpy it is well defined for empty rows.
  std::vector<float> flatArray;
  flatArray.reserve(numVectors * dimensions);
  for (const auto &vector : vectors) {
    // Check that all provided vectors are the same size, using the first
    // vector as the reference.
    if (vector.size() != dimensions) {
      throw std::invalid_argument("All vectors must be of the same size, but "
                                  "received vectors of size: " +
                                  std::to_string(dimensions) + " and " +
                                  std::to_string(vector.size()) + ".");
    }
    flatArray.insert(flatArray.end(), vector.begin(), vector.end());
  }

  // NDArray describes its shape with ints.
  std::array<int, 2> shape = {static_cast<int>(numVectors),
                              static_cast<int>(dimensions)};
  return NDArray<float, 2>(flatArray, shape);
}
3 changes: 3 additions & 0 deletions cpp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ set(TEST_FILES test_main.cpp doctest_setup.cpp) # Add any test files here
# Create an executable for the tests
add_executable(VoyagerTests ${TEST_FILES})

# Compile the tests with debug symbols (-g)
target_compile_options(VoyagerTests PRIVATE -g)

# Link the test executable with the main project and Doctest
# target_link_libraries(MyProjectTests PRIVATE MyProject doctest::doctest)
target_link_libraries(VoyagerTests
Expand Down
229 changes: 197 additions & 32 deletions cpp/test/test_main.cpp
Original file line number Diff line number Diff line change
@@ -1,53 +1,218 @@
#include "doctest.h"

#include "TypedIndex.h"
#include "test_utils.cpp"
#include <tuple>
#include <type_traits>

template <typename dist_t, typename data_t = dist_t,
typename scalefactor = std::ratio<1, 1>>
void testCombination(TypedIndex<dist_t, data_t, scalefactor> &index,
SpaceType spaceType, int numDimensions,
StorageDataType storageType) {
CHECK(toString(index.getSpace()) == toString(spaceType));
CHECK(index.getNumDimensions() == numDimensions);
CHECK(toString(index.getStorageDataType()) == toString(storageType));
void testIndexProperties(TypedIndex<dist_t, data_t, scalefactor> &index,
SpaceType spaceType, int numDimensions,
StorageDataType storageType) {
REQUIRE(toString(index.getSpace()) == toString(spaceType));
REQUIRE(index.getNumDimensions() == numDimensions);
REQUIRE(toString(index.getStorageDataType()) == toString(storageType));
}

TEST_CASE("Test combinations of different instantiations and sizes") {
std::vector<SpaceType> spaceTypesSet = {SpaceType::Euclidean,
SpaceType::InnerProduct};
std::vector<int> numDimensionsSet = {4, 16, 128, 1024};
std::vector<int> numElementsSet = {100, 1000, 100000};
/**
 * Check that an index can find the vectors that were added to it.
 *
 * The index is filled with random vectors (labelled 0..numVectors-1) and is
 * then queried with those same vectors, first one at a time and then in bulk.
 * Each vector is expected to come back as its own nearest neighbour at a
 * distance of zero, within +/- precisionTolerance to allow for the precision
 * of the storage type.
 *
 * @param index                  the (empty) index under test.
 * @param numVectors             how many random vectors to add and query.
 * @param numDimensions          the size of each vector.
 * @param spaceType              the index's space; self-match checks are
 *                               skipped for InnerProduct.
 * @param storageType            the index's storage type; selects how the
 *                               input is generated, and self-match checks are
 *                               skipped for E4M3.
 * @param testSingleVectorMethod when true, add items one by one via addItem();
 *                               otherwise add them all via addItems().
 * @param precisionTolerance     allowed absolute deviation of the distance
 *                               from zero.
 */
template <typename dist_t, typename data_t = dist_t,
          typename scalefactor = std::ratio<1, 1>>
void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
               int numDimensions, SpaceType spaceType,
               StorageDataType storageType, bool testSingleVectorMethod,
               float precisionTolerance) {
  // Match the generated input to the storage type: quantized values for
  // Float8 and E4M3 storage, unmodified floats for Float32 storage.
  const bool isQuantizedStorage = storageType == StorageDataType::Float8 ||
                                  storageType == StorageDataType::E4M3;
  std::vector<std::vector<float>> inputData;
  if (isQuantizedStorage) {
    inputData = randomQuantizedVectors(numVectors, numDimensions);
  } else if (storageType == StorageDataType::Float32) {
    inputData = randomVectors(numVectors, numDimensions);
  }

  // Label every vector with its position in inputData.
  std::vector<hnswlib::labeltype> ids(numVectors);
  for (int position = 0; position < numVectors; position++) {
    ids[position] = position;
  }

  // Populate the index through whichever interface is under test.
  if (!testSingleVectorMethod) {
    index.addItems(inputData, ids, -1);
  } else {
    for (auto id : ids) {
      index.addItem(inputData[id], id);
    }
  }

  int k = 1;
  float minDistance = 0.0f - precisionTolerance;
  float maxDistance = 0.0f + precisionTolerance;

  // E4M3 is too low precision to confidently assume that querying with the
  // unquantized (fp32) vector returns the quantized vector as its nearest
  // neighbour, and InnerProduct gives a negative (not zero) distance to the
  // closest item. The self-match checks only apply outside those cases.
  const bool expectSelfMatch = storageType != StorageDataType::E4M3 &&
                               spaceType != SpaceType::InnerProduct;

  // Single-query interface: one target vector per call.
  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
    for (int i = 0; i < numVectors; i++) {
      // Query with the raw input rather than data read back from the index:
      // once data has been added, the index may have changed its format and
      // with it the "ground truth".
      const auto result = index.query(inputData[i], k, queryEf);
      const auto &foundLabels = std::get<0>(result);
      const auto &foundDistances = std::get<1>(result);
      REQUIRE(foundLabels.size() == k);
      REQUIRE(foundDistances.size() == k);

      if (!expectSelfMatch) {
        continue;
      }
      REQUIRE(i == foundLabels[0]);
      REQUIRE(foundDistances[0] >= minDistance);
      REQUIRE(foundDistances[0] <= maxDistance);
    }
  }

  // Bulk-query interface: all target vectors in one call.
  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
    const auto bulkResult = index.query(
        inputData, /* k= */ k, /* numThreads= */ -1, /* queryEf= */ queryEf);
    const NDArray<hnswlib::labeltype, 2> &bulkLabels = std::get<0>(bulkResult);
    const NDArray<dist_t, 2> &bulkDistances = std::get<1>(bulkResult);
    REQUIRE(bulkLabels.shape[0] == numVectors);
    REQUIRE(bulkLabels.shape[1] == k);
    REQUIRE(bulkDistances.shape[0] == numVectors);
    REQUIRE(bulkDistances.shape[1] == k);

    if (!expectSelfMatch) {
      continue;
    }
    for (int i = 0; i < numVectors; i++) {
      REQUIRE(i == bulkLabels.data[i]);
      REQUIRE(bulkDistances.data[i] >= minDistance);
      REQUIRE(bulkDistances.data[i] <= maxDistance);
    }
  }
}

TEST_CASE("Test combinations of different instantiations. Test that each "
"vector's NN is itself and distance is approximately zero.") {
std::unordered_map<StorageDataType, float> PRECISION_TOLERANCE_PER_DATA_TYPE =
{{StorageDataType::Float32, 0.00001f},
{StorageDataType::Float8, 0.10f},
{StorageDataType::E4M3, 0.20f}};
std::vector<SpaceType> spaceTypesSet = {
SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine};
std::vector<int> numDimensionsSet = {32};
std::vector<int> numVectorsSet = {2000};
std::vector<StorageDataType> storageTypesSet = {
StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3};

auto count = 0;
std::vector<bool> testSingleVectorMethods = {true, false};

for (auto spaceType : spaceTypesSet) {
for (auto numDimensions : numDimensionsSet) {
for (auto numElements : numElementsSet) {
for (auto storageType : storageTypesSet) {
SUBCASE("Test instantiation ") {
CAPTURE(spaceType);
CAPTURE(numDimensions);
CAPTURE(numElements);
CAPTURE(storageType);

if (storageType == StorageDataType::Float8) {
auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
spaceType, numDimensions);
testCombination(index, spaceType, numDimensions, storageType);
} else if (storageType == StorageDataType::Float32) {
auto index = TypedIndex<float>(spaceType, numDimensions);
testCombination(index, spaceType, numDimensions, storageType);
} else if (storageType == StorageDataType::E4M3) {
auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
testCombination(index, spaceType, numDimensions, storageType);
for (auto storageType : storageTypesSet) {
for (auto numDimensions : numDimensionsSet) {
for (auto numVectors : numVectorsSet) {
for (auto testSingleVectorMethod : testSingleVectorMethods) {

SUBCASE("Test instantiation ") {
CAPTURE(spaceType);
CAPTURE(numDimensions);
CAPTURE(numVectors);
CAPTURE(storageType);
CAPTURE(testSingleVectorMethod);

if (storageType == StorageDataType::Float8) {
auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
spaceType, numDimensions);
testIndexProperties(index, spaceType, numDimensions,
storageType);
testQuery(index, numVectors, numDimensions, spaceType,
storageType, testSingleVectorMethod,
PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
} else if (storageType == StorageDataType::Float32) {
auto index = TypedIndex<float>(spaceType, numDimensions);
testIndexProperties(index, spaceType, numDimensions,
storageType);
testQuery(index, numVectors, numDimensions, spaceType,
storageType, testSingleVectorMethod,
PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
} else if (storageType == StorageDataType::E4M3) {
auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
testIndexProperties(index, spaceType, numDimensions,
storageType);
testQuery(index, numVectors, numDimensions, spaceType,
storageType, testSingleVectorMethod,
PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
}
}
}
}
}
}
}
}

TEST_CASE("Test vectorsToNDArray converts 2D vector of float to NDArray<float, "
          "2>") {
  // Three rows of four values, numbered 1 through 12 in row-major order.
  std::vector<std::vector<float>> vectors = {{1.0f, 2.0f, 3.0f, 4.0f},
                                             {5.0f, 6.0f, 7.0f, 8.0f},
                                             {9.0f, 10.0f, 11.0f, 12.0f}};
  NDArray<float, 2> ndArray = vectorsToNDArray(vectors);

  // The shape must be {rows, columns}.
  REQUIRE(ndArray.shape.size() == 2);
  REQUIRE(ndArray.shape[0] == 3);
  REQUIRE(ndArray.shape[1] == 4);

  // The flat data must be the rows laid out back to back.
  const float expectedFlat[] = {1.0f, 2.0f, 3.0f, 4.0f,  5.0f,  6.0f,
                                7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
  REQUIRE(ndArray.data.size() == 12);
  for (int i = 0; i < 12; i++) {
    REQUIRE(ndArray.data[i] == expectedFlat[i]);
  }

  // Dereferencing ndArray[r] must give the first element of row r.
  const float expectedRowStarts[] = {1.0f, 5.0f, 9.0f};
  for (int row = 0; row < 3; row++) {
    REQUIRE(*ndArray[row] == expectedRowStarts[row]);
  }
}

TEST_CASE("Test vectorsToNDArray throws error if vectors are not of the same "
          "size") {
  // A row that is shorter than the first (reference) row.
  std::vector<std::vector<float>> shortMiddleRow = {
      {1.0f, 2.0f, 3.0f, 4.0f},
      {5.0f, 6.0f, 7.0f},
      {9.0f, 10.0f, 11.0f, 12.0f}};
  REQUIRE_THROWS_AS(vectorsToNDArray(shortMiddleRow), std::invalid_argument);

  // Rows that are longer than the first (reference) row.
  std::vector<std::vector<float>> shortFirstRow = {
      {1.0f},
      {5.0f, 6.0f, 7.0f},
      {9.0f, 10.0f, 11.0f}};
  REQUIRE_THROWS_AS(vectorsToNDArray(shortFirstRow), std::invalid_argument);
}
41 changes: 41 additions & 0 deletions cpp/test/test_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include <random>
#include <vector>

#include "array_utils.h"

// Generate random test vectors intended for Float8 or E4M3 storage. Each
// component is drawn uniformly, mapped onto [-1, 1), and then truncated
// toward zero to one decimal place, so the values are (approximately)
// multiples of 0.1. The generator is seeded from std::random_device, so the
// output differs from run to run.
std::vector<std::vector<float>> randomQuantizedVectors(int numVectors,
                                                       int dimensions) {
  std::vector<std::vector<float>> result(numVectors,
                                         std::vector<float>(dimensions));

  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  std::uniform_real_distribution<> unitInterval(0, 1.0);

  for (auto &row : result) {
    for (float &component : row) {
      // Scale to tenths, drop the fractional part, and scale back.
      const auto tenths = (unitInterval(engine) * 2 - 1) * 10.0f;
      component = static_cast<int>(tenths) / 10.0f;
    }
  }

  return result;
}

// Generate random test vectors intended for Float32 storage. Each component
// is drawn uniformly and mapped onto the range [-1, 1] without any
// quantization. The generator is seeded from std::random_device, so the
// output differs from run to run.
std::vector<std::vector<float>> randomVectors(int numVectors, int dimensions) {
  std::vector<std::vector<float>> result(numVectors,
                                         std::vector<float>(dimensions));

  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  std::uniform_real_distribution<> unitInterval(0, 1.0);

  for (auto &row : result) {
    for (float &component : row) {
      const float sample = static_cast<float>(unitInterval(engine));
      component = sample * 2 - 1;
    }
  }

  return result;
}

0 comments on commit 88cfc46

Please sign in to comment.