Skip to content

Commit

Permalink
Add custom metadata header. (#26)
Browse files Browse the repository at this point in the history
* Init commit of custom file header.

* Wire up metadata properly.

* Add v0 indices and loading test for them.

* Add V1 indices.

* Add Java bindings for new metadata format.

* Add missing <memory> header.

* More missing Java headers.

* Increment buffer pointer when using peek reservoir.

* Formatting.

* Catch index corruption a bit more easily.

* Add feedback to error messages.

* Add fuzz tests and fix errors discovered by fuzz testing.
  • Loading branch information
psobot authored Oct 4, 2023
1 parent 9b99c0f commit 268e1eb
Show file tree
Hide file tree
Showing 68 changed files with 904 additions and 95 deletions.
50 changes: 50 additions & 0 deletions cpp/Enums.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#pragma once

#include <string>

/**
 * The space (i.e. distance metric) to use for searching.
 *
 * The underlying type is fixed to one byte and every enumerator has an
 * explicit value; Metadata::V1 writes the raw value into index files, so
 * existing values must not be renumbered.
 */
enum SpaceType : unsigned char {
  Euclidean = 0,
  InnerProduct = 1,
  Cosine = 2,
};

/**
 * The data type to use when storing vectors on disk.
 * Affects precision and memory usage.
 *
 * Each enumerator's value sits in the upper four bits of the byte
 * (16, 32 and 48). Metadata::V1 writes the raw value into index files, so
 * existing values must not be renumbered.
 */
enum class StorageDataType : unsigned char {
  Float8 = 1 << 4,
  Float32 = 2 << 4,

  // An 8-bit floating point format that uses
  // four bits for exponent, 3 bits for mantissa,
  // allowing representation of magnitudes from 2^-9 to 448.
  E4M3 = 3 << 4,
};

/**
 * Return a human-readable name for the given StorageDataType.
 *
 * @param sdt the value to describe; it need not match a declared enumerator.
 * @return the enumerator's name, or a message containing the raw integer
 *         value when it matches no enumerator.
 */
inline std::string toString(StorageDataType sdt) {
  switch (sdt) {
  case StorageDataType::Float8:
    return "Float8";
  case StorageDataType::Float32:
    return "Float32";
  case StorageDataType::E4M3:
    return "E4M3";
  default:
    return "Unknown storage data type (value " +
           std::to_string(static_cast<int>(sdt)) + ")";
  }
}

/**
 * Return a human-readable name for the given SpaceType.
 *
 * @param space the value to describe; it need not match a declared
 *        enumerator.
 * @return the enumerator's name, or a message containing the raw integer
 *         value when it matches no enumerator.
 */
inline std::string toString(SpaceType space) {
  switch (space) {
  case SpaceType::Euclidean:
    return "Euclidean";
  case SpaceType::Cosine:
    return "Cosine";
  case SpaceType::InnerProduct:
    return "InnerProduct";
  default:
    return "Unknown space type (value " +
           std::to_string(static_cast<int>(space)) + ")";
  }
}
24 changes: 1 addition & 23 deletions cpp/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,33 +25,11 @@
#include <ratio>
#include <stdlib.h>

#include "Enums.h"
#include "StreamUtils.h"
#include "array_utils.h"
#include "hnswlib.h"

/**
 * Distance metric ("space") used when comparing vectors during a search.
 * Enumerator values are spelled out; they match the implicit numbering.
 */
enum SpaceType {
  Euclidean = 0,
  InnerProduct = 1,
  Cosine = 2,
};

/**
 * The data type to use when storing vectors on disk.
 * Trades precision against memory usage.
 * Enumerator values are spelled out; they match the implicit numbering.
 */
enum class StorageDataType {
  Float8 = 0,
  Float32 = 1,

  // 8-bit floating point: four exponent bits and three mantissa bits,
  // covering magnitudes from 2^-9 up to 448.
  E4M3 = 2,
};

/**
* A C++ wrapper class for a Voyager index, which accepts
* and returns floating-point data.
Expand Down
123 changes: 123 additions & 0 deletions cpp/Metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#pragma once
/*-
* -\-\-
* voyager
* --
* Copyright (C) 2016 - 2023 Spotify AB
*
* This file is heavily based on hnswlib (https://github.com/nmslib/hnswlib,
* Apache 2.0-licensed, no copyright author listed)
* --
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* -/-/-
*/

#include "Enums.h"
#include "StreamUtils.h"

namespace voyager {
namespace Metadata {
/**
 * @brief A basic metadata class that stores the number of dimensions,
 * the SpaceType, and the StorageDataType of an index.
 *
 * Serialized layout (see serializeToStream): the four bytes "VOYA", the
 * version as an int, the number of dimensions as an int, then the SpaceType
 * and the StorageDataType. Every field after the magic bytes is written with
 * writeBinaryPOD, i.e. as the raw in-memory bytes of the value.
 */
class V1 {
public:
  V1(int numDimensions, SpaceType spaceType, StorageDataType storageDataType)
      : numDimensions(numDimensions), spaceType(spaceType),
        storageDataType(storageDataType) {}

  /**
   * Create metadata holding placeholder values (see the member
   * initializers below), intended to be overwritten by loadFromStream or
   * the setters.
   */
  V1() = default;
  virtual ~V1() = default;

  /** The format version this class reads and writes. */
  int version() const { return 1; }

  int getNumDimensions() const { return numDimensions; }

  StorageDataType getStorageDataType() const { return storageDataType; }

  SpaceType getSpaceType() const { return spaceType; }

  void setNumDimensions(int newNumDimensions) {
    numDimensions = newNumDimensions;
  }

  void setStorageDataType(StorageDataType newStorageDataType) {
    storageDataType = newStorageDataType;
  }

  void setSpaceType(SpaceType newSpaceType) { spaceType = newSpaceType; }

  /**
   * Write the magic bytes, the version, and all fields to the stream.
   *
   * @throws std::runtime_error if any write is reported as failed.
   */
  virtual void serializeToStream(std::shared_ptr<OutputStream> stream) {
    // Check the magic-bytes write the same way writeBinaryPOD checks every
    // other field, so a failing stream is reported at the first write.
    if (!stream->write("VOYA", 4)) {
      throw std::runtime_error("Failed to write 4 bytes to stream!");
    }
    writeBinaryPOD(stream, version());
    writeBinaryPOD(stream, numDimensions);
    writeBinaryPOD(stream, spaceType);
    writeBinaryPOD(stream, storageDataType);
  }

  /**
   * Read the fields that follow the version from the stream, in the order
   * serializeToStream writes them.
   *
   * @throws std::runtime_error (from readBinaryPOD) on a short read.
   */
  virtual void loadFromStream(std::shared_ptr<InputStream> stream) {
    // Version has already been loaded before we get here!
    readBinaryPOD(stream, numDimensions);
    readBinaryPOD(stream, spaceType);
    readBinaryPOD(stream, storageDataType);
  }

private:
  // Placeholder defaults so a default-constructed object never exposes
  // indeterminate values through the getters or serializeToStream.
  int numDimensions = 0;
  SpaceType spaceType = SpaceType::Euclidean;
  StorageDataType storageDataType = StorageDataType::Float32;
};

/**
 * Try to read a Voyager metadata header from the current position of the
 * given stream.
 *
 * @param inputStream the stream to read from.
 * @return the parsed metadata, or nullptr if the next four bytes of the
 *         stream are not "VOYA". In the nullptr case only peek() has been
 *         called on the stream.
 * @throws std::domain_error if the magic bytes are present but the version
 *         that follows is not supported.
 * @throws std::runtime_error (from readBinaryPOD) on a short read.
 */
static std::unique_ptr<Metadata::V1>
loadFromStream(std::shared_ptr<InputStream> inputStream) {
  uint32_t header = inputStream->peek();

  // Compare the peeked bytes one at a time against what serializeToStream
  // writes. A multi-character literal such as 'AYOV' has an
  // implementation-defined value and only matches on little-endian hosts.
  const char *headerBytes = reinterpret_cast<const char *>(&header);
  if (headerBytes[0] != 'V' || headerBytes[1] != 'O' ||
      headerBytes[2] != 'Y' || headerBytes[3] != 'A') {
    return nullptr;
  }

  // Actually read instead of just peeking (and fail loudly on a short read):
  readBinaryPOD(inputStream, header);

  int version = 0;
  readBinaryPOD(inputStream, version);

  switch (version) {
  case 1: {
    std::unique_ptr<Metadata::V1> metadata = std::make_unique<Metadata::V1>();
    metadata->loadFromStream(inputStream);
    return metadata;
  }
  default: {
    std::stringstream stream;
    stream << std::hex << version;
    std::string resultAsHex(stream.str());

    std::string error = "Unable to parse version of Voyager index file; found "
                        "unsupported version \"0x" +
                        resultAsHex + "\".";

    // Only a small version number above the one handled here plausibly comes
    // from a newer library; zero, negative, or large values are treated as
    // corruption.
    if (version > 1 && version < 20) {
      error += " A newer version of the Voyager library may be able to read "
               "this index.";
    } else {
      error += " This index may be corrupted (or not a Voyager index).";
    }

    throw std::domain_error(error);
  }
  }
}

} // namespace Metadata
}; // namespace voyager
38 changes: 36 additions & 2 deletions cpp/StreamUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#pragma once
#include <exception>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdio.h>
#include <string>
Expand All @@ -41,11 +42,12 @@ class InputStream {
virtual bool advanceBy(long long numBytes) {
return setPosition(getPosition() + numBytes);
}
// Return upcoming bytes of the stream packed into a uint32_t.
// NOTE(review): the file-backed override in this file reads
// sizeof(uint32_t) bytes and then seeks back to where it started;
// presumably every override must leave the read position unchanged - confirm.
virtual uint32_t peek() = 0;
};

class FileInputStream : public InputStream {
public:
FileInputStream(const std::string &filename) {
FileInputStream(const std::string &filename) : filename(filename) {
handle = fopen(filename.c_str(), "r");
if (!handle) {
throw std::runtime_error("Failed to open file for reading: " + filename);
Expand Down Expand Up @@ -74,6 +76,19 @@ class FileInputStream : public InputStream {
virtual bool advanceBy(long long bytes) {
  // Seek relative to the current position; fseek yields zero on success.
  // NOTE(review): fseek takes a `long` offset, so `bytes` is narrowed on
  // platforms where long is 32 bits wide.
  const int seekStatus = fseek(handle, bytes, SEEK_CUR);
  return seekStatus == 0;
}
/**
 * Read the next sizeof(uint32_t) bytes of the file into a uint32_t and then
 * seek back to the position the call started at.
 *
 * @return the bytes read, in the order they appear in the file.
 * @throws std::runtime_error if fewer than sizeof(uint32_t) bytes could be
 *         read, or if the read position could not be restored afterwards.
 */
virtual uint32_t peek() {
  uint32_t result = 0;
  const long long lastPosition = getPosition();
  const bool readAllBytes =
      read(reinterpret_cast<char *>(&result), sizeof(result)) ==
      sizeof(result);

  // Rewind whether or not the read succeeded, so that a failed peek does
  // not leave the stream part-way through the bytes it tried to inspect.
  const bool rewound = setPosition(lastPosition);

  if (!readAllBytes) {
    throw std::runtime_error(
        "Failed to peek " + std::to_string(sizeof(result)) +
        " bytes from file \"" + filename + "\" at index " +
        std::to_string(lastPosition) + ".");
  }

  // A peek that silently consumed its bytes would misalign every later
  // read, so surface a failed rewind instead of returning normally.
  if (!rewound) {
    throw std::runtime_error("Failed to rewind file \"" + filename +
                             "\" to index " + std::to_string(lastPosition) +
                             " after peeking.");
  }

  return result;
}

virtual ~FileInputStream() {
if (handle) {
Expand All @@ -85,6 +100,7 @@ class FileInputStream : public InputStream {
protected:
FileInputStream() {}
FILE *handle = nullptr;
std::string filename;

private:
bool isRegularFile = false;
Expand Down Expand Up @@ -143,4 +159,22 @@ class MemoryOutputStream : public OutputStream {

private:
std::ostringstream outputStream;
};
};

/**
 * Write the raw in-memory bytes of a value to an output stream.
 *
 * @tparam T type of the value; exactly sizeof(T) bytes are written.
 * @param out the stream to write to.
 * @param podRef the value whose bytes are written.
 * @throws std::runtime_error if the stream reports the write as failed.
 */
template <typename T>
static void writeBinaryPOD(std::shared_ptr<OutputStream> out, const T &podRef) {
  // Keep the pointer const: the value is only read, and a C-style cast to
  // `char *` would silently strip the const qualifier from `podRef`.
  const char *rawBytes = reinterpret_cast<const char *>(&podRef);
  if (!out->write(rawBytes, sizeof(T))) {
    throw std::runtime_error("Failed to write " + std::to_string(sizeof(T)) +
                             " bytes to stream!");
  }
}

/**
 * Fill a value with sizeof(T) raw bytes taken from an input stream.
 *
 * @tparam T type of the value being filled.
 * @param in the stream to read from.
 * @param podRef destination; overwritten byte-for-byte.
 * @throws std::runtime_error if the stream returns any count other than
 *         sizeof(T).
 */
template <typename T>
static void readBinaryPOD(std::shared_ptr<InputStream> in, T &podRef) {
  char *destination = reinterpret_cast<char *>(&podRef);
  const long long bytesRead = in->read(destination, sizeof(T));

  if (bytesRead == sizeof(T)) {
    return;
  }

  throw std::runtime_error("Failed to read " + std::to_string(sizeof(T)) +
                           " bytes from stream! Got " +
                           std::to_string(bytesRead) + ".");
}
64 changes: 62 additions & 2 deletions cpp/TypedIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
#include <ratio>

#include "E4M3.h"
#include "Enums.h"
#include "Index.h"
#include "Metadata.h"
#include "array_utils.h"
#include "hnswlib.h"
#include "std_utils.h"
Expand Down Expand Up @@ -99,6 +101,7 @@ class TypedIndex : public Index {
hnswlib::labeltype currentLabel;
std::unique_ptr<hnswlib::HierarchicalNSW<dist_t, data_t>> algorithmImpl;
std::unique_ptr<hnswlib::Space<dist_t, data_t>> spaceImpl;
std::unique_ptr<voyager::Metadata::V1> metadata;

public:
/**
Expand All @@ -107,7 +110,10 @@ class TypedIndex : public Index {
TypedIndex(const SpaceType space, const int dimensions, const size_t M = 12,
const size_t efConstruction = 200, const size_t randomSeed = 1,
const size_t maxElements = 1)
: space(space), dimensions(dimensions) {
: space(space), dimensions(dimensions),
metadata(std::make_unique<voyager::Metadata::V1>(
dimensions, space, getStorageDataType())) {

switch (space) {
case Euclidean:
spaceImpl = std::make_unique<
Expand Down Expand Up @@ -168,6 +174,18 @@ class TypedIndex : public Index {
currentLabel = algorithmImpl->cur_element_count;
}

/**
 * Load an index from the given input stream, taking the space and the
 * number of dimensions from the provided metadata.
 *
 * NOTE(review): `metadata` is only read in the initializer list below; this
 * constructor does not store the passed-in object.
 */
TypedIndex(std::unique_ptr<voyager::Metadata::V1> metadata,
           std::shared_ptr<InputStream> inputStream, bool searchOnly = false)
    : TypedIndex(metadata->getSpaceType(), metadata->getNumDimensions()) {
  using Algorithm = hnswlib::HierarchicalNSW<dist_t, data_t>;

  // Build the algorithm from the stream contents and adopt its element
  // count as the next label.
  algorithmImpl = std::make_unique<Algorithm>(spaceImpl.get(), inputStream, 0,
                                              searchOnly);
  currentLabel = algorithmImpl->cur_element_count;
}

/** Return the number of dimensions this index was constructed with. */
int getNumDimensions() const {
  return dimensions;
}

/** Return the SpaceType this index was constructed with. */
SpaceType getSpace() const {
  return space;
}
Expand Down Expand Up @@ -215,7 +233,7 @@ class TypedIndex : public Index {
* Save this index to the provided file path on disk.
*/
void saveIndex(const std::string &pathToIndex) {
  // Delegate to the stream overload only. It writes the metadata header
  // before the index data, whereas also calling
  // algorithmImpl->saveIndex(pathToIndex) here would first write a
  // header-less file to the same path and then overwrite it.
  saveIndex(std::make_shared<FileOutputStream>(pathToIndex));
}

/**
Expand All @@ -224,6 +242,7 @@ class TypedIndex : public Index {
* TypedIndex constructor to reload this index.
*/
void saveIndex(std::shared_ptr<OutputStream> outputStream) {
// Order matters: the metadata header is written first, followed by the
// data written by the underlying algorithm implementation.
metadata->serializeToStream(outputStream);
algorithmImpl->saveIndex(outputStream);
}

Expand Down Expand Up @@ -572,3 +591,44 @@ class TypedIndex : public Index {

/** Return the M_ value held by the underlying algorithm implementation. */
size_t getM() const {
  return algorithmImpl->M_;
}
};

std::unique_ptr<Index>
loadTypedIndexFromStream(std::shared_ptr<InputStream> inputStream) {
std::unique_ptr<voyager::Metadata::V1> metadata =
voyager::Metadata::loadFromStream(inputStream);

if (!metadata) {
throw std::domain_error(
"The provided file contains no Voyager parameter metadata. Please "
"specify the number of dimensions, SpaceType, and StorageDataType that "
"this index contains.");
} else if (voyager::Metadata::V1 *v1 =
dynamic_cast<voyager::Metadata::V1 *>(metadata.get())) {
// We have enough information to create a TypedIndex!
switch (v1->getStorageDataType()) {
case StorageDataType::Float32:
return std::make_unique<TypedIndex<float>>(
std::unique_ptr<voyager::Metadata::V1>(
(voyager::Metadata::V1 *)metadata.release()),
inputStream);
break;
case StorageDataType::Float8:
return std::make_unique<TypedIndex<float, int8_t, std::ratio<1, 127>>>(
std::unique_ptr<voyager::Metadata::V1>(
(voyager::Metadata::V1 *)metadata.release()),
inputStream);
break;
case StorageDataType::E4M3:
return std::make_unique<TypedIndex<float, E4M3>>(
std::unique_ptr<voyager::Metadata::V1>(
(voyager::Metadata::V1 *)metadata.release()),
inputStream);
break;
default:
throw std::domain_error("Unknown storage data type: " +
std::to_string((int)v1->getStorageDataType()));
}
} else {
throw std::domain_error("Unknown Voyager metadata format.");
}
}
Loading

0 comments on commit 268e1eb

Please sign in to comment.