-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Init commit of custom file header. * Wire up metadata properly. * Add v0 indices and loading test for them. * Add V1 indices. * Add Java bindings for new metadata format. * Add missing <memory> header. * More missing Java headers. * Increment buffer pointer when using peek reservoir. * Formatting. * Catch index corruption a bit more easily. * Add feedback to error messages. * Add fuzz tests and fix errors discovered by fuzz testing.
- Loading branch information
Showing
68 changed files
with
904 additions
and
95 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#pragma once | ||
|
||
/**
 * Identifies the space (i.e. distance metric) to use for searching.
 *
 * The underlying type is a single unsigned byte, and every enumerator
 * has an explicitly assigned numeric value.
 */
enum SpaceType : unsigned char {
  Euclidean = 0,
  InnerProduct = 1,
  Cosine = 2
};
|
||
/**
 * The data type to use when storing vectors on disk.
 * Affects precision and memory usage.
 *
 * Each enumerator's value is a small integer shifted left by four bits,
 * so the low four bits of every value are zero.
 */
enum class StorageDataType : unsigned char {
  Float8 = 0x10,  // 1 << 4
  Float32 = 0x20, // 2 << 4

  // An 8-bit floating point format that uses four bits for the exponent
  // and three bits for the mantissa, allowing representation of values
  // from 2e-9 to 448.
  // NOTE(review): the lower bound is presumably meant to be 2^-9 rather
  // than 2e-9 -- confirm against the E4M3 implementation.
  E4M3 = 0x30, // 3 << 4
};
|
||
/**
 * Returns a human-readable name for a StorageDataType.
 *
 * The result is returned as a non-const value so that callers can move
 * from it instead of copying.
 *
 * @param sdt the storage data type to describe.
 * @return "Float8", "Float32", or "E4M3" for the named enumerators; for any
 *         other value, a message that includes the numeric value.
 */
inline std::string toString(StorageDataType sdt) {
  switch (sdt) {
  case StorageDataType::Float8:
    return "Float8";
  case StorageDataType::Float32:
    return "Float32";
  case StorageDataType::E4M3:
    return "E4M3";
  default:
    // Reached when the value does not match any named enumerator.
    return "Unknown storage data type (value " +
           std::to_string(static_cast<int>(sdt)) + ")";
  }
}
|
||
/**
 * Returns a human-readable name for a SpaceType.
 *
 * The result is returned as a non-const value so that callers can move
 * from it instead of copying.
 *
 * @param space the space type to describe.
 * @return "Euclidean", "Cosine", or "InnerProduct" for the named
 *         enumerators; for any other value, a message that includes the
 *         numeric value.
 */
inline std::string toString(SpaceType space) {
  switch (space) {
  case SpaceType::Euclidean:
    return "Euclidean";
  case SpaceType::Cosine:
    return "Cosine";
  case SpaceType::InnerProduct:
    return "InnerProduct";
  default:
    // Reached when the value does not match any named enumerator.
    return "Unknown space type (value " +
           std::to_string(static_cast<int>(space)) + ")";
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
#pragma once | ||
/*- | ||
* -\-\- | ||
* voyager | ||
* -- | ||
* Copyright (C) 2016 - 2023 Spotify AB | ||
* | ||
* This file is heavily based on hnswlib (https://github.com/nmslib/hnswlib, | ||
* Apache 2.0-licensed, no copyright author listed) | ||
* -- | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* -/-/- | ||
*/ | ||
|
||
#include "Enums.h" | ||
#include "StreamUtils.h" | ||
|
||
namespace voyager { | ||
namespace Metadata { | ||
/** | ||
* @brief A basic metadata class that stores the number of dimensions, | ||
* the SpaceType, StorageDataType, and number of dimensions. | ||
*/ | ||
class V1 { | ||
public: | ||
V1(int numDimensions, SpaceType spaceType, StorageDataType storageDataType) | ||
: numDimensions(numDimensions), spaceType(spaceType), | ||
storageDataType(storageDataType) {} | ||
|
||
V1() {} | ||
virtual ~V1() {} | ||
|
||
int version() const { return 1; } | ||
|
||
int getNumDimensions() { return numDimensions; } | ||
|
||
StorageDataType getStorageDataType() { return storageDataType; } | ||
|
||
SpaceType getSpaceType() { return spaceType; } | ||
|
||
void setNumDimensions(int newNumDimensions) { | ||
numDimensions = newNumDimensions; | ||
} | ||
|
||
void setStorageDataType(StorageDataType newStorageDataType) { | ||
storageDataType = newStorageDataType; | ||
} | ||
|
||
void setSpaceType(SpaceType newSpaceType) { spaceType = newSpaceType; } | ||
|
||
virtual void serializeToStream(std::shared_ptr<OutputStream> stream) { | ||
stream->write("VOYA", 4); | ||
writeBinaryPOD(stream, version()); | ||
writeBinaryPOD(stream, numDimensions); | ||
writeBinaryPOD(stream, spaceType); | ||
writeBinaryPOD(stream, storageDataType); | ||
}; | ||
|
||
virtual void loadFromStream(std::shared_ptr<InputStream> stream) { | ||
// Version has already been loaded before we get here! | ||
readBinaryPOD(stream, numDimensions); | ||
readBinaryPOD(stream, spaceType); | ||
readBinaryPOD(stream, storageDataType); | ||
}; | ||
|
||
private: | ||
int numDimensions; | ||
SpaceType spaceType; | ||
StorageDataType storageDataType; | ||
}; | ||
|
||
static std::unique_ptr<Metadata::V1> | ||
loadFromStream(std::shared_ptr<InputStream> inputStream) { | ||
uint32_t header = inputStream->peek(); | ||
if (header != 'AYOV') { | ||
return nullptr; | ||
} | ||
|
||
// Actually read instead of just peeking: | ||
inputStream->read((char *)&header, sizeof(header)); | ||
|
||
int version; | ||
readBinaryPOD(inputStream, version); | ||
|
||
switch (version) { | ||
case 1: { | ||
std::unique_ptr<Metadata::V1> metadata = std::make_unique<Metadata::V1>(); | ||
metadata->loadFromStream(inputStream); | ||
return metadata; | ||
} | ||
default: { | ||
std::stringstream stream; | ||
stream << std::hex << version; | ||
std::string resultAsHex(stream.str()); | ||
|
||
std::string error = "Unable to parse version of Voyager index file; found " | ||
"unsupported version \"0x" + | ||
resultAsHex + "\"."; | ||
|
||
if (version < 20) { | ||
error += " A newer version of the Voyager library may be able to read " | ||
"this index."; | ||
} else { | ||
error += " This index may be corrupted (or not a Voyager index)."; | ||
} | ||
|
||
throw std::domain_error(error); | ||
} | ||
} | ||
}; | ||
|
||
} // namespace Metadata | ||
}; // namespace voyager |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.