Permutation LSH (Amato et al.) incl. internal support for repeated hashes and improved angular benchmarks (#117)

- Added Permutation LSH model and query, based on the paper Large Scale Image Retrieval with Elasticsearch by Amato et al. (a rough sketch of the hashing idea follows this list).
- Support for models with repeated hashes; currently only used by Permutation LSH.
- Picked better hyperparameters for the continuous angular-similarity benchmarks. Increasing k seems to help.
- Gradle setting to fail compilation on Scala warnings.
- Test support for Lucene indexing/queries without Elasticsearch.
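
For context, the permutation LSH idea from Amato et al. represents a vector by the indices of its k largest-magnitude components; when repetition is enabled, higher-ranked indices are emitted more often so that rank carries weight. Below is a minimal Scala sketch of that idea only — it is illustrative, not the library's actual implementation, and the function name and output shape are made up for this example.

// Illustrative sketch: keep the indices of the k largest-magnitude components;
// if `repeating`, emit the top-ranked index k times, the next k - 1 times, and so on.
def permutationHashes(values: Array[Float], k: Int, repeating: Boolean): Seq[(Int, Int)] = {
  val topK: Seq[Int] = values.zipWithIndex
    .sortBy { case (v, _) => -math.abs(v) } // rank components by absolute value
    .take(k)
    .map { case (_, i) => i }
  topK.zipWithIndex.map { case (index, rank) =>
    // (hash, frequency): frequency encodes the rank when repeating, else 1.
    (index, if (repeating) k - rank else 1)
  }
}

// Example: permutationHashes(Array(0.1f, -0.9f, 0.3f, 0.0f, 0.5f), k = 3, repeating = true)
// returns Seq((1, 3), (4, 2), (2, 1)): component 1 has the largest magnitude, so it gets frequency 3.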
alexklibisz authored Jul 29, 2020
1 parent 11d6fca commit 72db321
Showing 33 changed files with 628 additions and 155 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -2,3 +2,5 @@
**/*.ipynb linguist-detectable=false
.minio/elastiknn-benchmarks/data/processed/annbglove100/train.json.gz filter=lfs diff=lfs merge=lfs -text
.minio/elastiknn-benchmarks/data/processed/annbglove100/test.json.gz filter=lfs diff=lfs merge=lfs -text
.minio/elastiknn-benchmarks/data/processed/annbglove25/test.json.gz filter=lfs diff=lfs merge=lfs -text
.minio/elastiknn-benchmarks/data/processed/annbglove25/train.json.gz filter=lfs diff=lfs merge=lfs -text
4 changes: 3 additions & 1 deletion .gitignore
@@ -9,6 +9,7 @@ release.md
# Git LFS
.minio
!.minio/elastiknn-benchmarks/data/processed/annbglove100/*.json.gz
!.minio/elastiknn-benchmarks/data/processed/annbglove25/*.json.gz

# Ignore Gradle GUI config
gradle-app.setting
@@ -20,7 +21,8 @@ gradle-app.setting
.gradletasknamecache

# IntelliJ configs
.idea
.idea/
!.idea/runConfigurations/Debug_Elasticsearch.xml

# Misc log files
*.log
15 changes: 15 additions & 0 deletions .idea/runConfigurations/Debug_Elasticsearch.xml

8 changes: 8 additions & 0 deletions benchmarks/python/preprocess.py
@@ -256,6 +256,14 @@ def main(argv: List[str]) -> int:
s3_bucket,
s3_prefix
)
elif dataset_name == "annbglove25":
annb(
benchmarks_bucket,
"data/raw/annb/glove-25-angular.hdf5",
local_data_dir,
s3_bucket,
s3_prefix
)
elif dataset_name == "annbglove100":
annb(
benchmarks_bucket,
@@ -30,7 +30,8 @@ object ContinuousBenchmark extends App {
randomDenseFloats,
Mapping.DenseFloat(randomDenseFloats.dims),
NearestNeighborsQuery.Exact(field, Similarity.Angular),
Mapping.AngularLsh(randomDenseFloats.dims, 400, 1),
// Angular LSH seems to benefit a lot from increasing k.
Mapping.AngularLsh(randomDenseFloats.dims, 250, 3),
Seq(
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k)
)
@@ -47,15 +48,39 @@
),
// Angular exact, LSH on Glove100 dataset. Still experimental, excluded by default.
Experiment(
Dataset.AnnbGlove100,
Mapping.DenseFloat(Dataset.AnnbGlove100.dims),
Dataset.AnnbGlove25,
Mapping.DenseFloat(Dataset.AnnbGlove25.dims),
NearestNeighborsQuery.Exact(field, Similarity.Angular),
Mapping.AngularLsh(Dataset.AnnbGlove100.dims, 100, 1),
Mapping.AngularLsh(Dataset.AnnbGlove25.dims, 250, 3),
Seq(
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k)
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k),
Query(NearestNeighborsQuery.AngularLsh(field, 2000), k),
Query(NearestNeighborsQuery.AngularLsh(field, 4000), k)
)
),
Experiment(
Dataset.AnnbGlove25,
Mapping.DenseFloat(Dataset.AnnbGlove25.dims),
NearestNeighborsQuery.Exact(field, Similarity.Angular),
Mapping.PermutationLsh(Dataset.AnnbGlove25.dims, 15, repeating = false),
Seq(
Query(NearestNeighborsQuery.PermutationLsh(field, Similarity.Angular, 1000), k),
Query(NearestNeighborsQuery.PermutationLsh(field, Similarity.Angular, 2000), k),
Query(NearestNeighborsQuery.PermutationLsh(field, Similarity.Angular, 4000), k)
)
),
Experiment(
Dataset.AnnbGlove25,
Mapping.DenseFloat(Dataset.AnnbGlove25.dims),
NearestNeighborsQuery.Exact(field, Similarity.Angular),
Mapping.PermutationLsh(Dataset.AnnbGlove25.dims, 15, repeating = true),
Seq(
Query(NearestNeighborsQuery.PermutationLsh(field, Similarity.Angular, 1000), k),
Query(NearestNeighborsQuery.PermutationLsh(field, Similarity.Angular, 2000), k),
Query(NearestNeighborsQuery.PermutationLsh(field, Similarity.Angular, 4000), k)
)
)
).take(3)
)

override def run(args: List[String]): URIO[Console, ExitCode] = {
val s3Client = S3Utils.minioClient()
@@ -28,6 +28,7 @@ package object benchmarks {
case object AnnbDeep1b extends Dataset(96)
case object AnnbFashionMnist extends Dataset(784)
case object AnnbGist extends Dataset(960)
case object AnnbGlove25 extends Dataset(25)
case object AnnbGlove100 extends Dataset(100)
case object AnnbKosarak extends Dataset(27983)
case object AnnbMnist extends Dataset(784)
@@ -98,6 +99,7 @@ package object benchmarks {
case _: DenseFloat => "Exact"
case _: SparseIndexed => "Sparse indexed"
case _: JaccardLsh | _: HammingLsh | _: AngularLsh | _: L2Lsh => "LSH"
case _: PermutationLsh => "Permutation LSH"
}

def apply(benchmarkResult: BenchmarkResult): AggregateResult = {
4 changes: 4 additions & 0 deletions build.gradle
@@ -22,6 +22,10 @@ allprojects {
ext."sonatypeUrl" = System.getenv().getOrDefault("SONATYPE_URL", "https://oss.sonatype.org/content/repositories/snapshots/")
ext."sonatypeUsername" = project.hasProperty("sonatypeUsername") ? project.getProperty("sonatypeUsername") : ""
ext."sonatypePassword" = project.hasProperty("sonatypePassword") ? project.getProperty("sonatypePassword") : ""

tasks.withType(ScalaCompile) {
scalaCompileOptions.setAdditionalParameters(List.of("-Xfatal-warnings"))
}
}

task unifiedScaladocs(type: ScalaDoc, description: 'Generate unified scaladocs', group: 'Documentation') {
3 changes: 3 additions & 0 deletions changelog.md
@@ -1,3 +1,6 @@
- Added Permutation LSH model and query, based on the paper _Large Scale Image Retrieval with Elasticsearch_ by Amato et al.
- Several internal improvements, including support for LSH models with repeated hashes.
---
- Performance improvements for LSH queries. 1.5-2x faster on regular benchmarks with randomized data. See PR #114.
---
- Fixed error with KNN queries against vectors that are stored in nested fields, e.g. `outer.inner.vec`.
2 changes: 2 additions & 0 deletions core/build.gradle
@@ -23,6 +23,8 @@ dependencies {
compile "io.circe:circe-generic_${scalaVersion}:${circeVersion}"
compile "io.circe:circe-parser_${scalaVersion}:${circeVersion}"
compile "io.circe:circe-generic-extras_${scalaVersion}:${circeVersion}"
implementation "com.google.guava:guava:28.1-jre"
runtime "com.google.guava:guava:28.1-jre"
}

task sourceJar(type: Jar) {
52 changes: 52 additions & 0 deletions core/src/main/java/com/klibisz/elastiknn/models/HashAndFreq.java
@@ -0,0 +1,52 @@
package com.klibisz.elastiknn.models;

import java.util.Arrays;
import java.util.Objects;

/**
* As the name suggests, represents a hash value and the number of times it occurs in some context.
* This enables LSH algorithms where the repetition of a hash has some significance.
*/
public class HashAndFreq implements Comparable<HashAndFreq> {
private final byte[] hash;
private final int freq;

public static HashAndFreq once(byte[] hash) {
return new HashAndFreq(hash, 1);
}

public HashAndFreq(byte[] hash, int freq) {
this.hash = hash;
this.freq = freq;
}

public byte[] getHash() {
return hash;
}

public int getFreq() {
return freq;
}

@Override
public int compareTo(HashAndFreq o) {
byte[] ohash = o.getHash();
return Arrays.compareUnsigned(hash, 0, hash.length, ohash, 0, ohash.length);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
HashAndFreq that = (HashAndFreq) o;
return freq == that.freq &&
Arrays.equals(hash, that.hash);
}

@Override
public int hashCode() {
int result = Objects.hash(freq);
result = 31 * result + Arrays.hashCode(hash);
return result;
}
}
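
A short Scala usage sketch of HashAndFreq (the byte values here are arbitrary): compareTo orders instances by their hash bytes only, using unsigned comparison, while equals and hashCode also take the frequency into account.

import com.klibisz.elastiknn.models.HashAndFreq

val a = HashAndFreq.once(Array[Byte](1, 2, 3))   // freq = 1
val b = new HashAndFreq(Array[Byte](1, 2, 3), 2) // same hash bytes, freq = 2
val c = HashAndFreq.once(Array[Byte](0, 9))

println(a.compareTo(b) == 0) // true: ordering ignores the frequency
println(a == b)              // false: equality also requires matching frequencies
println(a.compareTo(c) > 0)  // true: [1, 2, 3] sorts after [0, 9] byte-wise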
@@ -26,14 +26,12 @@ private object Keys {
val EXACT = "exact"
val FIELD = "field"
val HAMMING = "hamming"
val HAMMING_INDEXED = "hamming_indexed"
val INDEX = "index"
val JACCARD = "jaccard"
val JACCARD_INDEXED = "jaccard_indexed"
val JACCARD_LSH = "jaccard_lsh"
val L1 = "l1"
val L2 = "l2"
val LSH = "lsh"
val PERMUTATION_LSH = "permutation_lsh"
val MODEL = "model"
val QUERY_OPTIONS = "query_options"
val SIMILARITY = "similarity"
@@ -42,6 +40,9 @@
val VEC = "vec"
}

/**
* If you think this is a lot of boilerplate you should see the Json parsing in Elasticsearch.
*/
object ElasticsearchCodec { esc =>

import Keys._
@@ -59,7 +60,6 @@ object ElasticsearchCodec { esc =>
private def fail[T](msg: String): Either[DecodingFailure, T] = Left(DecodingFailure(msg, List.empty))
private def failTypes[T](field: String, good: Seq[String], bad: String): Either[DecodingFailure, T] =
fail(s"Expected field $field to be one of (${good.mkString(", ")}) but got $bad")
private def failTypes[T](good: Seq[String], bad: String): Either[DecodingFailure, T] = failTypes(TYPE, good, bad)

private implicit def jsonObjToJson(jo: JsonObject): Json = Json.fromJsonObject(jo)
private implicit def intToJson(i: Int): Json = Json.fromInt(i)
@@ -138,6 +138,7 @@ object ElasticsearchCodec { esc =>
implicit val mappingHammingLsh: ESC[Mapping.HammingLsh] = ElasticsearchCodec(deriveCodec)
implicit val mappingAngularLsh: ESC[Mapping.AngularLsh] = ElasticsearchCodec(deriveCodec)
implicit val mappingL2Lsh: ESC[Mapping.L2Lsh] = ElasticsearchCodec(deriveCodec)
implicit val mappingPermutationLsh: ESC[Mapping.PermutationLsh] = ElasticsearchCodec(deriveCodec)

implicit val mapping: ESC[Mapping] = new ESC[Mapping] {
override def apply(t: Mapping): Json = t match {
@@ -153,6 +154,8 @@
JsonObject(TYPE -> EKNN_DENSE_FLOAT_VECTOR, ELASTIKNN_NAME -> (esc.encode(m) ++ JsonObject(MODEL -> LSH, SIMILARITY -> ANGULAR)))
case m: Mapping.L2Lsh =>
JsonObject(TYPE -> EKNN_DENSE_FLOAT_VECTOR, ELASTIKNN_NAME -> (esc.encode(m) ++ JsonObject(MODEL -> LSH, SIMILARITY -> L2)))
case m: Mapping.PermutationLsh =>
JsonObject(TYPE -> EKNN_DENSE_FLOAT_VECTOR, ELASTIKNN_NAME -> (esc.encode(m) ++ JsonObject(MODEL -> PERMUTATION_LSH)))
}

override def apply(c: HCursor): Either[DecodingFailure, Mapping] =
@@ -176,6 +179,7 @@
esc.decode[Mapping.AngularLsh](c)
case (EKNN_DENSE_FLOAT_VECTOR, Some(LSH), Some(Similarity.L2)) =>
esc.decode[Mapping.L2Lsh](c)
case (EKNN_DENSE_FLOAT_VECTOR, Some(PERMUTATION_LSH), None) => esc.decode[Mapping.PermutationLsh](c)
case _ =>
val msg = s"Incompatible $TYPE [$typ], $MODEL [$modelOpt], $SIMILARITY [${simOpt.map(esc.encode(_).noSpaces)}]"
fail[Mapping](msg)
@@ -189,26 +193,29 @@
implicit val hammingLshNNQ: ESC[NearestNeighborsQuery.HammingLsh] = ElasticsearchCodec(deriveCodec)
implicit val angularLshNNQ: ESC[NearestNeighborsQuery.AngularLsh] = ElasticsearchCodec(deriveCodec)
implicit val l2LshNNQ: ESC[NearestNeighborsQuery.L2Lsh] = ElasticsearchCodec(deriveCodec)
implicit val queryPermutationLsh: ESC[NearestNeighborsQuery.PermutationLsh] = ElasticsearchCodec(deriveCodec)

implicit val nearestNeighborsQuery: ESC[NearestNeighborsQuery] = new ESC[NearestNeighborsQuery] {
override def apply(a: NearestNeighborsQuery): Json = {
val default = JsonObject(FIELD -> a.field, VEC -> esc.encode(a.vec), SIMILARITY -> esc.encode(a.similarity))
a match {
case q: NearestNeighborsQuery.Exact => JsonObject(MODEL -> EXACT) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.SparseIndexed => JsonObject(MODEL -> SPARSE_INDEXED) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.JaccardLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.HammingLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.AngularLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.L2Lsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.Exact => JsonObject(MODEL -> EXACT) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.SparseIndexed => JsonObject(MODEL -> SPARSE_INDEXED) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.JaccardLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.HammingLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.AngularLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.L2Lsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.PermutationLsh => JsonObject(MODEL -> PERMUTATION_LSH) ++ (default ++ esc.encode(q))
}
}
override def apply(c: HCursor): Result[NearestNeighborsQuery] =
for {
model <- c.downField(MODEL).as[String]
sim <- c.downField(SIMILARITY).as[Json].flatMap(esc.decodeJson[Similarity])
nnq <- model match {
case EXACT => esc.decode[NearestNeighborsQuery.Exact](c)
case SPARSE_INDEXED => esc.decode[NearestNeighborsQuery.SparseIndexed](c)
case EXACT => esc.decode[NearestNeighborsQuery.Exact](c)
case SPARSE_INDEXED => esc.decode[NearestNeighborsQuery.SparseIndexed](c)
case PERMUTATION_LSH => esc.decode[NearestNeighborsQuery.PermutationLsh](c)
case LSH =>
sim match {
case Similarity.Jaccard => esc.decode[NearestNeighborsQuery.JaccardLsh](c)
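
To make the new codec branches concrete, the sketch below constructs a permutation LSH query and shows roughly the JSON it should encode to. The model and similarity values follow the Keys constants above; the exact vector encoding in the comment is an assumption for illustration, not the codec's verbatim output.

import com.klibisz.elastiknn.api._

val q = NearestNeighborsQuery.PermutationLsh(
  field = "vec",
  similarity = Similarity.Angular,
  candidates = 1000,
  vec = Vec.DenseFloat(0.1f, 0.2f, 0.3f) // assumes the varargs constructor added in this PR
)

// Encoded with ElasticsearchCodec, this should produce JSON roughly like:
// {
//   "field": "vec",
//   "model": "permutation_lsh",
//   "similarity": "angular",
//   "candidates": 1000,
//   "vec": { "values": [0.1, 0.2, 0.3] }   // assumed dense-float encoding
// }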
18 changes: 13 additions & 5 deletions core/src/main/scala/com/klibisz/elastiknn/api/package.scala
@@ -60,6 +60,8 @@ package object api {
}

object DenseFloat {
def apply(values: Float*): DenseFloat = DenseFloat(values.toArray)

def random(length: Int, unit: Boolean = false)(implicit rng: Random): DenseFloat = {
val v = DenseFloat((0 until length).toArray.map(_ => rng.nextGaussian.toFloat))
if (unit) {
@@ -89,6 +91,7 @@ package object api {
final case class DenseFloat(dims: Int) extends Mapping
final case class AngularLsh(dims: Int, L: Int, k: Int) extends Mapping
final case class L2Lsh(dims: Int, L: Int, k: Int, r: Int) extends Mapping
final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping
}

sealed trait NearestNeighborsQuery {
@@ -106,29 +109,34 @@
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
}

sealed trait LshQuery extends NearestNeighborsQuery {
sealed trait ApproximateQuery extends NearestNeighborsQuery {
def candidates: Int
}

final case class JaccardLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
final case class JaccardLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Jaccard
}

final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Hamming
}

final case class AngularLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
final case class AngularLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Angular
}

final case class L2Lsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
final case class L2Lsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.L2
}

final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty())
extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
}

}
}
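
As a rough usage sketch of the new API types (parameter values chosen only for illustration): the mapping's k and repeating control how many hashes are produced and whether rank is encoded by repetition, and unlike the other LSH queries, PermutationLsh carries an explicit similarity.

import com.klibisz.elastiknn.api._

// Index-time mapping for 25-dim vectors; k and repeating as described in the PR notes.
val mapping = Mapping.PermutationLsh(dims = 25, k = 15, repeating = true)

// Query-time template; the query vector is supplied later via withVec, as with the
// other ApproximateQuery types.
val template = NearestNeighborsQuery.PermutationLsh(
  field = "vec",
  similarity = Similarity.Angular,
  candidates = 1000
)
val query = template.withVec(Vec.DenseFloat(0.1f, 0.2f, 0.3f)) // assumes the varargs Vec constructor added above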