diff --git a/ann_benchmarks/algorithms/deeplake/Dockerfile b/ann_benchmarks/algorithms/deeplake/Dockerfile
new file mode 100644
index 000000000..224022dee
--- /dev/null
+++ b/ann_benchmarks/algorithms/deeplake/Dockerfile
@@ -0,0 +1,13 @@
+FROM ann-benchmarks
+ENV BUGGER_OFF=true
+# Supply Activeloop credentials at build time instead of committing them:
+#   docker build --build-arg ACTIVELOOP_TOKEN=YOUR_TOKEN ...
+# The value is still stored in the resulting image, so keep that image private.
+ARG ACTIVELOOP_TOKEN
+ARG ACTIVELOOP_ORG=notify
+ENV ACTIVELOOP_TOKEN=${ACTIVELOOP_TOKEN}
+ENV ACTIVELOOP_ORG=${ACTIVELOOP_ORG}
+
+RUN apt-get install -y python-setuptools python-pip
+RUN pip3 install deeplake
+RUN python3 -c 'import deeplake'
diff --git a/ann_benchmarks/algorithms/deeplake/config.yml b/ann_benchmarks/algorithms/deeplake/config.yml
new file mode 100644
index 000000000..6ccf2eea1
--- /dev/null
+++ b/ann_benchmarks/algorithms/deeplake/config.yml
@@ -0,0 +1,45 @@
+float:
+  any:
+  - base_args: ['@metric']
+    constructor: DeeplakeHnsw
+    disabled: false
+    docker_tag: ann-benchmarks-deeplake
+    module: ann_benchmarks.algorithms.deeplake
+    name: deeplake
+    run_groups:
+      M-12:
+        arg_groups: [{M: 12, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-16:
+        arg_groups: [{M: 16, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-24:
+        arg_groups: [{M: 24, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-36:
+        arg_groups: [{M: 36, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-4:
+        arg_groups: [{M: 4, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-48:
+        arg_groups: [{M: 48, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-64:
+        arg_groups: [{M: 64, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-8:
+        arg_groups: [{M: 8, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+      M-96:
+        arg_groups: [{M: 96, efConstruction: 500}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
diff --git a/ann_benchmarks/algorithms/deeplake/module.py b/ann_benchmarks/algorithms/deeplake/module.py
new file mode 100644
index 000000000..6a8bb0039
--- /dev/null
+++ b/ann_benchmarks/algorithms/deeplake/module.py
@@ -0,0 +1,92 @@
+import deeplake
+import numpy as np
+import os
+import random
+import string
+from ..base.module import BaseANN
+
+
+# Class using the Deeplake implementation of an HNSW index for nearest neighbor
+# search over data points in a high dimensional vector space.
+
+class DeeplakeHnsw(BaseANN):
+    """ANN-benchmarks wrapper around a Deeplake HNSW vector index.
+
+    Vectors are stored in a Deeplake dataset at a randomly suffixed path and
+    indexed through the ``embedding`` tensor's ``hnsw_1`` index.
+    """
+
+    # ann-benchmarks metric name -> distance name passed to create_vdb_index.
+    _METRICS = {"angular": "cosine_similarity", "euclidean": "l2_norm"}
+
+    def __init__(self, metric, param, enable_normalize=True, dimension=None):
+        """Validate the metric and record the index build parameters.
+
+        Args:
+            metric: "angular" or "euclidean".
+            param: dict of build parameters; "efConstruction" (default 200)
+                and "M" (default 16) are read from it.
+            enable_normalize: accepted for interface compatibility; unused.
+            dimension: stored on the instance; not otherwise used here.
+
+        Raises:
+            NotImplementedError: if ``metric`` is not supported.
+        """
+        if metric not in self._METRICS:
+            raise NotImplementedError(f"Deeplake doesn't support metric {metric}")
+        self.metric = self._METRICS[metric]
+        self.param = param
+        self._ef_construction = param.get("efConstruction", 200)
+        self._m = param.get("M", 16)
+        self.dimension = dimension
+        suffix = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(8))
+        self.local_path = f"ANN_benchmarks-embeddings_{suffix}"
+        self.name = "deeplake"
+        # An unset or empty variable becomes None so Deeplake never receives "".
+        self.token = os.environ.get('ACTIVELOOP_TOKEN') or None
+        self.org = os.environ.get('ACTIVELOOP_ORG') or None
+
+    def __del__(self):
+        self.freeIndex()
+
+    def fit(self, X):
+        """Store ``X`` in a fresh dataset, build the HNSW index and load it."""
+        self.ds = deeplake.dataset(self.local_path, overwrite=True, token=self.token, org_id=self.org)
+        self.ds.create_tensor("embedding", htype="embedding", dtype="float32", create_shape_tensor=False, create_id_tensor=False)
+        self.ds.embedding.extend(X)
+        self.ds.embedding.create_vdb_index("hnsw_1", distance=self.metric, additional_params={
+            "efConstruction": self._ef_construction, "M": self._m
+        })
+        self.index = self.ds.embedding.load_vdb_index("hnsw_1")
+
+    def set_query_arguments(self, ef):
+        """Set the ``ef`` search parameter on the loaded index."""
+        self.index.set_search_params(ef=ef)
+
+    def query(self, v, n):
+        """Run a k-NN search for ``v`` and return the ``indices`` of the result."""
+        v_float = np.asarray(v, dtype=np.float32)
+        view = self.index.search_knn(v_float, n)
+        return view.indices
+
+    def __str__(self):
+        return f"Deeplake(m={self._m}, ef_construction={self._ef_construction})"
+
+    def freeIndex(self):
+        """Release the index and dataset and remove their on-disk data.
+
+        Safe to call more than once and on a partially constructed instance,
+        because ``__del__`` runs even when ``__init__`` or ``fit`` raised.
+        """
+        local_path = getattr(self, "local_path", None)
+        if local_path is None:
+            # __init__ raised before any state was created; nothing to clean.
+            return
+        if hasattr(self, 'index'):
+            del self.index
+        if hasattr(self, 'ds'):
+            del self.ds
+            # Only delete a dataset this instance actually created.
+            deeplake.delete(local_path)
+        if os.path.isfile(f"/tmp/{local_path}/embedding"):
+            os.remove(f"/tmp/{local_path}/embedding")