Skip to content

Commit

Permalink
revise per comments
Browse files Browse the repository at this point in the history
  • Loading branch information
lijinf2 committed Oct 16, 2024
1 parent d11cffa commit d37c32d
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 25 deletions.
35 changes: 18 additions & 17 deletions python/src/spark_rapids_ml/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,12 @@ class ApproximateNearestNeighbors(
k: int (default = 5)
the default number of approximate nearest neighbors to retrieve for each query.
If fewer than k neighbors are found for a query (for example, due to a small nprobe value):
(1) In ivfflat and ivfpq:
(a) If no item vector is probed, the indices are filled with long_max (9,223,372,036,854,775,807) and distances are set to infinity.
(b) If at least one item vector is probed, the indices are filled with the top-1 neighbor's ID, and distances are filled with infinity.
(2) cagra does not have this problem, as at least itopk_size (where itopk_size ≥ k) items are always probed.
algorithm: str (default = 'ivfflat')
the algorithm parameter to be passed into cuML. It currently must be 'ivfflat', 'ivfpq' or 'cagra'. Other algorithms are expected to be supported later.
Expand Down Expand Up @@ -1432,24 +1438,19 @@ def _get_cuml_transform_func(
"cosine",
}

if cuml_alg_params["algorithm"] != "ivfpq":
check_fn = (
self._cal_cagra_params_and_check
if cuml_alg_params["algorithm"] == "cagra"
else self._cal_cuvs_ivf_flat_params_and_check
)
index_params, search_params = check_fn(
algoParams=self.cuml_params["algo_params"],
metric=self.cuml_params["metric"],
topk=cuml_alg_params["n_neighbors"],
)
if cuml_alg_params["algorithm"] == "cagra":
check_fn = self._cal_cagra_params_and_check
elif cuml_alg_params["algorithm"] in {"ivf_flat", "ivfflat"}:
check_fn = self._cal_cuvs_ivf_flat_params_and_check
else:
assert cuml_alg_params["algorithm"] in {"ivfpq", "ivf_pq"}
index_params, search_params = self._cal_cuvs_ivf_pq_params_and_check(
algoParams=self.cuml_params["algo_params"],
metric=self.cuml_params["metric"],
topk=cuml_alg_params["n_neighbors"],
)
assert cuml_alg_params["algorithm"] in {"ivf_pq", "ivfpq"}
check_fn = self._cal_cuvs_ivf_pq_params_and_check

index_params, search_params = check_fn(
algoParams=self.cuml_params["algo_params"],
metric=self.cuml_params["metric"],
topk=cuml_alg_params["n_neighbors"],
)

def _construct_sgnn() -> CumlT:

Expand Down
5 changes: 1 addition & 4 deletions python/tests/test_approximate_nearest_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,10 +831,7 @@ def test_return_fewer_k(
) -> None:
"""
This tests the corner case where fewer than k neighbors are found due to nprobe being too small.
(1) In ivf_flat and ivf_pq:
(a) if no nn is probed, indices are filled with long_max and distances are filled infs.
(b) if at least one nn is probed, indices are filled with the top-1 nn id and distances are filled with infs.
(2) cagra does not have this problem because at least itopk_size (>= k) items are probed.
More details can be found at the docstring of class ApproximateNearestNeighbors.
"""
metric = "euclidean"
gpu_number = 1
Expand Down
9 changes: 5 additions & 4 deletions python/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from typing import Any, Dict, Iterator, List, Optional, Tuple, TypeVar, Union

import numpy as np
import pandas as pd
import pyspark
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
Expand Down Expand Up @@ -99,11 +100,11 @@ def create_pyspark_dataframe(

label_col = "label_col"
schema.append(f"{label_col} {label_pyspark_type}")
data_pytype = [
ra + rb for ra, rb in zip(data.tolist(), label.reshape(m, 1).tolist())
]

pdf = pd.DataFrame(data, dtype=dtype, columns=feature_cols)
pdf[label_col] = label.astype(label_dtype)
df = spark.createDataFrame(
data_pytype,
pdf,
",".join(schema),
)
else:
Expand Down

0 comments on commit d37c32d

Please sign in to comment.