Skip to content

Commit

Permalink
revise per comments
Browse files Browse the repository at this point in the history
  • Loading branch information
lijinf2 committed Oct 16, 2024
1 parent d11cffa commit d37c32d
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 25 deletions.
35 changes: 18 additions & 17 deletions python/src/spark_rapids_ml/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,12 @@ class ApproximateNearestNeighbors(
k: int (default = 5)
the default number of approximate nearest neighbors to retrieve for each query.
If fewer than k neighbors are found for a query (for example, due to a small nprobe value):
(1) In ivfflat and ivfpq:
(a) If no item vector is probed, the indices are filled with long_max (9,223,372,036,854,775,807) and distances are set to infinity.
(b) If at least one item vector is probed, the indices are filled with the top-1 neighbor's ID, and distances are filled with infinity.
(2) cagra does not have this problem, as at least itopk_size (where itopk_size ≥ k) items are always probed.
algorithm: str (default = 'ivfflat')
the algorithm parameter to be passed into cuML. It currently must be 'ivfflat', 'ivfpq' or 'cagra'. Other algorithms are expected to be supported later.
Expand Down Expand Up @@ -1432,24 +1438,19 @@ def _get_cuml_transform_func(
"cosine",
}

if cuml_alg_params["algorithm"] != "ivfpq":
check_fn = (
self._cal_cagra_params_and_check
if cuml_alg_params["algorithm"] == "cagra"
else self._cal_cuvs_ivf_flat_params_and_check
)
index_params, search_params = check_fn(
algoParams=self.cuml_params["algo_params"],
metric=self.cuml_params["metric"],
topk=cuml_alg_params["n_neighbors"],
)
if cuml_alg_params["algorithm"] == "cagra":
check_fn = self._cal_cagra_params_and_check
elif cuml_alg_params["algorithm"] in {"ivf_flat", "ivfflat"}:
check_fn = self._cal_cuvs_ivf_flat_params_and_check
else:
assert cuml_alg_params["algorithm"] in {"ivfpq", "ivf_pq"}
index_params, search_params = self._cal_cuvs_ivf_pq_params_and_check(
algoParams=self.cuml_params["algo_params"],
metric=self.cuml_params["metric"],
topk=cuml_alg_params["n_neighbors"],
)
assert cuml_alg_params["algorithm"] in {"ivf_pq", "ivfpq"}
check_fn = self._cal_cuvs_ivf_pq_params_and_check

index_params, search_params = check_fn(
algoParams=self.cuml_params["algo_params"],
metric=self.cuml_params["metric"],
topk=cuml_alg_params["n_neighbors"],
)

def _construct_sgnn() -> CumlT:

Expand Down
5 changes: 1 addition & 4 deletions python/tests/test_approximate_nearest_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,10 +831,7 @@ def test_return_fewer_k(
) -> None:
"""
This tests the corner case where fewer than k neighbors are found due to nprobe being too small.
(1) In ivf_flat and ivf_pq:
(a) if no nn is probed, indices are filled with long_max and distances are filled infs.
(b) if at least one nn is probed, indices are filled with the top-1 nn id and distances are filled with infs.
(2) cagra does not have this problem because at least itopk_size (>= k) items are probed.
More details can be found at the docstring of class ApproximateNearestNeighbors.
"""
metric = "euclidean"
gpu_number = 1
Expand Down
9 changes: 5 additions & 4 deletions python/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from typing import Any, Dict, Iterator, List, Optional, Tuple, TypeVar, Union

import numpy as np
import pandas as pd
import pyspark
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
Expand Down Expand Up @@ -99,11 +100,11 @@ def create_pyspark_dataframe(

label_col = "label_col"
schema.append(f"{label_col} {label_pyspark_type}")
data_pytype = [
ra + rb for ra, rb in zip(data.tolist(), label.reshape(m, 1).tolist())
]

pdf = pd.DataFrame(data, dtype=dtype, columns=feature_cols)
pdf[label_col] = label.astype(label_dtype)
df = spark.createDataFrame(
data_pytype,
pdf,
",".join(schema),
)
else:
Expand Down

0 comments on commit d37c32d

Please sign in to comment.