Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDBSCAN and KMeans API improvements for improving CPU interoperability #6181

Merged
merged 9 commits into from
Dec 24, 2024
17 changes: 17 additions & 0 deletions python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ from cuml.common import input_to_cuml_array
from cuml.common.array_descriptor import CumlArrayDescriptor
from cuml.internals.api_decorators import device_interop_preparation
from cuml.internals.api_decorators import enable_device_interop
from cuml.internals.global_settings import GlobalSettings
from cuml.internals.mixins import ClusterMixin
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.internals.import_utils import has_hdbscan
Expand Down Expand Up @@ -560,6 +561,17 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
self.prediction_data_ptr = None
self._cpu_to_gpu_interop_prepped = False

@property
def gen_min_span_tree(self):
    """Whether to generate the minimum spanning tree during fit."""
    return self._gen_min_span_tree

@gen_min_span_tree.setter
def gen_min_span_tree(self, value):
    # Store the user's value unchanged: estimator attributes must
    # round-trip the constructor argument. The original hdbscan
    # package exposes the same gen_min_span_tree parameter (default
    # False), so the accelerator must not silently force it on.
    self._gen_min_span_tree = value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general we should not set attributes to values that are different from the constructor argument.

I tried to work out why doing this increases compatibility, but couldn't — the default of `gen_min_span_tree` in HDBSCAN is also False. What was the thinking behind making this default to on when the accelerator is enabled?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Over-zealousness on my part: while testing locally I wanted the MST to always be available, and forgot that the original hdbscan package has the same `gen_min_span_tree` parameter — so I removed this logic :)


@property
def condensed_tree_(self):

Expand Down Expand Up @@ -782,6 +794,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
self.n_rows = n_rows
self.n_cols = n_cols

if GlobalSettings().accelerator_active:
self._raw_data = self.X_m.to_output("numpy")

cdef uintptr_t _input_ptr = X_m.ptr

IF GPUBUILD == 1:
Expand Down Expand Up @@ -1133,6 +1148,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
self.condensed_tree_._raw_tree
self._cpu_model.single_linkage_tree_ = \
self.single_linkage_tree_._linkage
if hasattr(self, "_raw_data"):
self._cpu_model._raw_data = self._raw_data
if self.gen_min_span_tree:
self._cpu_model.minimum_spanning_tree_ = \
self.minimum_spanning_tree_._mst
Expand Down
6 changes: 5 additions & 1 deletion python/cuml/cuml/cluster/kmeans.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ class KMeans(UniversalBase,
Compute k-means clustering with X.

"""
self._n_features_out = self.n_clusters
if self.init == 'preset':
check_cols = self.n_features_in_
check_dtype = self.dtype
Expand All @@ -302,6 +303,8 @@ class KMeans(UniversalBase,
else None),
check_dtype=check_dtype)

self.feature_names_in_ = _X_m.index

IF GPUBUILD == 1:

cdef uintptr_t input_ptr = _X_m.ptr
Expand Down Expand Up @@ -704,4 +707,5 @@ class KMeans(UniversalBase,

def get_attr_names(self):
    # Attribute names synchronized between the GPU estimator and its
    # CPU (scikit-learn) counterpart during device interop.
    names = ['cluster_centers_', 'labels_', 'inertia_', 'n_iter_']
    names += ['n_features_in_', '_n_threads']
    names += ["feature_names_in_", "_n_features_out"]
    return names
5 changes: 4 additions & 1 deletion python/cuml/cuml/manifold/umap.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -577,11 +577,13 @@ class UMAP(UniversalBase,
convert_format=False)
self.n_rows, self.n_dims = self._raw_data.shape
self.sparse_fit = True
self._sparse_data = True
if self.build_algo == "nn_descent":
raise ValueError("NN Descent does not support sparse inputs")

# Handle dense inputs
else:
self._sparse_data = False
if data_on_host:
convert_to_mem_type = MemoryType.host
else:
Expand Down Expand Up @@ -908,6 +910,7 @@ class UMAP(UniversalBase,
self.metric_kwds, False, self.random_state)

super().gpu_to_cpu()
self._cpu_model._validate_parameters()

@classmethod
def _get_param_names(cls):
Expand Down Expand Up @@ -943,4 +946,4 @@ class UMAP(UniversalBase,
return ['_raw_data', 'embedding_', '_input_hash', '_small_data',
'_knn_dists', '_knn_indices', '_knn_search_index',
'_disconnection_distance', '_n_neighbors', '_a', '_b',
'_initial_alpha']
'_initial_alpha', '_sparse_data']
Loading