diff --git a/.ci/310.yml b/.ci/310.yml
index dbb167e2..11b89ce0 100644
--- a/.ci/310.yml
+++ b/.ci/310.yml
@@ -36,7 +36,6 @@ dependencies:
   - python-wget
   - contextily
-  - scikit-plot
   - python-graphviz
   - nbsphinx
   - numpydoc
diff --git a/.ci/311.yml b/.ci/311.yml
index 2641e14d..0444737a 100644
--- a/.ci/311.yml
+++ b/.ci/311.yml
@@ -35,5 +35,4 @@ dependencies:
   - coverage
   - python-wget
   - contextily
-  - scikit-plot
   - python-graphviz
diff --git a/.ci/312.yml b/.ci/312.yml
index 0b005228..2ad55b76 100644
--- a/.ci/312.yml
+++ b/.ci/312.yml
@@ -36,7 +36,6 @@ dependencies:
   - python-wget
   - contextily
-  - scikit-plot
   - python-graphviz
   - sphinx>=1.4.3
   - sphinxcontrib-bibtex==1
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index ae06b7d5..820146da 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -40,7 +40,7 @@ jobs:
           micromamba-version: 'latest'

       - name: Install geosnap
-        run: pip install . ;python geosnap/tests/_dl_data.py;
+        run: pip install . --no-deps ;python geosnap/tests/_dl_data.py;
         env:
           COMBO_DATA: ${{ secrets.COMBO_DATA }}
diff --git a/environment.yml b/environment.yml
index 879b7deb..ad3ac0f8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -18,7 +18,6 @@ dependencies:
   - xlrd
   - tobler >=0.8.2
   - contextily
-  - scikit-plot
   - mapclassify
   - spopt >=0.3.0
   - s3fs
diff --git a/geosnap/analyze/_cluster_wrappers.py b/geosnap/analyze/_cluster_wrappers.py
index 0e4c7c1b..eda6faf8 100644
--- a/geosnap/analyze/_cluster_wrappers.py
+++ b/geosnap/analyze/_cluster_wrappers.py
@@ -57,7 +57,7 @@ def kmeans(
     verbose=0,
     random_state=None,
     copy_x=True,
-    algorithm="auto",
+    algorithm="lloyd",
     **kwargs,
 ):
     """K-Means clustering.
diff --git a/geosnap/analyze/_model_results.py b/geosnap/analyze/_model_results.py
index 357f5ab3..6866285e 100644
--- a/geosnap/analyze/_model_results.py
+++ b/geosnap/analyze/_model_results.py
@@ -14,7 +14,6 @@

 import esda
 import geopandas as gpd
-import scikitplot as skplt
 from sklearn.metrics import (
     calinski_harabasz_score,
     davies_bouldin_score,
@@ -22,6 +21,7 @@
 )

 from ..visualize.mapping import plot_timeseries
+from ..visualize.skplt import plot_silhouette as _plot_silhouette
 from .dynamics import predict_markov_labels as _predict_markov_labels
 from .incs import lincs_from_gdf
@@ -369,7 +369,7 @@ def plot_silhouette(self, metric="euclidean", title="Silhouette Score"):
         elif self.pooling == "pooled":
             # if pooled, scale the whole series at once
             df.loc[:, self.columns] = self.scaler.fit_transform(df.values)

-        fig = skplt.metrics.plot_silhouette(
+        fig = _plot_silhouette(
             df[self.columns].values, self.labels, metric=metric, title=title
         )
diff --git a/geosnap/analyze/_region_wrappers.py b/geosnap/analyze/_region_wrappers.py
index de7ce41b..e6ab00eb 100644
--- a/geosnap/analyze/_region_wrappers.py
+++ b/geosnap/analyze/_region_wrappers.py
@@ -71,7 +71,9 @@ def kmeans_spatial(data, columns, w, n_clusters=5, **kwargs):
     return model


-def spenc(data, w, columns, n_clusters=5, gamma=500, random_state=None, **kwargs):
+def spenc(
+    data, w, columns, n_clusters=5, gamma=1, random_state=None, n_jobs=-1, **kwargs
+):
     """Spatially encouraged spectral clustering.

     :cite:`wolf2018`
@@ -101,6 +103,7 @@
         attrs_name=columns,
         gamma=gamma,
         random_state=random_state,
+        n_jobs=n_jobs,
     )
     model.solve()
@@ -115,7 +118,7 @@ def skater(
     data, w, columns, n_clusters=5,
     floor=-np.inf,
     islands="increase",
     cluster_args=None,
-    **kwargs
+    **kwargs,
 ):
     """SKATER spatial clustering algorithm.
@@ -193,7 +196,7 @@ def max_p(
     threshold=10,
     max_iterations_construction=99,
     top_n=2,
-    **kwargs
+    **kwargs,
 ):
     """Max-p clustering algorithm :cite:`Duque2012`.
diff --git a/geosnap/harmonize/harmonize.py b/geosnap/harmonize/harmonize.py
index 17a2f55a..16d72214 100644
--- a/geosnap/harmonize/harmonize.py
+++ b/geosnap/harmonize/harmonize.py
@@ -142,10 +142,7 @@ def harmonize(
     times.remove(target_year)
     target_df = dfs[dfs[temporal_index] == target_year]

-    if target_df.index.name:
-        unit_index = target_df.index.name
-    else:
-        unit_index = "id"
+    unit_index = target_df.index.name if target_df.index.name else "id"
     target_df[unit_index] = target_df.index.values

     geom_name = target_df.geometry.name
@@ -209,10 +206,11 @@ def harmonize(
                     pixel_values=pixel_values,
                     raster=raster,
                 )
-            except IOError:
-                raise IOError(
-                    "Unable to locate raster. If using the `dasymetric` or model-based methods. You"
-                    "must provide a raster file and indicate which pixel values contain developed land"
-                )
+            except OSError as e:
+                raise OSError(
+                    "Unable to locate raster. If using the `dasymetric` or model-based "
+                    "methods, you must provide a raster file and indicate which pixel "
+                    "values contain developed land"
+                ) from e
         else:
             raise ValueError('weights_method must of one of ["area", "dasymetric"]')
diff --git a/geosnap/visualize/skplt.py b/geosnap/visualize/skplt.py
new file mode 100644
index 00000000..67cfd2c5
--- /dev/null
+++ b/geosnap/visualize/skplt.py
@@ -0,0 +1,147 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.metrics import (
+    silhouette_samples,
+    silhouette_score,
+)
+from sklearn.preprocessing import LabelEncoder
+
+
+def plot_silhouette(
+    X,
+    cluster_labels,
+    title="Silhouette Analysis",
+    metric="euclidean",
+    ax=None,
+    figsize=None,
+    cmap="nipy_spectral",
+    title_fontsize="large",
+    text_fontsize="medium",
+):
+    """Plots silhouette analysis of the clusters provided.
+
+    NOTE: this function is vendored from scikit-plot, which is no longer maintained.
+
+    Args:
+        X (array-like, shape (n_samples, n_features)):
+            Data to cluster, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        cluster_labels (array-like, shape (n_samples,)):
+            Cluster label for each sample.
+
+        title (string, optional): Title of the generated plot. Defaults to
+            "Silhouette Analysis".
+
+        metric (string or callable, optional): The metric to use when
+            calculating distance between instances in a feature array.
+            If metric is a string, it must be one of the options allowed by
+            sklearn.metrics.pairwise.pairwise_distances. If X is
+            the distance array itself, use "precomputed" as the metric.
+
+        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
+            plot the curve. If None, the plot is drawn on a new set of axes.
+
+        figsize (2-tuple, optional): Tuple denoting figure size of the plot
+            e.g. (6, 6). Defaults to ``None``.
+
+        cmap (string or :class:`matplotlib.colors.Colormap` instance, optional):
+            Colormap used for plotting the projection. View Matplotlib Colormap
+            documentation for available options.
+            https://matplotlib.org/users/colormaps.html
+
+        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "large".
+
+        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "medium".
+
+    Returns:
+        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
+            drawn.
+
+    Example:
+        >>> from geosnap.visualize.skplt import plot_silhouette
+        >>> kmeans = KMeans(n_clusters=4, random_state=1)
+        >>> cluster_labels = kmeans.fit_predict(X)
+        >>> plot_silhouette(X, cluster_labels)
+        >>> plt.show()
+    """
+    cluster_labels = np.asarray(cluster_labels)
+
+    le = LabelEncoder()
+    cluster_labels_encoded = le.fit_transform(cluster_labels)
+
+    n_clusters = len(np.unique(cluster_labels))
+
+    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
+
+    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric=metric)
+
+    if ax is None:
+        fig, ax = plt.subplots(1, 1, figsize=figsize)
+
+    ax.set_title(title, fontsize=title_fontsize)
+    ax.set_xlim([-0.1, 1])
+
+    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])
+
+    ax.set_xlabel("Silhouette coefficient values", fontsize=text_fontsize)
+    ax.set_ylabel("Cluster label", fontsize=text_fontsize)
+
+    y_lower = 10
+
+    # draw one horizontal band of sorted silhouette values per cluster
+    for i in range(n_clusters):
+        ith_cluster_silhouette_values = sample_silhouette_values[
+            cluster_labels_encoded == i
+        ]
+
+        ith_cluster_silhouette_values.sort()
+
+        size_cluster_i = ith_cluster_silhouette_values.shape[0]
+        y_upper = y_lower + size_cluster_i
+
+        color = plt.cm.get_cmap(cmap)(float(i) / n_clusters)
+
+        ax.fill_betweenx(
+            np.arange(y_lower, y_upper),
+            0,
+            ith_cluster_silhouette_values,
+            facecolor=color,
+            edgecolor=color,
+            alpha=0.7,
+        )
+
+        ax.text(
+            -0.05,
+            y_lower + 0.5 * size_cluster_i,
+            str(le.classes_[i]),
+            fontsize=text_fontsize,
+        )
+
+        y_lower = y_upper + 10
+
+    # mark the average silhouette score across all samples
+    ax.axvline(
+        x=silhouette_avg,
+        color="red",
+        linestyle="--",
+        label="Silhouette score: {0:0.3f}".format(silhouette_avg),
+    )
+
+    ax.set_yticks([])  # Clear the y-axis labels / ticks
+    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))
+
+    ax.tick_params(labelsize=text_fontsize)
+    ax.legend(loc="best", fontsize=text_fontsize)
+
+    return ax
diff --git a/pyproject.toml b/pyproject.toml
index 18060660..aaa126f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,6 @@ dependencies = [
     "quilt3>=3.6",
    "pyarrow>=0.14.1",
    "contextily",
-    "scikit-plot",
    "tobler>=0.8.2",
    "spopt>=0.3.0",
    "fsspec",
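
Usage sketch (illustrative only, not part of the diff): the vendored plot_silhouette above can be called directly in place of scikitplot.metrics.plot_silhouette. The make_blobs/KMeans toy data below is an assumption standing in for real, scaled geosnap model inputs.

    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    from geosnap.visualize.skplt import plot_silhouette

    # toy feature matrix standing in for scaled neighborhood attributes (assumption)
    X, _ = make_blobs(n_samples=300, centers=4, random_state=1)
    labels = KMeans(n_clusters=4, random_state=1, n_init=10).fit_predict(X)

    # drop-in replacement for the removed scikit-plot call
    ax = plot_silhouette(X, labels, metric="euclidean", title="Silhouette Score")
    plt.show()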