Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Functionalities for computing concave hull around clusters of points #816

Merged
merged 1 commit into from
Apr 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 261 additions & 0 deletions orangecontrib/text/concave_hull.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
from bisect import bisect_left, bisect_right
from math import sqrt
from typing import Optional, Dict, Iterable, Tuple, List

import numpy as np
import pyclipper
from Orange.data import ContinuousVariable, Domain, Table
from scipy.spatial import Delaunay


def _angle(
v1: Tuple[np.ndarray, np.ndarray], v2: Tuple[np.ndarray, np.ndarray]
) -> float:
"""
Compute clockwise angles between v1 and v2. Both vectors are
given as a tuple with two points ([x1, y1], [x2, y2]) such that
[x2, y2] of v1 == [x1, y1] of v2.
The angle is given in degrees between 0 and 2 * pi
"""
v1, v2 = np.array(v1), np.array(v2)
x1, y1 = v1[0] - v1[1]
x2, y2 = v2[1] - v2[0]
dot = np.sum(x1 * x2 + y1 * y2) # dot product
det = x1 * y2 - y1 * x1 # determinant
angle = np.arctan2(det, dot) # atan2(y, x) or atan2(sin, cos)
return -angle if angle <= 0 else np.pi + np.pi - angle


def _find_hull(
edges_list: List, points_list: np.ndarray, starting_edge: int
) -> Tuple[np.ndarray, List]:
"""
This function return a single hull which starts and ends in starting_edge.
Parameters
----------
edges_list
List of edges. Each edge is presented as a tuple of two indices which
tell the starting and ending note. The index correspond to location
of point in points_list. This list must be sorted in the ascending order.
points_list
Points location. Each point has x and y location.
starting_edge
The index of the list where hull starts.
Returns
-------
polygon
The array with the hull/polygon points
used_points
List of booleans that indicates whether each point was used in polygon
"""
firsts = [x[0] for x in edges_list] # first elements for bisection
used_edges = [False] * len(edges_list)
used_edges[starting_edge] = True

# remember start and next point
start, next_point = edges_list[starting_edge]

# remember current polygon around points
poly = [points_list[start], points_list[next_point]]

# we count number of steps to stop iteration in case it is locked
# in some dead cycle. It can be a result of some unexpected cases.
count = 0
while start != next_point and count < len(edges_list):
# find the index where the first value equal to next_point appear
ind_left = bisect_left(firsts, next_point)
# find the index next to the last value
ind_right = bisect_right(firsts, next_point)

# check if there are more edges available from the same point
if ind_right - ind_left > 1:
# select the most distant one in clockwise direction. It is probably
# the point on the outer hull - we prevent a hull to discover cycles
# inside a polygon
ang = -1
for i in range(ind_left, ind_right):
cur_ang = _angle(
(poly[-2], poly[-1]), (poly[-1], points_list[edges_list[i][1]])
)
if cur_ang > ang:
ang = cur_ang
ind_left = i
# save a next point of the polygon
used_edges[ind_left] = True
next_point = edges_list[ind_left][1]
poly.append(points_list[next_point])
count += 1
return np.array(poly), used_edges


def _edges_to_polygon(edges: Iterable, points: np.ndarray) -> np.ndarray:
"""
This function connects edges in polygons. It computes all possible hulls -
some clusters have more of them when they have a hole in the middle, and
selects outer hull.
Parameters
----------
edges
Iterable of edges. Each edge is presented as a tuple of two indices which
tell the starting and ending note. The index correspond to location
of point in points_list
points
Points location. Each point has x and y location.
Returns
-------
The array with the hull/polygon points.
"""
# sort based on first element of tuple to enable bisection search
edges = sorted(edges, key=lambda x: x[0])
# need to use all edges
used = [False] * len(edges)

# it is possible that we will find more separate hulls -
# it happens in cases when a polygon has inner cycles
polygons = []
while not all(used):
i = used.index(False)
poly, new_used = _find_hull(edges, points, i)
polygons.append(poly)
used = [u1 or u2 for u1, u2 in zip(used, new_used)]

# select polygon that is outside - the widest and the highest
height_width = [np.sum(p.max(axis=0) - p.min(axis=0)) for p in polygons]
i = height_width.index(max(height_width))
return polygons[i]


def _get_shape_around_points(pts: np.ndarray, eps: Optional[float]) -> np.ndarray:
"""
Compute the shape (concave hull) of a set of a cluster.
"""

def add_edge(edges_list, i, j):
"""
Add a line between the i-th and j-th points,
if not in the list already. Remove the lines that are not outer
edges - when (j, i) already in list means that two triangles
has same edge what means it is not an outer edge
"""
if (j, i) in edges_list:
# both neighboring triangles are in shape - it's not a boundary edge
edges_list.remove((j, i))
else:
edges_list.add((i, j))

if len(pts) < 4:
rng = list(range(3))
return _edges_to_polygon(zip(rng, rng[1:] + rng[:1]), pts)

eps = eps * 2 if eps else float("inf")
tri = Delaunay(pts)
edges = set()
# loop over triangles: ia, ib, ic = indices of corner points of the triangle
for ia, ib, ic in tri.vertices:
pa = pts[ia]
pb = pts[ib]
pc = pts[ic]

# Lengths of sides of triangle
a = sqrt((pa[0] - pb[0]) ** 2 + (pa[1] - pb[1]) ** 2)
b = sqrt((pb[0] - pc[0]) ** 2 + (pb[1] - pc[1]) ** 2)
c = sqrt((pc[0] - pa[0]) ** 2 + (pc[1] - pa[1]) ** 2)

# filter - the longest edge of triangle must be smaller than epsilon
if max(a, b, c) <= eps:
add_edge(edges, ia, ib)
add_edge(edges, ib, ic)
add_edge(edges, ic, ia)

return _edges_to_polygon(edges, pts)


def _smoothen_hull(hull: np.ndarray) -> np.ndarray:
"""
Expand hull a bit away from the cluster and smoothen it
"""
# scale hull between 0 and 1 in both directions - help to produce the hull
# with equal distance from the cluster when axes have different scaling
x, y = hull[:, 0:1], hull[:, 1:2]
x_min, x_max = np.min(x), np.max(x)
y_min, y_max = np.min(y), np.max(y)
x_diff, y_diff = (x_max - x_min), (y_max - y_min)
hull = np.hstack(((x - x_min) / x_diff, (y - y_min) / y_diff))

# buffer the line for 3 * dist_from_cluster and move it back for
# 2 * dist_from_cluster. It will make a hull smother.

# pyclipper work with integer so points need to be scaled first
scaling_factor = 1000
dist_from_cluster = 0.06
scaled_hull = pyclipper.scale_to_clipper(hull, scaling_factor)

# buffer the hull for dist_from_cluster * 3
pco1 = pyclipper.PyclipperOffset()
pco1.AddPath(scaled_hull, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
im_solution = pco1.Execute(dist_from_cluster * scaling_factor * 3)
# buffer the hull for dist_from_cluster * -2
pco2 = pyclipper.PyclipperOffset()
pco2.AddPath(im_solution[0], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
solution = pyclipper.scale_from_clipper(
pco2.Execute(dist_from_cluster * scaling_factor * (-2)), scaling_factor
)
solution = np.array(solution).reshape(-1, 2)

# scale the hue back for its original space
return np.hstack(
(solution[:, 0:1] * x_diff + x_min, solution[:, 1:2] * y_diff + y_min)
)


def compute_concave_hulls(
embedding: np.ndarray, clusters: np.ndarray, epsilon: Optional[float] = None
) -> Dict[int, np.ndarray]:
"""
Function computes the points of the concave hull around points.
Parameters
----------
embedding
Visualisation coordinates - embeddings
clusters
Cluster indices for each item.
epsilon
Points with distance > 2*epsilon are not connected resulting in concave
hull. The parameter is optional, setting it to None result in convex hull.
Returns
-------
The points of the concave hull. Dictionary with cluster index
as a key and array of points as a value - [[x1, y1], [x2, y2], [x3, y3], ...]
"""
hulls = {}
for cl in set(clusters) - {-1}:
points = embedding[clusters == cl]

# subsample when more than 1000 points - to speed up the algorithm
if points.shape[0] > 1000:
points = points[np.random.randint(points.shape[0], size=1000), :]

# compute hull around the cluster
hull = _get_shape_around_points(points, epsilon)
# smoothen and extend the hull
hulls[cl] = _smoothen_hull(hull)
return hulls


if __name__ == "__main__":
from matplotlib import pyplot as plt

table = Table("iris")
clusters = np.array([1] * 50 + [2] * 50 + [3] * 50)

hulls = compute_concave_hulls(table.X[:, :2], clusters)
plt.plot(table.X[:, 0], table.X[:, 1], "o")
for k, h in hulls.items():
plt.plot(h[:, 0], h[:, 1])
plt.show()
55 changes: 55 additions & 0 deletions orangecontrib/text/tests/test_concave_hull.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest

import numpy as np
from Orange.data import Table

from orangecontrib.text.concave_hull import compute_concave_hulls


class TestConcaveHull(unittest.TestCase):
def test_compute_concave_hulls(self):
data = Table.from_file("iris")[:, 2:4]
clusters = np.array([0] * 50 + [1] * 50 + [2] * 50)

hulls = compute_concave_hulls(data.X, clusters, epsilon=0.5)
self.assertEqual(3, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
self.assertEqual(2, hulls[2].shape[1]) # hull have x and y

hulls = compute_concave_hulls(data.X, clusters)
self.assertEqual(3, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
self.assertEqual(2, hulls[2].shape[1]) # hull have x and y

def test_compute_concave_hulls_subsampling(self):
"""
When more than 1000 points passed they are sub-sampled in order to
compute a concave hull
"""
iris = Table.from_file("iris")
data = np.repeat(iris.X[:, 2:4], 10, axis=0) # more than 1000 points
clusters = np.array([0] * 50 * 10 + [1] * 50 * 10 + [2] * 50 * 10)

hulls = compute_concave_hulls(data, clusters, epsilon=0.5)

self.assertEqual(3, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
self.assertEqual(2, hulls[2].shape[1]) # hull have x and y

def test_compute_concave_hulls_triangle(self):
"""
Concave hull must also work for tree points - it is a special case
"""
data = np.array([[1, 1], [1, 2], [2, 1]])
clusters = np.array([0] * 3)
hulls = compute_concave_hulls(data, clusters, epsilon=0.5)

self.assertEqual(1, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y


if __name__ == "__main__":
unittest.main()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Orange3 >=3.31.1
orange-widget-base >=4.14.0
pandas
pdfminer3k>=1.3.1
pyclipper>=1.2.0
pyqtgraph
pyyaml
requests
Expand Down