From 40abacc83ac359765810f664b994b73cedc87644 Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Fri, 15 Apr 2022 14:22:22 +0200
Subject: [PATCH] Functionalities for computing concave hull around cluster
---
orangecontrib/text/concave_hull.py | 261 ++++++++++++++++++
orangecontrib/text/tests/test_concave_hull.py | 55 ++++
requirements.txt | 1 +
3 files changed, 317 insertions(+)
create mode 100644 orangecontrib/text/concave_hull.py
create mode 100644 orangecontrib/text/tests/test_concave_hull.py
diff --git a/orangecontrib/text/concave_hull.py b/orangecontrib/text/concave_hull.py
new file mode 100644
index 000000000..1b2265480
--- /dev/null
+++ b/orangecontrib/text/concave_hull.py
@@ -0,0 +1,261 @@
+from bisect import bisect_left, bisect_right
+from math import sqrt
+from typing import Optional, Dict, Iterable, Tuple, List
+
+import numpy as np
+import pyclipper
+from Orange.data import ContinuousVariable, Domain, Table
+from scipy.spatial import Delaunay
+
+
+def _angle(
+ v1: Tuple[np.ndarray, np.ndarray], v2: Tuple[np.ndarray, np.ndarray]
+) -> float:
+ """
+ Compute clockwise angles between v1 and v2. Both vectors are
+ given as a tuple with two points ([x1, y1], [x2, y2]) such that
+ [x2, y2] of v1 == [x1, y1] of v2.
+ The angle is given in degrees between 0 and 2 * pi
+ """
+ v1, v2 = np.array(v1), np.array(v2)
+ x1, y1 = v1[0] - v1[1]
+ x2, y2 = v2[1] - v2[0]
+ dot = np.sum(x1 * x2 + y1 * y2) # dot product
+ det = x1 * y2 - y1 * x1 # determinant
+ angle = np.arctan2(det, dot) # atan2(y, x) or atan2(sin, cos)
+ return -angle if angle <= 0 else np.pi + np.pi - angle
+
+
+def _find_hull(
+ edges_list: List, points_list: np.ndarray, starting_edge: int
+) -> Tuple[np.ndarray, List]:
+ """
+ This function return a single hull which starts and ends in starting_edge.
+
+ Parameters
+ ----------
+ edges_list
+ List of edges. Each edge is presented as a tuple of two indices which
+ tell the starting and ending note. The index correspond to location
+ of point in points_list. This list must be sorted in the ascending order.
+ points_list
+ Points location. Each point has x and y location.
+ starting_edge
+ The index of the list where hull starts.
+
+ Returns
+ -------
+ polygon
+ The array with the hull/polygon points
+ used_points
+ List of booleans that indicates whether each point was used in polygon
+ """
+ firsts = [x[0] for x in edges_list] # first elements for bisection
+ used_edges = [False] * len(edges_list)
+ used_edges[starting_edge] = True
+
+ # remember start and next point
+ start, next_point = edges_list[starting_edge]
+
+ # remember current polygon around points
+ poly = [points_list[start], points_list[next_point]]
+
+ # we count number of steps to stop iteration in case it is locked
+ # in some dead cycle. It can be a result of some unexpected cases.
+ count = 0
+ while start != next_point and count < len(edges_list):
+ # find the index where the first value equal to next_point appear
+ ind_left = bisect_left(firsts, next_point)
+ # find the index next to the last value
+ ind_right = bisect_right(firsts, next_point)
+
+ # check if there are more edges available from the same point
+ if ind_right - ind_left > 1:
+ # select the most distant one in clockwise direction. It is probably
+ # the point on the outer hull - we prevent a hull to discover cycles
+ # inside a polygon
+ ang = -1
+ for i in range(ind_left, ind_right):
+ cur_ang = _angle(
+ (poly[-2], poly[-1]), (poly[-1], points_list[edges_list[i][1]])
+ )
+ if cur_ang > ang:
+ ang = cur_ang
+ ind_left = i
+ # save a next point of the polygon
+ used_edges[ind_left] = True
+ next_point = edges_list[ind_left][1]
+ poly.append(points_list[next_point])
+ count += 1
+ return np.array(poly), used_edges
+
+
+def _edges_to_polygon(edges: Iterable, points: np.ndarray) -> np.ndarray:
+ """
+ This function connects edges in polygons. It computes all possible hulls -
+ some clusters have more of them when they have a hole in the middle, and
+ selects outer hull.
+
+ Parameters
+ ----------
+ edges
+ Iterable of edges. Each edge is presented as a tuple of two indices which
+ tell the starting and ending note. The index correspond to location
+ of point in points_list
+ points
+ Points location. Each point has x and y location.
+
+ Returns
+ -------
+ The array with the hull/polygon points.
+ """
+ # sort based on first element of tuple to enable bisection search
+ edges = sorted(edges, key=lambda x: x[0])
+ # need to use all edges
+ used = [False] * len(edges)
+
+ # it is possible that we will find more separate hulls -
+ # it happens in cases when a polygon has inner cycles
+ polygons = []
+ while not all(used):
+ i = used.index(False)
+ poly, new_used = _find_hull(edges, points, i)
+ polygons.append(poly)
+ used = [u1 or u2 for u1, u2 in zip(used, new_used)]
+
+ # select polygon that is outside - the widest and the highest
+ height_width = [np.sum(p.max(axis=0) - p.min(axis=0)) for p in polygons]
+ i = height_width.index(max(height_width))
+ return polygons[i]
+
+
+def _get_shape_around_points(pts: np.ndarray, eps: Optional[float]) -> np.ndarray:
+ """
+ Compute the shape (concave hull) of a set of a cluster.
+ """
+
+ def add_edge(edges_list, i, j):
+ """
+ Add a line between the i-th and j-th points,
+ if not in the list already. Remove the lines that are not outer
+ edges - when (j, i) already in list means that two triangles
+ has same edge what means it is not an outer edge
+ """
+ if (j, i) in edges_list:
+ # both neighboring triangles are in shape - it's not a boundary edge
+ edges_list.remove((j, i))
+ else:
+ edges_list.add((i, j))
+
+ if len(pts) < 4:
+ rng = list(range(3))
+ return _edges_to_polygon(zip(rng, rng[1:] + rng[:1]), pts)
+
+ eps = eps * 2 if eps else float("inf")
+ tri = Delaunay(pts)
+ edges = set()
+ # loop over triangles: ia, ib, ic = indices of corner points of the triangle
+ for ia, ib, ic in tri.vertices:
+ pa = pts[ia]
+ pb = pts[ib]
+ pc = pts[ic]
+
+ # Lengths of sides of triangle
+ a = sqrt((pa[0] - pb[0]) ** 2 + (pa[1] - pb[1]) ** 2)
+ b = sqrt((pb[0] - pc[0]) ** 2 + (pb[1] - pc[1]) ** 2)
+ c = sqrt((pc[0] - pa[0]) ** 2 + (pc[1] - pa[1]) ** 2)
+
+ # filter - the longest edge of triangle must be smaller than epsilon
+ if max(a, b, c) <= eps:
+ add_edge(edges, ia, ib)
+ add_edge(edges, ib, ic)
+ add_edge(edges, ic, ia)
+
+ return _edges_to_polygon(edges, pts)
+
+
+def _smoothen_hull(hull: np.ndarray) -> np.ndarray:
+ """
+ Expand hull a bit away from the cluster and smoothen it
+ """
+ # scale hull between 0 and 1 in both directions - help to produce the hull
+ # with equal distance from the cluster when axes have different scaling
+ x, y = hull[:, 0:1], hull[:, 1:2]
+ x_min, x_max = np.min(x), np.max(x)
+ y_min, y_max = np.min(y), np.max(y)
+ x_diff, y_diff = (x_max - x_min), (y_max - y_min)
+ hull = np.hstack(((x - x_min) / x_diff, (y - y_min) / y_diff))
+
+ # buffer the line for 3 * dist_from_cluster and move it back for
+ # 2 * dist_from_cluster. It will make a hull smother.
+
+ # pyclipper work with integer so points need to be scaled first
+ scaling_factor = 1000
+ dist_from_cluster = 0.06
+ scaled_hull = pyclipper.scale_to_clipper(hull, scaling_factor)
+
+ # buffer the hull for dist_from_cluster * 3
+ pco1 = pyclipper.PyclipperOffset()
+ pco1.AddPath(scaled_hull, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+ im_solution = pco1.Execute(dist_from_cluster * scaling_factor * 3)
+ # buffer the hull for dist_from_cluster * -2
+ pco2 = pyclipper.PyclipperOffset()
+ pco2.AddPath(im_solution[0], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+ solution = pyclipper.scale_from_clipper(
+ pco2.Execute(dist_from_cluster * scaling_factor * (-2)), scaling_factor
+ )
+ solution = np.array(solution).reshape(-1, 2)
+
+ # scale the hue back for its original space
+ return np.hstack(
+ (solution[:, 0:1] * x_diff + x_min, solution[:, 1:2] * y_diff + y_min)
+ )
+
+
+def compute_concave_hulls(
+ embedding: np.ndarray, clusters: np.ndarray, epsilon: Optional[float] = None
+) -> Dict[int, np.ndarray]:
+ """
+ Function computes the points of the concave hull around points.
+
+ Parameters
+ ----------
+ embedding
+ Visualisation coordinates - embeddings
+ clusters
+ Cluster indices for each item.
+ epsilon
+ Points with distance > 2*epsilon are not connected resulting in concave
+ hull. The parameter is optional, setting it to None result in convex hull.
+
+ Returns
+ -------
+ The points of the concave hull. Dictionary with cluster index
+ as a key and array of points as a value - [[x1, y1], [x2, y2], [x3, y3], ...]
+ """
+ hulls = {}
+ for cl in set(clusters) - {-1}:
+ points = embedding[clusters == cl]
+
+ # subsample when more than 1000 points - to speed up the algorithm
+ if points.shape[0] > 1000:
+ points = points[np.random.randint(points.shape[0], size=1000), :]
+
+ # compute hull around the cluster
+ hull = _get_shape_around_points(points, epsilon)
+ # smoothen and extend the hull
+ hulls[cl] = _smoothen_hull(hull)
+ return hulls
+
+
+if __name__ == "__main__":
+ from matplotlib import pyplot as plt
+
+ table = Table("iris")
+ clusters = np.array([1] * 50 + [2] * 50 + [3] * 50)
+
+ hulls = compute_concave_hulls(table.X[:, :2], clusters)
+ plt.plot(table.X[:, 0], table.X[:, 1], "o")
+ for k, h in hulls.items():
+ plt.plot(h[:, 0], h[:, 1])
+ plt.show()
diff --git a/orangecontrib/text/tests/test_concave_hull.py b/orangecontrib/text/tests/test_concave_hull.py
new file mode 100644
index 000000000..4bdf6222b
--- /dev/null
+++ b/orangecontrib/text/tests/test_concave_hull.py
@@ -0,0 +1,55 @@
+import unittest
+
+import numpy as np
+from Orange.data import Table
+
+from orangecontrib.text.concave_hull import compute_concave_hulls
+
+
+class TestConcaveHull(unittest.TestCase):
+ def test_compute_concave_hulls(self):
+ data = Table.from_file("iris")[:, 2:4]
+ clusters = np.array([0] * 50 + [1] * 50 + [2] * 50)
+
+ hulls = compute_concave_hulls(data.X, clusters, epsilon=0.5)
+ self.assertEqual(3, len(hulls))
+ self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
+ self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
+ self.assertEqual(2, hulls[2].shape[1]) # hull have x and y
+
+ hulls = compute_concave_hulls(data.X, clusters)
+ self.assertEqual(3, len(hulls))
+ self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
+ self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
+ self.assertEqual(2, hulls[2].shape[1]) # hull have x and y
+
+ def test_compute_concave_hulls_subsampling(self):
+ """
+ When more than 1000 points passed they are sub-sampled in order to
+ compute a concave hull
+ """
+ iris = Table.from_file("iris")
+ data = np.repeat(iris.X[:, 2:4], 10, axis=0) # more than 1000 points
+ clusters = np.array([0] * 50 * 10 + [1] * 50 * 10 + [2] * 50 * 10)
+
+ hulls = compute_concave_hulls(data, clusters, epsilon=0.5)
+
+ self.assertEqual(3, len(hulls))
+ self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
+ self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
+ self.assertEqual(2, hulls[2].shape[1]) # hull have x and y
+
+ def test_compute_concave_hulls_triangle(self):
+ """
+ Concave hull must also work for tree points - it is a special case
+ """
+ data = np.array([[1, 1], [1, 2], [2, 1]])
+ clusters = np.array([0] * 3)
+ hulls = compute_concave_hulls(data, clusters, epsilon=0.5)
+
+ self.assertEqual(1, len(hulls))
+ self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/requirements.txt b/requirements.txt
index c1110117e..2d7517295 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ Orange3 >=3.31.1
orange-widget-base >=4.14.0
pandas
pdfminer3k>=1.3.1
+pyclipper>=1.2.0
pyqtgraph
pyyaml
requests