From edee827e6954970b0999fee5d6ea461c9053459a Mon Sep 17 00:00:00 2001 From: Jona Date: Tue, 22 Oct 2024 00:45:50 -0400 Subject: [PATCH 1/3] half labeling --- cerulean_cloud/models.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/cerulean_cloud/models.py b/cerulean_cloud/models.py index a758ab81..60351e2b 100644 --- a/cerulean_cloud/models.py +++ b/cerulean_cloud/models.py @@ -966,23 +966,12 @@ def overlap_percent(a, b): raster = raster.float().detach().numpy() - # Label components based on p3 to find peaks + reduced_labels = set() # Initialize an empty set for unique p1 labels p1_islands, p1_island_count = label(raster >= p1) - # logging.info(f"p1_island_count: {p1_island_count}") - p3_islands, p3_island_count = label(raster >= p3) - # logging.info(f"p3_island_count: {p3_island_count}") - - # Initialize an empty set for unique p1 labels corresponding to p3 components - reduced_labels = set() - - # Iterate over each p3 component - for i in range(1, p3_island_count + 1): - p3_island_mask = p3_islands == i - p1_label_at_p3 = p1_islands[p3_island_mask].flat[ - 0 - ] # Take the first pixel's p1 label - reduced_labels.add(p1_label_at_p3) - # logging.info(f"reduced_labels: {len(reduced_labels)}") + for label_num in range(1, p1_island_count + 1): + p1_label_mask = p1_islands == label_num + if np.any(raster[p1_label_mask] >= p3): + reduced_labels.add(label_num) zero_mask = raster == 0 # Find all pixels that are zero (i.e. 
nodata value) shapes = rasterio.features.shapes( From 78589800a947cf0817ca83f610e8077fc825ec7d Mon Sep 17 00:00:00 2001 From: Jona Date: Tue, 22 Oct 2024 00:57:39 -0400 Subject: [PATCH 2/3] Addl optimizations --- cerulean_cloud/models.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/cerulean_cloud/models.py b/cerulean_cloud/models.py index 60351e2b..cae1433b 100644 --- a/cerulean_cloud/models.py +++ b/cerulean_cloud/models.py @@ -894,9 +894,9 @@ def instantiate(self, scene_array_probs, transform): Returns: - geojson.FeatureCollection: A collection of geojson features representing detected instances. """ - # Convert numpy.ndarray to PyTorch tensor if necessary - if isinstance(scene_array_probs, np.ndarray): - scene_array_probs = torch.tensor(scene_array_probs) + # Ensure scene_array_probs is a NumPy array + if isinstance(scene_array_probs, torch.Tensor): + scene_array_probs = scene_array_probs.detach().cpu().numpy() classes_to_consider = [1, 2, 3] scene_oil_probs = scene_array_probs[classes_to_consider].sum(0) @@ -909,18 +909,16 @@ def instantiate(self, scene_array_probs, transform): ) for feat in features: mask = geometry_mask( - [shape(feat["geometry"])], + [feat["geometry"]], out_shape=scene_oil_probs.shape, transform=transform, invert=True, ) cls_sums = [ - cls_probs[mask].detach().sum() - for cls_probs in scene_array_probs[classes_to_consider] - ] - feat["properties"]["inf_idx"] = classes_to_consider[ - cls_sums.index(max(cls_sums)) + scene_array_probs[cls_idx][mask].sum() + for cls_idx in classes_to_consider ] + feat["properties"]["inf_idx"] = classes_to_consider[np.argmax(cls_sums)] return geojson.FeatureCollection(features=features) @@ -949,7 +947,7 @@ def instances_from_probs( Sample values: p1, p2, p3 = 0.1, 0.5, 0.95 Analogous to bbox_score_thresh, poly_score_thresh, pixel_score_thresh Returns: - GeoJSON: A GeoJSON feature collection of the processed predictions. 
+ List[geojson.Feature]: A list of GeoJSON features of the processed predictions. """ def overlap_percent(a, b): @@ -964,13 +962,20 @@ def overlap_percent(a, b): else: return a.intersection(b).area / a.area - raster = raster.float().detach().numpy() + # Ensure raster is a NumPy array + if isinstance(raster, torch.Tensor): + raster = raster.detach().cpu().numpy() - reduced_labels = set() # Initialize an empty set for unique p1 labels + # Label components based on p1 p1_islands, p1_island_count = label(raster >= p1) + + # Initialize an empty set for unique p1 labels containing pixels >= p3 + reduced_labels = set() + + # For each p1_island label, check if it contains any pixels >= p3 for label_num in range(1, p1_island_count + 1): - p1_label_mask = p1_islands == label_num - if np.any(raster[p1_label_mask] >= p3): + island_mask = p1_islands == label_num + if np.any(raster[island_mask] >= p3): reduced_labels.add(label_num) zero_mask = raster == 0 # Find all pixels that are zero (i.e. nodata value) @@ -980,16 +985,21 @@ def overlap_percent(a, b): polygons = [shape(geom) for geom, value in shapes if value == 1] scene_edge = MultiPolygon(polygons).buffer(discard_edge_polygons_buffer) - # Process into feature collections based on unique p1 labels features = [] + # Process into feature collections based on unique p1 labels for p1_label in reduced_labels: mask = (p1_islands == p1_label) & (raster >= p2) # Apply p2 trimming masked_raster = raster[mask] - shapes = rasterio.features.shapes( + + if not np.any(mask): + continue # Skip if mask is empty after trimming + + shapes_generator = rasterio.features.shapes( mask.astype(np.uint8), mask=mask, transform=transform ) - polygons = [shape(geom) for geom, value in shapes if value == 1] + polygons = [shape(geom) for geom, value in shapes_generator if value == 1] polygons = [p for p in polygons if overlap_percent(p, scene_edge) <= 0.5] + # Ensure there are polygons left after trimming to process into a MultiPolygon if polygons: 
multipolygon = MultiPolygon(polygons) From 9bc80c6c54f039f162021bfa22f9398fd338db7e Mon Sep 17 00:00:00 2001 From: Jona Date: Tue, 22 Oct 2024 10:36:35 -0400 Subject: [PATCH 3/3] Overhaul for memory and speed --- cerulean_cloud/models.py | 107 ++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 42 deletions(-) diff --git a/cerulean_cloud/models.py b/cerulean_cloud/models.py index cae1433b..12dd66aa 100644 --- a/cerulean_cloud/models.py +++ b/cerulean_cloud/models.py @@ -938,9 +938,9 @@ def instances_from_probs( Args: raster (np.array): The input raster array to be processed. - p1 (float): The lowest probability, used to group adjacent polygons into multipolygons. lower value = fewer groups - p2 (float): The middle probability, used to trim the final polygons size. lower value = coarser polygons - p3 (float): The highest probability, used to discard polygons that don't reach sufficient confidence. higher value = more restrictive + p1 (float): The lowest probability, used to group adjacent polygons into multipolygons. Lower value = fewer groups + p2 (float): The middle probability, used to trim the final polygons size. Lower value = coarser polygons + p3 (float): The highest probability, used to discard polygons that don't reach sufficient confidence. Higher value = more restrictive transform (Affine): The transform to apply to the raster. addl_props (dict): Additional properties to add to the features. discard_edge_polygons_buffer (float): The buffer distance to discard polygons that are too close to the edge of the scene. 
@@ -966,55 +966,78 @@ def overlap_percent(a, b):
         if isinstance(raster, torch.Tensor):
             raster = raster.detach().cpu().numpy()
 
-        # Label components based on p1
+        raster = raster.astype(np.float32)
+
+        # Generate masks for each threshold
+        nodata_mask = raster == 0
         p1_islands, p1_island_count = label(raster >= p1)
+        p2_mask = raster >= p2
+        p3_mask = raster >= p3
+
+        # Extract p1 labels where p3_mask is True
+        p1_p3_labels = np.unique(p1_islands[p3_mask])
 
-        # Initialize an empty set for unique p1 labels containing pixels >= p3
-        reduced_labels = set()
+        # Create mask of all p1_islands that pass the p3 criterion
+        p1_p3_mask = np.isin(p1_islands, p1_p3_labels)
 
-        # For each p1_island label, check if it contains any pixels >= p3
-        for label_num in range(1, p1_island_count + 1):
-            island_mask = p1_islands == label_num
-            if np.any(raster[island_mask] >= p3):
-                reduced_labels.add(label_num)
+        # Create mask of all pixels that pass the p2 criterion inside p1_islands that pass the p3 criterion
+        p1_p2_p3_mask = p1_p3_mask & p2_mask
 
-        zero_mask = raster == 0  # Find all pixels that are zero (i.e. 
nodata value) - shapes = rasterio.features.shapes( - zero_mask.astype(np.uint8), mask=zero_mask, transform=transform + # Assign distinct labels to the islands that pass p1, p2, and p3 criteria + combined_raster = p1_p2_p3_mask * p1_islands + + # Create a buffer around the edge of the scene to discard polygons that are too close to the edge + scene_edge = MultiPolygon( + shape(geom) + for geom, value in shapes( + nodata_mask.astype(np.uint8), mask=nodata_mask, transform=transform + ) + if value == 1 + ).buffer(discard_edge_polygons_buffer) + + # Now, use rasterio.features.shapes to extract polygons and their labels + shapes_generator = shapes( + combined_raster, + mask=combined_raster > 0, # Only consider pixels with p1_label > 0 + transform=transform, ) - polygons = [shape(geom) for geom, value in shapes if value == 1] - scene_edge = MultiPolygon(polygons).buffer(discard_edge_polygons_buffer) - features = [] - # Process into feature collections based on unique p1 labels - for p1_label in reduced_labels: - mask = (p1_islands == p1_label) & (raster >= p2) # Apply p2 trimming - masked_raster = raster[mask] + # Dictionary to collect geometries per p1_label + label_geometries = {} + for geom, p1_label in shapes_generator: + poly = shape(geom) + if overlap_percent(poly, scene_edge) <= 0.5: + # Only record polygons that are not overlapping the scene edge + if p1_label not in label_geometries: # Initialize if doesn't exist + label_geometries[p1_label] = [] + label_geometries[p1_label].append(poly) # Add polygon to list - if not np.any(mask): - continue # Skip if mask is empty after trimming + features = [] + for p1_label, geometries in label_geometries.items(): + multipolygon = MultiPolygon(geometries) - shapes_generator = rasterio.features.shapes( - mask.astype(np.uint8), mask=mask, transform=transform + geom_mask = geometry_mask( + [multipolygon], + out_shape=raster.shape, + transform=transform, + invert=True, ) - polygons = [shape(geom) for geom, value in 
shapes_generator if value == 1] - polygons = [p for p in polygons if overlap_percent(p, scene_edge) <= 0.5] - - # Ensure there are polygons left after trimming to process into a MultiPolygon - if polygons: - multipolygon = MultiPolygon(polygons) - features.append( - geojson.Feature( - geometry=multipolygon, - properties={ - "mean_conf": float(np.mean(masked_raster)), - "median_conf": float(np.median(masked_raster)), - "max_conf": float(np.max(masked_raster)), - "machine_confidence": float(np.median(masked_raster)), - **addl_props, - }, - ) + + masked_raster = raster[geom_mask] + + features.append( + geojson.Feature( + geometry=multipolygon, + properties={ + "mean_conf": float(np.mean(masked_raster)), + "median_conf": float(np.median(masked_raster)), + "max_conf": float(np.max(masked_raster)), + "machine_confidence": float(np.median(masked_raster)), + "p1_label": int(p1_label), + **addl_props, + }, ) + ) return features