Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly calculate new_high_sigma #714

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

#### Changed

- Reorganise new_high_sigma calculation into dedicated function [#714](https://github.com/askap-vast/vast-pipeline/pull/714)

#### Fixed

- Fix incorrect calculation of the new_high_sigma parameter [#714](https://github.com/askap-vast/vast-pipeline/pull/714)

#### Removed


#### List of PRs

- Correctly calculate new_high_sigma parameter [#714](https://github.com/askap-vast/vast-pipeline/pull/714)

## [1.1.1](https://github.com/askap-vast/vast-pipeline/releases/v1.1.1) (2024-10-15)

Expand Down
92 changes: 35 additions & 57 deletions vast_pipeline/pipeline/new_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,14 @@ def get_image_rms_measurements(
return group


def parallel_get_rms_measurements(
def parallel_get_new_high_sigma(
df: pd.DataFrame, edge_buffer: float = 1.0
) -> pd.DataFrame:
"""
Wrapper function to use 'get_image_rms_measurements'
in parallel with Dask. nbeam is not an option here as that parameter
is fixed in forced extraction and so is made sure to be fixed here to. This
may change in the future.
Wrapper function to use 'get_image_rms_measurements' in parallel with Dask
and calculate the new high sigma. nbeam is not an option here as that
parameter is fixed in forced extraction and so is made sure to be fixed
here too. This may change in the future.

Args:
df:
Expand Down Expand Up @@ -244,30 +244,48 @@ def parallel_get_rms_measurements(
).compute(num_workers=n_cpu, scheduler='processes')
)

# We don't need all of the RMS measurements, just the lowest. Keeping all
# of them results in huge memory usage when merging. However, there is an
# existing bug: https://github.com/askap-vast/vast-pipeline/issues/713
# that means that we actually want the _highest_ in order to reproduce the
# current behaviour. Fixing the bug is beyond the scope of this PR because
# it means rewriting tests and test data.

df_to_merge = (df.drop_duplicates('source')
.drop(['img_diff_rms_path'], axis=1)
)

out_to_merge = (out.sort_values(
by=['source', 'img_diff_true_rms'], ascending=False
by=['source', 'img_diff_true_rms'], ascending=True
)
.drop_duplicates('source')
)

df = df_to_merge.merge(
new_sources_df = df_to_merge.merge(
out_to_merge[['source', 'img_diff_true_rms']],
left_on='source', right_on='source',
how='left'
)

# this removes those that are out of range
new_sources_df['img_diff_true_rms'] = (
new_sources_df['img_diff_true_rms'].fillna(0.)
)
new_sources_df = new_sources_df[
new_sources_df['img_diff_true_rms'] != 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be missing something but just in case- perhaps you want > 0. rather than != 0 ? Similar to #755

]

# calculate the true sigma
new_sources_df['true_sigma'] = (
new_sources_df['flux_peak'].values
/ new_sources_df['img_diff_true_rms'].values
)

return df
# keep only the highest for each source, rename for the database
new_sources_df = (
new_sources_df
.set_index('source')
.rename(columns={'true_sigma': 'new_high_sigma'})
)

# moving forward only the new_high_sigma column is needed, drop all
# others.
new_sources_df = new_sources_df[['new_high_sigma']]

return new_sources_df


def new_sources(
Expand Down Expand Up @@ -456,53 +474,13 @@ def new_sources(
# to measure the true rms at the source location.

# measure the actual rms in the previous images at
# the source location.

# PR#713: This part of the code should be rewritten to reflect the new
# behaviour of parallel_get_rms_measurements. That function should be
# renamed to something like parallel_get_new_high_sigma and all of the
# subsequent code in this function moved into it.

logger.debug("Getting rms measurements...")
# the source location and calculate the corresponding S/N

new_sources_df = parallel_get_rms_measurements(
new_sources_df = parallel_get_new_high_sigma(
new_sources_df, edge_buffer=edge_buffer
)
logger.debug(f"Time to get rms measurements: {debug_timer.reset()}s")

# this removes those that are out of range
new_sources_df['img_diff_true_rms'] = (
new_sources_df['img_diff_true_rms'].fillna(0.)
)
new_sources_df = new_sources_df[
new_sources_df['img_diff_true_rms'] != 0
]

# calculate the true sigma
new_sources_df['true_sigma'] = (
new_sources_df['flux_peak'].values
/ new_sources_df['img_diff_true_rms'].values
)

# We only care about the highest true sigma
# new_sources_df = new_sources_df.sort_values(
# by=['source', 'true_sigma']
# )

# keep only the highest for each source, rename for the database
new_sources_df = (
new_sources_df
# .drop_duplicates('source')
.set_index('source')
.rename(columns={'true_sigma': 'new_high_sigma'})
)

# moving forward only the new_high_sigma column is needed, drop all
# others.
new_sources_df = new_sources_df[['new_high_sigma']]

logger.debug(f"Time to do final cleanup steps {debug_timer.reset()}s")

logger.info(
'Total new source analysis time: %.2f seconds', timer.reset_init()
)
Expand Down
6 changes: 3 additions & 3 deletions vast_pipeline/tests/test_regression/test_epoch.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -173,7 +173,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -248,4 +248,4 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down
6 changes: 3 additions & 3 deletions vast_pipeline/tests/test_regression/test_normal.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -176,7 +176,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -256,4 +256,4 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)