diff --git a/CHANGELOG.md b/CHANGELOG.md index f0928d75b..b25d26cf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,15 +11,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), #### Changed +- Reorganise new_high_sigma calculation into dedicated function [#714](https://github.com/askap-vast/vast-pipeline/pull/714) #### Fixed +- Fix incorrect calculation of the new_high_sigma parameter [#714](https://github.com/askap-vast/vast-pipeline/pull/714) #### Removed #### List of PRs +- Correctly calculate new_high_sigma parameter [#714](https://github.com/askap-vast/vast-pipeline/pull/714) ## [1.1.1](https://github.com/askap-vast/vast-pipeline/releases/v1.1.1) (2024-10-15) diff --git a/vast_pipeline/pipeline/new_sources.py b/vast_pipeline/pipeline/new_sources.py index 14d2b3dfb..36ca421c3 100644 --- a/vast_pipeline/pipeline/new_sources.py +++ b/vast_pipeline/pipeline/new_sources.py @@ -196,14 +196,14 @@ def get_image_rms_measurements( return group -def parallel_get_rms_measurements( +def parallel_get_new_high_sigma( df: pd.DataFrame, edge_buffer: float = 1.0 ) -> pd.DataFrame: """ - Wrapper function to use 'get_image_rms_measurements' - in parallel with Dask. nbeam is not an option here as that parameter - is fixed in forced extraction and so is made sure to be fixed here to. This - may change in the future. + Wrapper function to use 'get_image_rms_measurements' in parallel with Dask + and calculate the new high sigma. nbeam is not an option here as that + parameter is fixed in forced extraction and so is made sure to be fixed + here too. This may change in the future. Args: df: @@ -244,30 +244,48 @@ def parallel_get_rms_measurements( ).compute(num_workers=n_cpu, scheduler='processes') ) - # We don't need all of the RMS measurements, just the lowest. Keeping all - # of them results in huge memory usage when merging. However, there is an - # existing bug: https://github.com/askap-vast/vast-pipeline/issues/713 - # that means that we actually want the _highest_ in order to reproduce the - # current behaviour. Fixing the bug is beyond the scope of this PR because - # it means rewriting tests and test data. - df_to_merge = (df.drop_duplicates('source') .drop(['img_diff_rms_path'], axis=1) ) out_to_merge = (out.sort_values( - by=['source', 'img_diff_true_rms'], ascending=False + by=['source', 'img_diff_true_rms'], ascending=True ) .drop_duplicates('source') ) - df = df_to_merge.merge( + new_sources_df = df_to_merge.merge( out_to_merge[['source', 'img_diff_true_rms']], left_on='source', right_on='source', how='left' ) + + # this removes those that are out of range + new_sources_df['img_diff_true_rms'] = ( + new_sources_df['img_diff_true_rms'].fillna(0.) + ) + new_sources_df = new_sources_df[ + new_sources_df['img_diff_true_rms'] != 0 + ] + + # calculate the true sigma + new_sources_df['true_sigma'] = ( + new_sources_df['flux_peak'].values + / new_sources_df['img_diff_true_rms'].values + ) - return df + # keep only the highest for each source, rename for the daatabase + new_sources_df = ( + new_sources_df + .set_index('source') + .rename(columns={'true_sigma': 'new_high_sigma'}) + ) + + # moving forward only the new_high_sigma columns is needed, drop all + # others. + new_sources_df = new_sources_df[['new_high_sigma']] + + return new_sources_df def new_sources( @@ -456,53 +474,13 @@ def new_sources( # to measure the true rms at the source location. # measure the actual rms in the previous images at - # the source location. - - # PR#713: This part of the code should be rewritten to reflect the new - # behaviour of parallel_get_rms_measurements. That function should be - # renamed to something like parallel_get_new_high_sigma and all of the - # subsequent code in this function moved into it. - - logger.debug("Getting rms measurements...") + # the source location and calculate the corresponding S/N - new_sources_df = parallel_get_rms_measurements( + new_sources_df = parallel_get_new_high_sigma( new_sources_df, edge_buffer=edge_buffer ) logger.debug(f"Time to get rms measurements: {debug_timer.reset()}s") - # this removes those that are out of range - new_sources_df['img_diff_true_rms'] = ( - new_sources_df['img_diff_true_rms'].fillna(0.) - ) - new_sources_df = new_sources_df[ - new_sources_df['img_diff_true_rms'] != 0 - ] - - # calculate the true sigma - new_sources_df['true_sigma'] = ( - new_sources_df['flux_peak'].values - / new_sources_df['img_diff_true_rms'].values - ) - - # We only care about the highest true sigma - # new_sources_df = new_sources_df.sort_values( - # by=['source', 'true_sigma'] - # ) - - # keep only the highest for each source, rename for the daatabase - new_sources_df = ( - new_sources_df - # .drop_duplicates('source') - .set_index('source') - .rename(columns={'true_sigma': 'new_high_sigma'}) - ) - - # moving forward only the new_high_sigma columns is needed, drop all - # others. - new_sources_df = new_sources_df[['new_high_sigma']] - - logger.debug(f"Time to to do final cleanup steps {debug_timer.reset()}s") - logger.info( 'Total new source analysis time: %.2f seconds', timer.reset_init() ) diff --git a/vast_pipeline/tests/test_regression/test_epoch.py b/vast_pipeline/tests/test_regression/test_epoch.py index 05d889274..72887df78 100644 --- a/vast_pipeline/tests/test_regression/test_epoch.py +++ b/vast_pipeline/tests/test_regression/test_epoch.py @@ -100,7 +100,7 @@ def test_known_source(self): ''' See documentation for test_known_source in property check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943) @unittest.skipIf( @@ -173,7 +173,7 @@ def test_known_source(self): ''' See documentation for test_known_source in property_check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943) @unittest.skipIf( @@ -248,4 +248,4 @@ def test_known_source(self): ''' See documentation for test_known_source in property_check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943) diff --git a/vast_pipeline/tests/test_regression/test_epoch_parallel_add_image.py b/vast_pipeline/tests/test_regression/test_epoch_parallel_add_image.py index 793beb4fa..9a8695376 100644 --- a/vast_pipeline/tests/test_regression/test_epoch_parallel_add_image.py +++ b/vast_pipeline/tests/test_regression/test_epoch_parallel_add_image.py @@ -298,7 +298,7 @@ def test_known_source(self): ''' See documentation for test_known_source in property_check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943) @unittest.skipIf( diff --git a/vast_pipeline/tests/test_regression/test_normal.py b/vast_pipeline/tests/test_regression/test_normal.py index 79e3be417..74062e089 100644 --- a/vast_pipeline/tests/test_regression/test_normal.py +++ b/vast_pipeline/tests/test_regression/test_normal.py @@ -93,7 +93,7 @@ def test_known_source(self): ''' See documentation for test_known_source in property_check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943) @unittest.skipIf( @@ -176,7 +176,7 @@ def test_known_source(self): ''' See documentation for test_known_source in property_check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943) @unittest.skipIf( @@ -256,4 +256,4 @@ def test_known_source(self): ''' See documentation for test_known_source in property_check. ''' - property_check.test_known_source(self, self.sources, 12.369) + property_check.test_known_source(self, self.sources, 13.943)