Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly calculate new_high_sigma #714

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

#### Changed

- Reorganise new_high_sigma calculation into dedicated function [#714](https://github.com/askap-vast/vast-pipeline/pull/714)

#### Fixed

- Fix incorrect calculation of the new_high_sigma parameter [#714](https://github.com/askap-vast/vast-pipeline/pull/714)

#### Removed


#### List of PRs

- Correctly calculate new_high_sigma parameter [#714](https://github.com/askap-vast/vast-pipeline/pull/714)

## [1.1.1](https://github.com/askap-vast/vast-pipeline/releases/v1.1.1) (2024-10-15)

Expand Down
92 changes: 35 additions & 57 deletions vast_pipeline/pipeline/new_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,14 @@ def get_image_rms_measurements(
return group


def parallel_get_rms_measurements(
def parallel_get_new_high_sigma(
df: pd.DataFrame, edge_buffer: float = 1.0
) -> pd.DataFrame:
"""
Wrapper function to use 'get_image_rms_measurements'
in parallel with Dask. nbeam is not an option here as that parameter
is fixed in forced extraction and so is made sure to be fixed here to. This
may change in the future.
Wrapper function to use 'get_image_rms_measurements' in parallel with Dask
and calculate the new high sigma. nbeam is not an option here as that
parameter is fixed in forced extraction and so is made sure to be fixed
here too. This may change in the future.

Args:
df:
Expand Down Expand Up @@ -244,30 +244,48 @@ def parallel_get_rms_measurements(
).compute(num_workers=n_cpu, scheduler='processes')
)

# We don't need all of the RMS measurements, just the lowest. Keeping all
# of them results in huge memory usage when merging. However, there is an
# existing bug: https://github.com/askap-vast/vast-pipeline/issues/713
# that means that we actually want the _highest_ in order to reproduce the
# current behaviour. Fixing the bug is beyond the scope of this PR because
# it means rewriting tests and test data.

df_to_merge = (df.drop_duplicates('source')
.drop(['img_diff_rms_path'], axis=1)
)

out_to_merge = (out.sort_values(
by=['source', 'img_diff_true_rms'], ascending=False
by=['source', 'img_diff_true_rms'], ascending=True
)
.drop_duplicates('source')
)

df = df_to_merge.merge(
new_sources_df = df_to_merge.merge(
out_to_merge[['source', 'img_diff_true_rms']],
left_on='source', right_on='source',
how='left'
)

# this removes those that are out of range
new_sources_df['img_diff_true_rms'] = (
new_sources_df['img_diff_true_rms'].fillna(0.)
)
new_sources_df = new_sources_df[
new_sources_df['img_diff_true_rms'] != 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be missing something but just in case- perhaps you want > 0. rather than != 0 ? Similar to #755

]

# calculate the true sigma
new_sources_df['true_sigma'] = (
new_sources_df['flux_peak'].values
/ new_sources_df['img_diff_true_rms'].values
)

return df
# keep only the highest for each source, rename for the database
new_sources_df = (
new_sources_df
.set_index('source')
.rename(columns={'true_sigma': 'new_high_sigma'})
)

# moving forward only the new_high_sigma column is needed, drop all
# others.
new_sources_df = new_sources_df[['new_high_sigma']]

return new_sources_df


def new_sources(
Expand Down Expand Up @@ -456,53 +474,13 @@ def new_sources(
# to measure the true rms at the source location.

# measure the actual rms in the previous images at
# the source location.

# PR#713: This part of the code should be rewritten to reflect the new
# behaviour of parallel_get_rms_measurements. That function should be
# renamed to something like parallel_get_new_high_sigma and all of the
# subsequent code in this function moved into it.

logger.debug("Getting rms measurements...")
# the source location and calculate the corresponding S/N

new_sources_df = parallel_get_rms_measurements(
new_sources_df = parallel_get_new_high_sigma(
new_sources_df, edge_buffer=edge_buffer
)
logger.debug(f"Time to get rms measurements: {debug_timer.reset()}s")

# this removes those that are out of range
new_sources_df['img_diff_true_rms'] = (
new_sources_df['img_diff_true_rms'].fillna(0.)
)
new_sources_df = new_sources_df[
new_sources_df['img_diff_true_rms'] != 0
]

# calculate the true sigma
new_sources_df['true_sigma'] = (
new_sources_df['flux_peak'].values
/ new_sources_df['img_diff_true_rms'].values
)

# We only care about the highest true sigma
# new_sources_df = new_sources_df.sort_values(
# by=['source', 'true_sigma']
# )

# keep only the highest for each source, rename for the database
new_sources_df = (
new_sources_df
# .drop_duplicates('source')
.set_index('source')
.rename(columns={'true_sigma': 'new_high_sigma'})
)

# moving forward only the new_high_sigma column is needed, drop all
# others.
new_sources_df = new_sources_df[['new_high_sigma']]

logger.debug(f"Time to do final cleanup steps {debug_timer.reset()}s")

logger.info(
'Total new source analysis time: %.2f seconds', timer.reset_init()
)
Expand Down
6 changes: 3 additions & 3 deletions vast_pipeline/tests/test_regression/test_epoch.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -173,7 +173,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -248,4 +248,4 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down
6 changes: 3 additions & 3 deletions vast_pipeline/tests/test_regression/test_normal.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -176,7 +176,7 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)


@unittest.skipIf(
Expand Down Expand Up @@ -256,4 +256,4 @@ def test_known_source(self):
'''
See documentation for test_known_source in property_check.
'''
property_check.test_known_source(self, self.sources, 12.369)
property_check.test_known_source(self, self.sources, 13.943)