Skip to content

Commit

Permalink
Merge pull request #221 from nasa/feature/issue-198-ordered-output
Browse files Browse the repository at this point in the history
Feature/issue 198 ordered output
  • Loading branch information
danielfromearth authored Jul 9, 2024
2 parents dd870a7 + 391e4e0 commit 09b8099
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 61 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- [Issue #206](https://github.com/nasa/stitchee/issues/206): Group dependabot updates into one PR
- [Issue #208](https://github.com/nasa/stitchee/issues/208): Increase continuous integration/unit test coverage
- [Issue #198](https://github.com/nasa/stitchee/issues/198): Use time variable instead of concat dim for ordering datasets
### Deprecated
### Removed
### Fixed
Expand Down
24 changes: 14 additions & 10 deletions concatenator/attribute_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
Examples
--------
>>> coord_att = "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
>>> _flatten_coordinate_attribute(coord_att)
>>> flatten_string_with_groups(coord_att)
Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude
Parameters
Expand Down Expand Up @@ -54,44 +54,48 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
def flatten_coordinate_attribute_paths(
dataset: netCDF4.Dataset, var: netCDF4.Variable, variable_name: str
) -> None:
"""Flatten the paths of variables referenced in the coordinates attribute."""
"""Flatten the paths of variables referenced in the 'coordinates' attribute."""
if "coordinates" in var.ncattrs():
coord_att = var.getncattr("coordinates")

new_coord_att = _flatten_coordinate_attribute(coord_att)
new_coord_att = flatten_string_with_groups(coord_att)

dataset.variables[variable_name].setncattr("coordinates", new_coord_att)


def _flatten_coordinate_attribute(attribute_string: str) -> str:
"""Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level.
def flatten_string_with_groups(str_with_groups: str) -> str:
"""Determine separator and flatten string specifying group membership via "/".
Applies to variable paths or attributes, even for the root level.
Examples
--------
>>> coord_att = "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
>>> _flatten_coordinate_attribute(coord_att)
>>> flatten_string_with_groups(coord_att)
__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude
Parameters
----------
attribute_string : str
str_with_groups : str
Returns
-------
str
"""
# Use the separator that's in the attribute string only if all separators in the string are the same.
# Otherwise, we will use our own default separator.
whitespaces = re.findall(r"\s+", attribute_string)
if len(set(whitespaces)) <= 1:
whitespaces = re.findall(r"\s+", str_with_groups)
if len(set(whitespaces)) == 0:
new_sep = ""
elif len(set(whitespaces)) == 1:
new_sep = whitespaces[0]
else:
new_sep = concatenator.coord_delim

# A new string is constructed.
return new_sep.join(
f'{concatenator.group_delim}{c.replace("/", concatenator.group_delim)}'
for c in attribute_string.split() # split on any whitespace
for c in str_with_groups.split() # split on any whitespace
)


Expand Down
9 changes: 5 additions & 4 deletions concatenator/dataset_and_group_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import annotations

import re
from logging import Logger

import netCDF4 as nc
import numpy as np
Expand Down Expand Up @@ -273,7 +274,7 @@ def _get_nested_group(dataset: nc.Dataset, group_path: str) -> nc.Group:
return nested_group


def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize=4000) -> tuple:
def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize: int = 4000) -> tuple:
"""
For the given dataset, calculate if the size on any dimension is
worth chunking. Any dimension larger than 4000 will be chunked. This
Expand Down Expand Up @@ -324,8 +325,8 @@ def _get_dimension_size(dataset: nc.Dataset, dim_name: str) -> int:
return dim_size


def validate_workable_files(files, logger) -> tuple[list[str], int]:
"""Remove files from list that are not open-able as netCDF or that are empty."""
def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
"""Remove files from a list that are not open-able as netCDF or that are empty."""
workable_files = []
for file in files:
try:
Expand All @@ -336,7 +337,7 @@ def validate_workable_files(files, logger) -> tuple[list[str], int]:
except OSError:
logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)

# addressing the issue 153: propagate first empty file if all input files are empty
# addressing GitHub issue 153: propagate the first empty file if all input files are empty
if (len(workable_files)) == 0 and (len(files) > 0):
workable_files.append(files[0])

Expand Down
7 changes: 6 additions & 1 deletion concatenator/stitchee.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import xarray as xr

import concatenator
from concatenator.attribute_handling import flatten_string_with_groups
from concatenator.dataset_and_group_handling import (
flatten_grouped_dataset,
regroup_flattened_dataset,
Expand All @@ -39,6 +40,7 @@ def stitchee(
concat_method: str | None = "xarray-concat",
concat_dim: str = "",
concat_kwargs: dict | None = None,
time_variable: str = "geolocation/time",
history_to_append: str | None = None,
copy_input_files: bool = False,
overwrite_output_file: bool = False,
Expand Down Expand Up @@ -136,7 +138,10 @@ def stitchee(
decode_coords=False,
drop_variables=coord_vars,
)
first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]

# Determine value for later dataset sorting.
first_value = xrds[flatten_string_with_groups(time_variable)].values.flatten()[0]
# first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
concat_dim_order.append(first_value)

benchmark_log["flattening"] = time.time() - start_time
Expand Down
4 changes: 4 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ class TempDirs(typing.NamedTuple):
toy_data_path: Path


def path_str(dir_path: Path, filename: str) -> str:
    """Return the string form of *dir_path* extended with *filename*."""
    return str(dir_path / filename)


def pytest_addoption(parser):
"""Sets up optional argument to keep temporary testing directory."""
parser.addoption(
Expand Down
1 change: 1 addition & 0 deletions tests/integration/test_history_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def test_construct_and_append_history_for_sample_concatenation(
concat_method="xarray-concat",
history_to_append=new_history_json,
concat_dim="step",
time_variable="step",
)
stitcheed_dataset = xr.open_dataset(output_path)

Expand Down
14 changes: 11 additions & 3 deletions tests/unit/test_attribute_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,37 @@

import concatenator
from concatenator.attribute_handling import (
_flatten_coordinate_attribute,
flatten_string_with_groups,
regroup_coordinate_attribute,
)


def test_coordinate_attribute_flattening():
# Case with groups present and double spaces.
assert (
_flatten_coordinate_attribute(
flatten_string_with_groups(
"Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
)
== "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
)

# Case with NO groups present and single spaces.
assert (
_flatten_coordinate_attribute(
flatten_string_with_groups(
"time longitude latitude ozone_profile_pressure ozone_profile_altitude"
)
== "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
)


def test_variable_path_flattening():
    """Single variable paths are flattened with the group delimiter prefix."""
    # (input path, expected flattened form): one case with a group, one without.
    cases = {
        "geolocation/time": "__geolocation__time",  # group present
        "time": "__time",  # root-level (no group) variable
    }
    for path, expected in cases.items():
        assert flatten_string_with_groups(path) == expected


def test_coordinate_attribute_regrouping():
# Case with groups present and double spaces.
assert (
Expand Down
72 changes: 32 additions & 40 deletions tests/unit/test_group_handling.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,36 @@
"""Tests for manipulating netCDF groups."""

# pylint: disable=C0116, C0301
import logging
from pathlib import Path

from concatenator.attribute_handling import (
_flatten_coordinate_attribute,
regroup_coordinate_attribute,
)


def test_coordinate_attribute_flattening():
# Case with groups present and double spaces.
assert (
_flatten_coordinate_attribute(
"Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
)
== "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
)

# Case with NO groups present and single spaces.
assert (
_flatten_coordinate_attribute(
"time longitude latitude ozone_profile_pressure ozone_profile_altitude"
)
== "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
)


def test_coordinate_attribute_regrouping():
# Case with groups present and double spaces.
assert (
regroup_coordinate_attribute(
"__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
)
== "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
)

# Case with NO groups present and single spaces.
assert (
regroup_coordinate_attribute(
"__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
)
== "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
)
import pytest

from concatenator.dataset_and_group_handling import validate_workable_files

from ..conftest import path_str

logger = logging.getLogger(__name__)


@pytest.mark.usefixtures("pass_options")
class TestGroupHandling:
__test_path = Path(__file__).parents[1].resolve()
__data_path = __test_path.joinpath("data")
__harmony_path = __data_path.joinpath("harmony")
__granules_path = __harmony_path.joinpath("granules")

def test_workable_files_validation(self, temp_output_dir):
filepaths = [
path_str(
self.__granules_path, "TEMPO_NO2_L2_V03_20240601T210934Z_S012G01_subsetted.nc4"
),
path_str(
self.__granules_path, "TEMPO_NO2_L2_V03_20240601T211614Z_S012G02_subsetted.nc4"
),
path_str(
self.__granules_path, "TEMPO_NO2_L2_V03_20240601T212254Z_S012G03_subsetted.nc4"
),
]

assert validate_workable_files(filepaths, logger)[1] == 3
4 changes: 1 addition & 3 deletions tests/unit/test_run_stitchee.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
import concatenator
from concatenator.run_stitchee import parse_args


def path_str(dir_path: Path, filename: str) -> str:
return str(dir_path.joinpath(filename))
from ..conftest import path_str


def test_parser():
Expand Down

0 comments on commit 09b8099

Please sign in to comment.