diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2aa0f30..ba7dee6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,9 +38,3 @@ repos: - id: yamllint args: ["-d {extends: relaxed, rules: {line-length: {max: 120}}}"] stages: [commit, push] - - - repo: https://github.com/pryorda/dockerfilelint-precommit-hooks - rev: v0.1.0 - hooks: - - id: dockerfilelint - stages: [commit, push] diff --git a/CHANGELOG.md b/CHANGELOG.md index c309092..f8b85f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- [issue/6](https://github.com/danielfromearth/batchee/issues/6): Create Adapter code that processes a Harmony Message and STAC Catalog +- [issue/7](https://github.com/danielfromearth/batchee/issues/7): Create working Docker image - [issue/13](https://github.com/danielfromearth/batchee/issues/13): Add simple command line interface for testing - [issue/16](https://github.com/danielfromearth/batchee/issues/16): Add a logo -- [issue/6](https://github.com/danielfromearth/batchee/issues/6): Create Adapter code that processes a Harmony Message and STAC Catalog ### Changed - [issue/11](https://github.com/danielfromearth/batchee/issues/11): Rename from concat_batcher to batchee - [issue/21](https://github.com/danielfromearth/batchee/issues/21): Improve CICD workflows diff --git a/docker/Dockerfile b/Dockerfile similarity index 81% rename from docker/Dockerfile rename to Dockerfile index 1f74124..96036d1 100644 --- a/docker/Dockerfile +++ b/Dockerfile @@ -9,8 +9,9 @@ RUN apt-get update \ #hdf5-helpers \ && pip3 install --upgrade pip \ && pip3 install cython \ - && apt-get clean \ - && pip3 install poetry + && pip3 install poetry \ + && apt-get clean + # Create a new user RUN adduser --quiet --disabled-password --shell /bin/sh --home /home/dockeruser --gecos "" --uid 1000 dockeruser @@ -28,12 +29,13 @@ ARG DIST_PATH USER root RUN 
mkdir -p /worker && chown dockeruser /worker -COPY ../pyproject.toml /worker +COPY pyproject.toml /worker +# COPY ../pyproject.toml /worker USER dockeruser WORKDIR /worker -ENV PYTHONPATH=${PYTHONPATH}:${PWD} +# ENV PYTHONPATH=${PYTHONPATH}:${PWD} COPY --chown=dockeruser $DIST_PATH $DIST_PATH USER dockeruser @@ -46,5 +48,6 @@ RUN poetry config virtualenvs.create false RUN poetry install --no-dev USER dockeruser -# Run the Batchee Harmony service -ENTRYPOINT ["batchee_harmony"] \ No newline at end of file +COPY --chown=dockeruser ./docker-entrypoint.sh docker-entrypoint.sh +# Run the service +ENTRYPOINT ["./docker-entrypoint.sh"] diff --git a/README.md b/README.md index a6e3a04..0811fa8 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,36 @@ _____ _Batchee_ groups together filenames so that further operations (such as concatenation) can be performed separately on each group of files. + +## Installing +_____ + +For local development, one can clone the repository and then use poetry or pip from the local directory: + +```shell +git clone https://github.com/danielfromearth/batchee.git +``` + +###### (Option A) using poetry: +i) Follow the instructions for installing `poetry` [here](https://python-poetry.org/docs/). + +ii) Run ```poetry install``` from the repository directory. + +###### (Option B) using pip: Run ```pip install .``` from the repository directory. + +## Usage +_____ + +```shell +batchee [file_names ...] +``` + +###### Or, if installed using a `poetry` environment: +```shell +poetry run batchee [file_names ...] 
+``` + +#### Options + +- `-h`, `--help` show this help message and exit +- `-v`, `--verbose` Enable verbose output to stdout; useful for debugging diff --git a/batcher/harmony/cli.py b/batcher/harmony/cli.py index 61756ff..802d373 100644 --- a/batcher/harmony/cli.py +++ b/batcher/harmony/cli.py @@ -1,5 +1,4 @@ """A Harmony CLI wrapper around the concatenate-batcher""" -import sys from argparse import ArgumentParser import harmony @@ -7,27 +6,23 @@ from batcher.harmony.service_adapter import ConcatBatching as HarmonyAdapter -def main(argv, **kwargs): - """Main Harmony CLI entrypoint +def main(config: harmony.util.Config = None) -> None: + """Parse command line arguments and invoke the service to respond to them. - Parses command line arguments and invokes the appropriate method to respond to them + Parameters + ---------- + config : harmony.util.Config + harmony.util.Config is injectable for tests Returns ------- None """ - - config = None - # Optional: harmony.util.Config is injectable for tests - if "config" in kwargs: - config = kwargs.get("config") - parser = ArgumentParser( prog="Pre-concatenate-batching", description="Run the pre-concatenate-batching service" ) harmony.setup_cli(parser) - - args = parser.parse_args(argv[1:]) + args = parser.parse_args() if harmony.is_harmony_cli(args): harmony.run_cli(parser, args, HarmonyAdapter, cfg=config) else: @@ -35,4 +30,4 @@ def main(argv, **kwargs): if __name__ == "__main__": - main(sys.argv) + main() diff --git a/batcher/harmony/service_adapter.py b/batcher/harmony/service_adapter.py index f6078fa..31bff4b 100644 --- a/batcher/harmony/service_adapter.py +++ b/batcher/harmony/service_adapter.py @@ -50,6 +50,7 @@ def invoke(self): def process_catalog(self, catalog: pystac.Catalog): """Converts a list of STAC catalogs into a list of lists of STAC catalogs.""" + self.logger.info("process_catalog() started.") try: result = catalog.clone() result.id = str(uuid4()) @@ -58,16 +59,20 @@ def process_catalog(self, catalog: 
pystac.Catalog): # Get all the items from the catalog, including from child or linked catalogs items = list(self.get_all_catalog_items(catalog)) + self.logger.info(f"length of items==={len(items)}.") + # Quick return if catalog contains no items if len(items) == 0: return result # # --- Get granule filepaths (urls) --- netcdf_urls: list[str] = _get_netcdf_urls(items) + self.logger.info(f"netcdf_urls==={netcdf_urls}.") # --- Map each granule to an index representing the batch to which it belongs --- - batch_indices: list[int] = get_batch_indices(netcdf_urls) + batch_indices: list[int] = get_batch_indices(netcdf_urls, self.logger) sorted(set(batch_indices), key=batch_indices.index) + self.logger.info(f"batch_indices==={batch_indices}.") # --- Construct a dictionary with a separate key for each batch --- grouped: dict[int, list[Item]] = {} @@ -83,6 +88,8 @@ def process_catalog(self, catalog: pystac.Catalog): bounding_box = _get_output_bounding_box(batch_items) properties = _get_output_date_range(batch_items) + self.logger.info(f"constructing new pystac.Item for batch_id==={batch_id}.") + # Construct a new pystac.Item with every granule in the batch as a pystac.Asset output_item = Item( str(uuid4()), bbox_to_geometry(bounding_box), bounding_box, None, properties @@ -90,7 +97,7 @@ def process_catalog(self, catalog: pystac.Catalog): for idx, item in enumerate(batch_items): output_item.add_asset( - "data", + f"data_{idx}", Asset( batch_urls[idx], title=batch_urls[idx], @@ -101,6 +108,8 @@ def process_catalog(self, catalog: pystac.Catalog): result.add_item(output_item) + self.logger.info("STAC catalog creation complete.") + return result except Exception as service_exception: diff --git a/batcher/tempo_filename_parser.py b/batcher/tempo_filename_parser.py index 7d6dbe4..06488d4 100644 --- a/batcher/tempo_filename_parser.py +++ b/batcher/tempo_filename_parser.py @@ -4,6 +4,8 @@ from argparse import ArgumentParser from pathlib import Path +default_logger = 
logging.getLogger(__name__) + tempo_granule_filename_pattern = re.compile( r"^.*TEMPO_" r"(?P[1-9A-Z]+)" @@ -17,13 +19,15 @@ ) -def get_batch_indices(filenames: list) -> list[int]: +def get_batch_indices(filenames: list, logger: logging.Logger = default_logger) -> list[int]: """ Returns ------- list[int] batch index for each filename in the original list, e.g. [0, 0, 0, 1, 1, 1, ...] """ + logger.info(f"get_batch_indices() starting --- with {len(filenames)} filenames") + # Make a new list with days and scans, e.g. [('20130701', 'S009'), ('20130701', 'S009'), ...] day_and_scans: list[tuple[str, str]] = [] for name in filenames: @@ -35,6 +39,8 @@ def get_batch_indices(filenames: list) -> list[int]: # Unique day-scans are determined (while keeping the same order). Each will be its own batch. unique_day_scans: list[tuple[str, str]] = sorted(set(day_and_scans), key=day_and_scans.index) + logger.info(f"unique_day_scans==={unique_day_scans}.") + # Map each day/scan to an integer batch_mapper: dict[tuple[str, str], int] = { day_scan: idx for idx, day_scan in enumerate(unique_day_scans) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh new file mode 100755 index 0000000..9ce02d6 --- /dev/null +++ b/docker-entrypoint.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +if [ "$1" = 'batchee' ]; then + exec batchee "$@" +elif [ "$1" = 'batchee_harmony' ]; then + exec batchee_harmony "$@" +else + exec batchee_harmony "$@" +fi diff --git a/docker/Readme.md b/docker/Readme.md deleted file mode 100644 index 5f05138..0000000 --- a/docker/Readme.md +++ /dev/null @@ -1,8 +0,0 @@ -# Batchee Service Docker Image - -This directory contains the `Dockerfile` used to build the Docker image capable of running the Batchee service. - -## Building - -The docker image is setup to install the Batchee project into userspace using pip. It will look -in both PyPi and TestPyPi indexes unless building from a local wheel file. 
\ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 353a20f..f53d2bd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "black" @@ -20,7 +20,6 @@ files = [ {file = "black-23.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:840015166dbdfbc47992871325799fd2dc0dcf9395e401ada6d88fe11498abad"}, {file = "black-23.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:037e9b4664cafda5f025a1728c50a9e9aedb99a759c89f760bd83730e76ba884"}, {file = "black-23.10.1-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:7cb5936e686e782fddb1c73f8aa6f459e1ad38a6a7b0e54b403f1f05a1507ee9"}, - {file = "black-23.10.1-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:7670242e90dc129c539e9ca17665e39a146a761e681805c54fbd86015c7c84f7"}, {file = "black-23.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed45ac9a613fb52dad3b61c8dea2ec9510bf3108d4db88422bacc7d1ba1243d"}, {file = "black-23.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:6d23d7822140e3fef190734216cefb262521789367fbdc0b3f22af6744058982"}, {file = "black-23.10.1-py3-none-any.whl", hash = "sha256:d431e6739f727bb2e0495df64a6c7a5310758e87505f5f8cde9ff6c0f2d7e4fe"}, @@ -44,17 +43,17 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "boto3" -version = "1.28.70" +version = "1.28.73" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.28.70-py3-none-any.whl", hash = "sha256:22ec3b54801c81746657827c7b1c4a3b2e4cfa7c21be3b96218d32e9390ee5eb"}, - {file = "boto3-1.28.70.tar.gz", hash = "sha256:89002e1d8411c7c54110f9f8fc4a11d57d6d7977c0cb4ba064887ca5d4c788f7"}, + {file = "boto3-1.28.73-py3-none-any.whl", hash = "sha256:bbe377a288b6b12b526fae3b3d743318c6868626cf67e1e97f104345a5194b1e"}, + {file = 
"boto3-1.28.73.tar.gz", hash = "sha256:a61cf96f7e196b1450afdf4856b7ea0e58080752e687b0011157be96934489be"}, ] [package.dependencies] -botocore = ">=1.31.70,<1.32.0" +botocore = ">=1.31.73,<1.32.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.7.0,<0.8.0" @@ -63,13 +62,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.31.70" +version = "1.31.73" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.31.70-py3-none-any.whl", hash = "sha256:049bbf526c95b6169f59617a5ff1b0061cb7a0e44992b8c27c6955832b383988"}, - {file = "botocore-1.31.70.tar.gz", hash = "sha256:5f49def4ec2e4216dd0195d23d9811027d02ee6c8a37b031e2b2fe38e8c77ddc"}, + {file = "botocore-1.31.73-py3-none-any.whl", hash = "sha256:6e9caaa7205e0c0505f4868a4053e96eaf3f4b6bce0368a46970a8efeeacb492"}, + {file = "botocore-1.31.73.tar.gz", hash = "sha256:5334c22d5a3f4643931896137c57b2496fef005b039d87d8740e7a28eb31519d"}, ] [package.dependencies] @@ -771,4 +770,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "136176ca1dd485addcf435f780b432bec2f148825e662f25538a3fa091984e97" +content-hash = "13d72d5e93ac5ddb6bd054b98d9cf04ed920d251e91e623fb0a48a17b777cf2e" diff --git a/pyproject.toml b/pyproject.toml index 0cd1019..1c090dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,17 +15,18 @@ packages = [ python = "^3.10" harmony-service-lib = "^1.0.23" +[tool.poetry.scripts] +batchee_harmony = 'batcher.harmony.cli:main' +batchee = 'batcher.tempo_filename_parser:main' + [tool.poetry.group.dev.dependencies] +coverage = "^7.3.2" +ruff = "^0.1.3" pytest = "^7.4.3" black = "^23.10.1" mypy = "^1.6.1" -ruff = "^0.1.3" pytest-cov = "^4.1.0" -[tool.poetry.scripts] -batchee_harmony = 'batcher.harmony.cli:main' -batchee = 'batcher.tempo_filename_parser:main' - [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api"