From bba76ad1a104bef147bcc959b4fa952a330a7f09 Mon Sep 17 00:00:00 2001 From: Vijay Vammi Date: Wed, 14 Feb 2024 12:01:44 +0000 Subject: [PATCH] feat: Getting a bit closer to release --- .github/workflows/release.yaml | 19 +- README.md | 10 +- docs/.DS_Store | Bin 8196 -> 8196 bytes docs/command-line.md | 294 --- docs/concepts/catalog.md | 16 +- docs/concepts/executor.md | 24 +- docs/concepts/experiment-tracking.md | 12 +- docs/concepts/map.md | 10 +- docs/concepts/nesting.md | 2 +- docs/concepts/parallel.md | 8 +- docs/concepts/parameters.md | 8 +- docs/concepts/run-log.md | 14 +- docs/concepts/secrets.md | 4 +- docs/concepts/task.md | 34 +- docs/concepts/the-big-picture.md | 50 +- docs/configurations/catalog.md | 2 +- docs/configurations/executors/argo.md | 34 +- .../executors/container-environments.md | 8 +- .../executors/local-container.md | 18 +- docs/configurations/executors/local.md | 2 +- docs/configurations/executors/mocked.md | 4 +- docs/configurations/overview.md | 14 +- docs/configurations/run-log.md | 6 +- docs/configurations/secrets.md | 6 +- docs/example/dataflow.md | 8 +- docs/example/example.md | 18 +- docs/example/experiment-tracking.md | 6 +- docs/example/reproducibility.md | 4 +- docs/example/retry-after-failure.md | 4 +- docs/example/secrets.md | 2 +- docs/example/steps.md | 18 +- docs/examples.md | 1879 ----------------- docs/extensions.md | 61 +- docs/how-do-i.md | 41 - docs/index.md | 24 +- .../integration.py => docs/roadmap.md | 0 docs/usage.md | 6 +- examples/concepts/catalog_object.py | 2 + examples/concepts/nesting.py | 2 +- .../notebook_api_parameters_out.ipynb | 22 +- .../notebook_env_parameters_out.ipynb | 22 +- .../notebook_native_parameters_out.ipynb | 26 +- examples/concepts/simple_notebook_out.ipynb | 22 +- examples/python-tasks-argo.py | 2 +- magnus/__init__.py | 2 - magnus/cli.py | 159 -- magnus/entrypoints.py | 143 +- magnus/executor.py | 14 - magnus/experiment_tracker.py | 2 +- .../executor/argo/implementation.py | 70 +- .../extensions/executor/argo/integration.py | 55 - .../executor/demo_renderer/__init__.py | 0 .../executor/demo_renderer/implementation.py | 126 -- .../local_container/implementation.py | 51 + .../executor/mocked/implementation.py | 33 + .../run_log_store/file_system/integration.py | 25 - .../extensions/secrets/dotenv/integration.py | 40 - .../secrets/env_secrets/integration.py | 24 - magnus/nodes.py | 8 +- magnus/parameters.py | 2 +- magnus/sdk.py | 86 - magnus/tasks.py | 214 +- mkdocs.yml | 6 +- pyproject.toml | 3 - .../extensions/executor/test_argo_executor.py | 99 +- .../executor/test_argo_integration.py | 30 - .../executor/test_generic_executor.py | 57 +- .../test_file_system_integration.py | 62 - .../secrets/test_dotenv_integration.py | 48 - .../secrets/test_env_secrets_integration.py | 12 - tests/magnus/test_interaction.py | 24 - tests/magnus/test_nodes.py | 6 +- tests/magnus/test_tasks.py | 7 - tests/magnus/test_utils.py | 4 +- tests/test_examples.py | 93 +- tox.ini | 2 +- 76 files changed, 530 insertions(+), 3745 deletions(-) delete mode 100644 docs/command-line.md delete mode 100644 docs/examples.md delete mode 100644 docs/how-do-i.md rename magnus/extensions/catalog/file_system/integration.py => docs/roadmap.md (100%) delete mode 100644 magnus/extensions/executor/argo/integration.py delete mode 100644 magnus/extensions/executor/demo_renderer/__init__.py delete mode 100644 magnus/extensions/executor/demo_renderer/implementation.py delete mode 100644 magnus/extensions/run_log_store/file_system/integration.py delete mode 100644 
magnus/extensions/secrets/dotenv/integration.py delete mode 100644 magnus/extensions/secrets/env_secrets/integration.py delete mode 100644 tests/magnus/extensions/executor/test_argo_integration.py delete mode 100644 tests/magnus/extensions/run_log_store/test_file_system_integration.py delete mode 100644 tests/magnus/extensions/secrets/test_dotenv_integration.py delete mode 100644 tests/magnus/extensions/secrets/test_env_secrets_integration.py diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 0739bd97..573bbd33 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,4 +1,3 @@ -#TODO: Fix release for non bump builds on: push: paths: @@ -31,15 +30,27 @@ jobs: with: python-version: 3.8 - run: python -m pip install python-semantic-release==8.0.7 - # - run: python -m poetry install --only release - name: Figure version + continue-on-error: true id: last_tag run: | + CURRENT=$(git tag --sort=-committerdate -l | head -n 1) + echo "current: $CURRENT" + VERSION=$(python -m semantic-release --noop --strict version --no-push --no-commit --print) - echo $VERSION + echo "New: $VERSION" + + if [ "$CURRENT" == "$VERSION" ]; then + echo "version=" >> $GITHUB_OUTPUT + exit 1 + fi + echo "version=$VERSION" >> $GITHUB_OUTPUT + exit 0 + - name: Apply new tag + if: steps.last_tag.outcome == 'success' env: VERSION: ${{ steps.last_tag.outputs.version }} uses: actions/github-script@v6 @@ -54,6 +65,7 @@ jobs: sha: context.sha }) - name: Publish to PyPI + if: steps.last_tag.outcome == 'success' env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} LAST_TAG: ${{ steps.last_tag.outputs.version }} @@ -64,6 +76,7 @@ jobs: poetry publish --build - name: "Create release" + if: steps.last_tag.outcome == 'success' env: RELEASE_TAG: ${{ steps.last_tag.outputs.version }} uses: "actions/github-script@v6" diff --git a/README.md b/README.md index 4329c181..75d45cf6 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ 

 [Logo image]

----
+[horizontal rule]

 [python version badge]
@@ -17,7 +17,7 @@ [Tests badge] [Docs badge]

----
+[horizontal rule]
Magnus is a simplified workflow definition language that helps in: @@ -48,17 +48,20 @@ such as [data catalog](https://astrazeneca.github.io/magnus-core/concepts/catalo [experiment tracking](https://astrazeneca.github.io/magnus-core/concepts/experiment-tracking) and secure [access to secrets](https://astrazeneca.github.io/magnus-core/concepts/secrets). +
## What does it do? ![works](assets/work.png) +
## Documentation [More details about the project and how to use it available here](https://astrazeneca.github.io/magnus-core/). +
## Installation @@ -71,6 +74,7 @@ pip install magnus Please look at the [installation guide](https://astrazeneca.github.io/magnus-core/usage) for more information. +
## Example @@ -391,3 +395,5 @@ Execute branches in parallel Execute a pipeline over an iterable parameter. [![](https://mermaid.ink/img/pako:eNqVlF1rwjAUhv9KyG4qKNR-3AS2m8nuBgN3Z0Sy5tQG20SSdE7E_76kVVEr2CY3Ied9Tx6Sk3PAmeKACc5LtcsKpi36nlGZFbXciHwfLN79CuWiBLMcEULWGkBSaeosA2OCxbxdXMd89Get2bZASsLiSyuvQE2mJZXIjW27t2rOmQZ3Gp9rD6UjatWnwy7q6zPPukd50WTydmemEiS_QbQ79RwxGoQY9UaMuojRA8TCXexzyHgQZNwbMu5Cxl3IXNX6OWMyiDHpzZh0GZMHjOK3xz2mgxjT3oxplzG9MPp5_nVOhwJjteDwOg3HyFj3L1dCcvh7DUc-iftX18n6Waet1xX8cG908vpKHO6OW7cvkeHm5GR2b3drdvaSGTODHLW37mxabYC8fLgRhlfxpjNdwmEets-Dx7gCXTHBXQc8-D2KbQEVUEzckjO9oZjKo9Ox2qr5XmaYWF3DGNdbzizMBHOVVWGSs9K4XeDCKv3ZttSmsx7_AYa341E?type=png)](https://mermaid.live/edit#pako:eNqVlF1rwjAUhv9KyG4qKNR-3AS2m8nuBgN3Z0Sy5tQG20SSdE7E_76kVVEr2CY3Ied9Tx6Sk3PAmeKACc5LtcsKpi36nlGZFbXciHwfLN79CuWiBLMcEULWGkBSaeosA2OCxbxdXMd89Get2bZASsLiSyuvQE2mJZXIjW27t2rOmQZ3Gp9rD6UjatWnwy7q6zPPukd50WTydmemEiS_QbQ79RwxGoQY9UaMuojRA8TCXexzyHgQZNwbMu5Cxl3IXNX6OWMyiDHpzZh0GZMHjOK3xz2mgxjT3oxplzG9MPp5_nVOhwJjteDwOg3HyFj3L1dCcvh7DUc-iftX18n6Waet1xX8cG908vpKHO6OW7cvkeHm5GR2b3drdvaSGTODHLW37mxabYC8fLgRhlfxpjNdwmEets-Dx7gCXTHBXQc8-D2KbQEVUEzckjO9oZjKo9Ox2qr5XmaYWF3DGNdbzizMBHOVVWGSs9K4XeDCKv3ZttSmsx7_AYa341E) + +### [Arbitrary nesting](https://astrazeneca.github.io/magnus-core/concepts/nesting/) diff --git a/docs/.DS_Store b/docs/.DS_Store index 038a1e407d164528fb8a394ea7741016156fd427..b996586c7fe27724a3808ad6ac19ed3675edc942 100644 GIT binary patch delta 94 zcmZp1XmQveE4n#aRELq9gQ1k6m?4oNogsCyfv6~VHv*abg4CW_AfN JR$^6i0|19GAX@+c delta 84 zcmZp1XmQveD=J*fP{NSNkOPE?KvpJ0K11GQ15r^%+07iHj*Oct#TpnlvrCAv5>{Bt F2mo7x6R`jQ diff --git a/docs/command-line.md b/docs/command-line.md deleted file mode 100644 index f047f214..00000000 --- a/docs/command-line.md +++ /dev/null @@ -1,294 +0,0 @@ -# Command line options - -## Executing a pipeline - -You can execute a pipeline by the following command: - -```shell -magnus execute -``` - ---- -!!! Note - - For the above command to work, make sure you are in the environment where magnus was installed. - - If you are using poetry, you can also invoke magnus by ```poetry run magnus execute``` ---- - -The complete options available are: - -``` -Usage: magnus execute [OPTIONS] - - Entry point to executing a pipeline. This command is most commonly used - either to execute a pipeline or to translate the pipeline definition to - another language. - - You can re-run an older run by providing the run_id of the older run in - --use-cached. Ensure that the catalogs and run logs are accessible by the - present configuration. - -Options: - -f, --file TEXT The pipeline definition file [default: - pipeline.yaml] - -c, --config-file TEXT config file, in yaml, to be used for the run - -p, --parameters-file TEXT Parameters, in yaml, accessible by the - application - --log-level [INFO|DEBUG|WARNING|ERROR|FATAL] - The log level [default: WARNING] - --tag TEXT A tag attached to the run - --run-id TEXT An optional run_id, one would be generated - if not provided - --use-cached TEXT Provide the previous run_id to re-run. - --help Show this message and exit. -``` - -### Dag definition/config - -The file containing the dag definition and the config to be used. - -Provided by ```-f```, ```--file``` option on magnus cli. - -Defaults to ```pipeline.yaml``` if nothing is provided. - - -### Configurations file - -The yaml file containing the configurations used to run magnus. The configurations provided here would over-ride any -configuration variables. 
- -Provided by ```-c```, ```--config-file``` option on magnus cli. - -Defaults to None, if nothing is provided. -Read more about different ways you can configure magnus runs here. - - - -### Parameters file - -The yaml file containing the initial set of parameters that the application can access. Individual steps of the -pipeline can still add/update parameters as required. - -Provided by ```-p```, ```--parameters-file``` option to magnus cli. - -Defaults to None, if nothing is provided. -You can also pass parameters by environmental variables prefixed by ```MAGNUS_PRM_``` - -### Log level - -To control the logging level of magnus only. This does not affect your application logs what so ever. - -Provided by ```--log-level``` option on magnus cli. - -Available options are: DEBUG, INFO, WARNING, ERROR, CRITICAL. - -Defaults to INFO if nothing is provided. - -### Tag - -A friendly way to tag experiments or runs together. - -Provided by ```--tag``` option on magnus cli. - -Defaults to None if nothing is provided. - -### Run id - -An unique run identifier for the run. - -Provided by ```--run-id``` on magnus cli. - -We generate one based on Timestamp if one is not provided. - - -### Use cached - -Enables you to re-run a previous run provided by the run-id. - -Example: - -```shell -magnus execute --file example.yaml --run-id 20210506051758 --use-cached old_run_id -``` - -## Executing a Jupyter notebook - -This method could be used to run a Jupyter notebook in any environment. - -The complete options are: - -``` - -Usage: magnus execute_notebook [OPTIONS] FILENAME - - Entry point to execute a Jupyter notebook in isolation. - - The notebook would be executed in the environment defined by the config file or default if none. - -Options: - -c, --config-file TEXT config file, in yaml, to be used for the run - -p, --parameters-file TEXT Parameters, in yaml, accessible by the - application - --log-level [INFO|DEBUG|WARNING|ERROR|FATAL] - The log level [default: WARNING] - -d, --data-folder TEXT The catalog data folder - -put, --put-in-catalog TEXT The data to put from the catalog - --tag TEXT A tag attached to the run - --run-id TEXT An optional run_id, one would be generated - if not provided - --help Show this message and exit. - -``` - -## Executing a python function - -This method could be used to run a python function in any environment. - -The complete options are: - -``` -Usage: magnus execute_function [OPTIONS] COMMAND - - Entry point to execute a python function in isolation. - - The function would be executed in the environment defined by the config file - or default if none. - -Options: - -c, --config-file TEXT config file, in yaml, to be used for the run - -p, --parameters-file TEXT Parameters, in yaml, accessible by the - application - --log-level [INFO|DEBUG|WARNING|ERROR|FATAL] - The log level [default: WARNING] - -d, --data-folder TEXT The catalog data folder - -put, --put-in-catalog TEXT The data to put from the catalog - --tag TEXT A tag attached to the run - --run-id TEXT An optional run_id, one would be generated - if not provided - --help Show this message and exit. -``` - -## Executing a single step - -This method could be used to run a single step in isolation. - -The complete options are: - -``` -Usage: magnus execute_step [OPTIONS] STEP_NAME - - Entry point to executing a single step of the pipeline. - - This command is helpful to run only one step of the pipeline in isolation. - Only the steps of the parent dag could be invoked using this method. 
- - You can re-run an older run by providing the run_id of the older run in - --use-cached. Ensure that the catalogs and run logs are accessible by the - present configuration. - - When running map states, ensure that the parameter to iterate on is - available in parameter space. - -Options: - -f, --file TEXT The pipeline definition file [default: - pipeline.yaml] - -c, --config-file TEXT config file, in yaml, to be used for the run - -p, --parameters-file TEXT Parameters, in yaml, accessible by the - application - --log-level [INFO|DEBUG|WARNING|ERROR|FATAL] - The log level [default: WARNING] - --tag TEXT A tag attached to the run - --run-id TEXT An optional run_id, one would be generated - if not provided - --use-cached TEXT Provide the previous run_id to re-run. - --help Show this message and exit.``` - -``` - -The options have the same meaning as executing a pipeline. - -**Design thought:** This method could be handy to debug a single node of the pipeline or run a single step of the pipeline -in other environments by changing the config. - - -## Building docker images - -This method is a utility tool to assist in building docker images. - -It is preferred that you have a docker file that you can provide to the utility tool using the ```-f/--docker-file``` -option. We can auto-generate a opinionated dockerfile but it is unlikely to fit your needs perfectly. - -For the auto-generation of the dockerfile: - -- You can provide the style of dependency management. Currently, poetry, pipenv are supported. Any other would revert -to using requirements.txt dependency style. -- The base image is python 3.7 -- By default, we add only git tracked contents into the ```app``` folder of the image. But you can over-ride it -with ```--all``` option to add all content to the image. - -Please be aware that using ```--all``` might add sensitive data into the docker image. - -The options available are: - -``` -Usage: magnus build_docker [OPTIONS] IMAGE_NAME - - A utility function to create docker images from the existing codebase. - - It is advised to provide your own dockerfile as much as possible. If you do - not have one handy, you can use --dry-run functionality to see if the auto- - generated one suits your needs. - - If you are auto-generating the dockerfile: BEWARE!! Over-riding the default - options assumes you know what you are doing! BEWARE!! - - 1). By default, only git tracked files are added to the docker image. - - 2). The auto-generated dockerfile uses, python 3.7 as the default image and - adds the current folder. - -Options: - -f, --docker-file TEXT The dockerfile to be used. If None, we generate one - -s, --style TEXT The method used to get requirements [default: - poetry] - -t, --tag TEXT The tag assigned to the image [default: latest] - -c, --commit-tag Use commit id as tag. Over-rides tag option - [default: False] - -d, --dry-run Generate the dockerfile, but NOT the image - [default: False] - --git-tracked / --all Controls what should be added to image. All vs git- - tracked [default: git-tracked] - --help Show this message and exit. -``` - - - -## Extensions - -Magnus internally uses click to perform CLI operations and base command is given below. - -```python -@with_plugins(iter_entry_points('magnus.cli_plugins')) -@click.group() -@click.version_option() -def cli(): - """ - Welcome to magnus. Please provide the command that you want to use. 
All commands have options that you can see by magnus --help - """ - pass - -``` - -You can provide custom extensions to the command line capabilities by extending the namespace ```magnus.cli_plugins``` - -```toml -# For example, as part of your pyproject.toml -[tool.poetry.plugins."magnus.cli_plugins"] -"aws-ecr = "YOUR_PACKAGE:push_to_ecr" -``` - -This extension than can be used as - -```magnus aws-ecr ``` diff --git a/docs/concepts/catalog.md b/docs/concepts/catalog.md index 66187178..a7ae7c98 100644 --- a/docs/concepts/catalog.md +++ b/docs/concepts/catalog.md @@ -18,10 +18,10 @@ For example, a local directory structure partitioned by a ```run_id``` or S3 buc The directory structure within a partition is the same as the project directory structure. This enables you to get/put data in the catalog as if you are working with local directory structure. Every interaction with the catalog -(either by API or configuration) results in an entry in the [```run log```](../run-log/#step_log) +(either by API or configuration) results in an entry in the [```run log```](/concepts/run-log/#step_log) Internally, magnus also uses the catalog to store execution logs of tasks i.e stdout and stderr from -[python](../task/#python) or [shell](../task/#shell) and executed notebook from [notebook tasks](../task/#notebook). +[python](/concepts/task/#python) or [shell](/concepts/task/#shell) and executed notebook from [notebook tasks](/concepts/task/#notebook). Since the catalog captures the data files flowing through the pipeline and the execution logs, it enables you to debug failed pipelines or keep track of data lineage. @@ -448,11 +448,11 @@ The execution results in the ```catalog``` populated with the artifacts and the ## Using python API -Files could also be cataloged using [python API](../../interactions) +Files could also be cataloged using [python API](/interactions) -This functionality is possible in [python](../task/#python_functions) -and [notebook](../task/#notebook) tasks. +This functionality is possible in [python](/concepts/task/#python_functions) +and [notebook](/concepts/task/#notebook) tasks. ```python linenums="1" hl_lines="11 23 35 45" --8<-- "examples/concepts/catalog_api.py" ``` ## Passing Data Objects -Data objects can be shared between [python](../task/#python_functions) or [notebook](../task/#notebook) tasks, +Data objects can be shared between [python](/concepts/task/#python_functions) or [notebook](/concepts/task/#notebook) tasks, instead of serializing data and deserializing to file structure, using -[get_object](../../interactions/#magnus.get_object) and [put_object](../../interactions/#magnus.put_object). +[get_object](/interactions/#magnus.get_object) and [put_object](/interactions/#magnus.put_object). -Internally, we use [pickle](https://docs.python.org/3/library/pickle.html) to serialize and +Internally, we use [pickle](https://docs.python.org/3/library/pickle.html) to serialize and deserialize python objects. Please ensure that the object can be serialized via pickle. ### Example diff --git a/docs/concepts/executor.md b/docs/concepts/executor.md index 4766f2d7..6e024d56 100644 --- a/docs/concepts/executor.md +++ b/docs/concepts/executor.md @@ -1,6 +1,6 @@ Executors are the heart of magnus, they traverse the workflow and execute the tasks within the workflow while coordinating with different services -(eg. [run log](../run-log), [catalog](../catalog), [secrets](../secrets) etc) +(eg. 
[run log](/concepts/run-log), [catalog](/concepts/catalog), [secrets](/concepts/secrets) etc) To enable workflows run in varied computational environments, we distinguish between two core functions of any workflow engine. @@ -61,7 +61,7 @@ translated to argo specification just by changing the configuration. In this configuration, we are using [argo workflows](https://argoproj.github.io/argo-workflows/) as our workflow engine. We are also instructing the workflow engine to use a docker image, ```magnus:demo``` defined in line #4, as our execution environment. Please read - [containerised environments](../../configurations/container-environments) for more information. + [containerised environments](/configurations/executors/container-environments) for more information. Since magnus needs to track the execution status of the workflow, we are using a ```run log``` which is persistent and available in for jobs in kubernetes environment. @@ -195,7 +195,7 @@ translated to argo specification just by changing the configuration. ``` -As seen from the above example, once a [pipeline is defined in magnus](../pipeline) either via yaml or SDK, we can +As seen from the above example, once a [pipeline is defined in magnus](/concepts/pipeline) either via yaml or SDK, we can run the pipeline in different environments just by providing a different configuration. Most often, there is no need to change the code or deviate from standard best practices while coding. @@ -287,22 +287,22 @@ def execute_single_node(workflow, step_name, configuration): ##### END POST EXECUTION ##### ``` -1. The [run log](../run-log) maintains the state of the execution of the tasks and subsequently the pipeline. It also +1. The [run log](/concepts/run-log) maintains the state of the execution of the tasks and subsequently the pipeline. It also holds the latest state of parameters along with captured metrics. -2. The [catalog](../catalog) contains the information about the data flowing through the pipeline. You can get/put +2. The [catalog](/concepts/catalog) contains the information about the data flowing through the pipeline. You can get/put artifacts generated during the current execution of the pipeline to a central storage. -3. Read the workflow and get the [step definition](../task) which holds the ```command``` or ```function``` to +3. Read the workflow and get the [step definition](/concepts/task) which holds the ```command``` or ```function``` to execute along with the other optional information. 4. Any artifacts from previous steps that are needed to execute the current step can be -[retrieved from the catalog](../catalog). +[retrieved from the catalog](/concepts/catalog). 5. The current function or step might need only some of the -[parameters casted as pydantic models](../task/#accessing_parameters), filter and cast them appropriately. +[parameters casted as pydantic models](/concepts/task/#accessing_parameters), filter and cast them appropriately. 6. At this point in time, we have the required parameters and data to execute the actual command. The command can -internally request for more data using the [python API](../interactions) or record -[experiment tracking metrics](../experiment-tracking). +internally request for more data using the [python API](/interactions) or record +[experiment tracking metrics](/concepts/experiment-tracking). 7. If the task failed, we update the run log with that information and also raise an exception for the -workflow engine to handle. 
Any [on-failure](../pipeline/#on_failure) traversals are already handled +workflow engine to handle. Any [on-failure](/concepts/pipeline/#on_failure) traversals are already handled as part of the workflow definition. 8. Upon successful execution, we update the run log with current state of parameters for downstream steps. -9. Any artifacts generated from this step are [put into the central storage](../catalog) for downstream steps. +9. Any artifacts generated from this step are [put into the central storage](/concepts/catalog) for downstream steps. 10. We send a success message to the workflow engine and mark the step as completed. diff --git a/docs/concepts/experiment-tracking.md b/docs/concepts/experiment-tracking.md index a3d6052b..a3b4863f 100644 --- a/docs/concepts/experiment-tracking.md +++ b/docs/concepts/experiment-tracking.md @@ -1,6 +1,6 @@ # Overview -[Run log](../run-log) stores a lot of information about the execution along with the metrics captured +[Run log](/concepts/run-log) stores a lot of information about the execution along with the metrics captured during the execution of the pipeline. @@ -9,7 +9,7 @@ during the execution of the pipeline. === "Using the API" - The highlighted lines in the below example show how to [use the API](../../interactions/#magnus.track_this) + The highlighted lines in the below example show how to [use the API](/interactions/#magnus.track_this) Any pydantic model as a value would be dumped as a dict, respecting the alias, before tracking it. @@ -207,7 +207,7 @@ The step is defaulted to be 0. === "Using the API" - The highlighted lines in the below example show how to [use the API](../../interactions/#magnus.track_this) with + The highlighted lines in the below example show how to [use the API](/interactions/#magnus.track_this) with the step parameter. You can run this example by ```python run examples/concepts/experiment_tracking_step.py``` @@ -452,17 +452,17 @@ Since mlflow does not support step wise logging of parameters, the key name is f === "In mlflow UI"
- ![Image](../assets/screenshots/mlflow.png){ width="800" height="600"} + ![Image](/assets/screenshots/mlflow.png){ width="800" height="600"}
mlflow UI for the execution. The run_id remains the same as the run_id of magnus
- ![Image title](../assets/screenshots/mlflow_step.png){ width="800" height="600"} + ![Image title](/assets/screenshots/mlflow_step.png){ width="800" height="600"}
The step wise metric plotted as a graph in mlflow
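The step-wise metrics shown above can be emitted from task code in a few lines; a minimal sketch, assuming the ```track_this``` API referenced earlier on this page (the metric name and values are illustrative):

```python
# A sketch of step-wise experiment tracking, assuming the track_this
# API referenced in this page; "loss" is an illustrative metric name.
from magnus import track_this

def train():
    for epoch in range(3):
        loss = 1.0 / (epoch + 1)  # placeholder computation
        track_this(step=epoch, loss=loss)  # mlflow renders these as a graph
```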
To provide implementation specific capabilities, we also provide a -[python API](../../interactions/#magnus.get_experiment_tracker_context) to obtain the client context. The default +[python API](/interactions/#magnus.get_experiment_tracker_context) to obtain the client context. The default client context is a [null context manager](https://docs.python.org/3/library/contextlib.html#contextlib.nullcontext). diff --git a/docs/concepts/map.md b/docs/concepts/map.md index 77fc2768..0a20b111 100644 --- a/docs/concepts/map.md +++ b/docs/concepts/map.md @@ -95,7 +95,7 @@ of the files to process. You can run this example by ```python examples/concepts/map.py``` - ```python linenums="1" hl_lines="21 53-60" + ```python linenums="1" hl_lines="30-31 35 68-74" --8<-- "examples/concepts/map.py" ``` @@ -110,7 +110,7 @@ of the files to process. You can run this example by ```magnus execute examples/concepts/map.yaml``` - ```yaml linenums="1" hl_lines="22-23 25-36" + ```yaml linenums="1" hl_lines="23-26" --8<-- "examples/concepts/map.yaml" ``` @@ -126,7 +126,7 @@ of the files to process. You can run this example by ```magnus execute examples/concepts/map_shell.yaml``` - ```yaml linenums="1" hl_lines="23-24 38-40" + ```yaml linenums="1" hl_lines="26-27 29-32" --8<-- "examples/concepts/map_shell.yaml" ``` @@ -829,7 +829,7 @@ of the files to process. ## Traversal A branch of a map step is considered success only if the ```success``` step is reached at the end. -The steps of the pipeline can fail and be handled by [on failure](../concepts/ppiline/on_failure) and +The steps of the pipeline can fail and be handled by [on failure](/concepts/pipeline/#on_failure) and redirected to ```success``` if that is the desired behavior. The map step is considered successful only if all the branches of the step have terminated successfully. @@ -838,7 +838,7 @@ The map step is considered successful only if all the branches of the step have ## Parameters All the tasks defined in the branches of the map pipeline can -[access to parameters and data as usual](../task). +[access to parameters and data as usual](/concepts/task). !!! warning diff --git a/docs/concepts/nesting.md b/docs/concepts/nesting.md index afc555d7..cdd3874c 100644 --- a/docs/concepts/nesting.md +++ b/docs/concepts/nesting.md @@ -1,4 +1,4 @@ -As seen from the definitions of [parallel](../parallel) or [map](../map), the branches are pipelines +As seen from the definitions of [parallel](/concepts/parallel) or [map](/concepts/map), the branches are pipelines themselves. This allows for deeply nested workflows in **magnus**. Technically there is no limit in the depth of nesting but there are some practical considerations. diff --git a/docs/concepts/parallel.md b/docs/concepts/parallel.md index 6638a04b..4c1e4ea9 100644 --- a/docs/concepts/parallel.md +++ b/docs/concepts/parallel.md @@ -7,7 +7,7 @@ Parallel nodes in magnus allows you to run multiple pipelines in parallel and us All the steps in the below example are ```stubbed``` for convenience. The functionality is similar even if the steps are execution units like ```tasks``` or any other nodes. - We support deeply [nested steps](../nesting). For example, a step in the parallel branch can be a ```map``` which internally + We support deeply [nested steps](/concepts/nesting). For example, a step in the parallel branch can be a ```map``` which internally loops over a ```dag``` and so on. Though this functionality is useful, it can be difficult to debug and understand in large code bases. 
@@ -549,7 +549,7 @@ ensemble model happens only after both models are (successfully) trained. All pipelines, nested or parent, have the same structure as defined in -[pipeline definition](../pipeline). +[pipeline definition](/concepts/pipeline). The parent pipeline defines a step ```Train models``` which is a parallel step. The branches, XGBoost and RF model, are pipelines themselves. @@ -557,7 +557,7 @@ The branches, XGBoost and RF model, are pipelines themselves. ## Traversal A branch of a parallel step is considered success only if the ```success``` step is reached at the end. -The steps of the pipeline can fail and be handled by [on failure](../concepts/ppiline/on_failure) and +The steps of the pipeline can fail and be handled by [on failure](/concepts/pipeline/#on_failure) and redirected to ```success``` if that is the desired behavior. The parallel step is considered successful only if all the branches of the step have terminated successfully. @@ -566,7 +566,7 @@ The parallel step is considered successful only if all the branches of the step ## Parameters All the tasks defined in the branches of the parallel pipeline can -[access to parameters and data as usual](../task). +[access to parameters and data as usual](/concepts/task). !!! warning diff --git a/docs/concepts/parameters.md b/docs/concepts/parameters.md index 77b9f99d..d6967bc9 100644 --- a/docs/concepts/parameters.md +++ b/docs/concepts/parameters.md @@ -1,16 +1,16 @@ In magnus, ```parameters``` are python data types that can be passed from one ```task``` to the next ```task```. These parameters can be accessed by the ```task``` either as environment variables, arguments of the ```python function``` or using the -[API](../../interactions). +[API](/interactions). ## Initial parameters The initial parameters of the pipeline can set by using a ```yaml``` file and presented during execution -```--parameters-file, -parameters``` while using the [magnus CLI](../../usage/#usage) +```--parameters-file, -parameters``` while using the [magnus CLI](/usage/#usage) -or by using ```parameters_file``` with [the sdk](../../sdk/#magnus.Pipeline.execute). +or by using ```parameters_file``` with [the sdk](/sdk/#magnus.Pipeline.execute). They can also be set using environment variables which override the parameters defined by the file. @@ -42,5 +42,5 @@ They can also be set using environment variables which override the parameters d ## Parameters flow Tasks can access and return parameters and the patterns are specific to the -```command_type``` of the task nodes. Please refer to [tasks](../task) +```command_type``` of the task nodes. Please refer to [tasks](/concepts/task) for more information. diff --git a/docs/concepts/run-log.md b/docs/concepts/run-log.md index 5732e088..3bb41405 100644 --- a/docs/concepts/run-log.md +++ b/docs/concepts/run-log.md @@ -10,7 +10,7 @@ when running the ```command``` of a task. === "pipeline" - This is the same example [described in tasks](../task/#shell). + This is the same example [described in tasks](/concepts/task/#shell). tl;dr a pipeline that consumes some initial parameters and passes them to the next step. Both the steps are ```shell``` based tasks. @@ -389,7 +389,7 @@ A snippet from the above example: - For non-nested steps, the key is the name of the step. For example, the first entry in the steps mapping is "access initial" which corresponds to the name of the task in the pipeline. 
For nested steps, the step log is also nested and shown in more detail for - [parallel](../parallel), [map](../map) and [dag](../dag). + [parallel](/concepts/parallel), [map](/concepts/map). - ```status```: In line #5 is the status of the step with three possible states, ```SUCCESS```, ```PROCESSING``` or ```FAILED``` @@ -426,12 +426,12 @@ end time, duration of the execution and the parameters at the time of execution } ``` -- ```user_defined_metrics```: are any [experiment tracking metrics](../task/#experiment_tracking) +- ```user_defined_metrics```: are any [experiment tracking metrics](/concepts/task/#experiment_tracking) captured during the execution of the step. - ```branches```: This only applies to parallel, map or dag steps and shows the logs captured during the execution of the branch. -- ```data_catalog```: Captures any data flowing through the tasks by the [catalog](../catalog). +- ```data_catalog```: Captures any data flowing through the tasks by the [catalog](/concepts/catalog). By default, the execution logs of the task are put in the catalog for easier debugging purposes. For example, the below lines from the snippet specifies one entry into the catalog which is the execution log @@ -463,7 +463,7 @@ reproduced in local environments and fixed. - non-nested, linear pipelines - non-chunked run log store - [mocked executor](../../configurations/executors/mocked) provides better support in debugging failures. + [mocked executor](/configurations/executors/mocked) provides better support in debugging failures. ### Example @@ -1237,10 +1237,10 @@ reproduced in local environments and fixed. ## API Tasks can access the ```run log``` during the execution of the step -[using the API](../../interactions/#magnus.get_run_log). The run log returned by this method is a deep copy +[using the API](/interactions/#magnus.get_run_log). The run log returned by this method is a deep copy to prevent any modifications. Tasks can also access the ```run_id``` of the current execution either by -[using the API](../../interactions/#magnus.get_run_id) or by the environment +[using the API](/interactions/#magnus.get_run_id) or by the environment variable ```MAGNUS_RUN_ID```. diff --git a/docs/concepts/secrets.md b/docs/concepts/secrets.md index 601f7212..2a31d25e 100644 --- a/docs/concepts/secrets.md +++ b/docs/concepts/secrets.md @@ -11,7 +11,7 @@ Most complex pipelines require secrets to hold sensitive information during task They could be database credentials, API keys or any information that need to present at the run-time but invisible at all other times. -Magnus provides a [clean API](../../interactions/#magnus.get_secret) to access secrets +Magnus provides a [clean API](/interactions/#magnus.get_secret) to access secrets and independent of the actual secret provider, the interface remains the same. A typical example would be a task requiring the database connection string to connect @@ -29,7 +29,7 @@ class CustomObject: # Do something with the secrets ``` -Please refer to [configurations](../../configurations/secrets) for available implementations. +Please refer to [configurations](/configurations/secrets) for available implementations. ## Example diff --git a/docs/concepts/task.md b/docs/concepts/task.md index 4ffda925..6d198bd6 100644 --- a/docs/concepts/task.md +++ b/docs/concepts/task.md @@ -111,7 +111,7 @@ is to execute this function. 
-Please refer to [Initial Parameters](../parameters/#initial_parameters) for more information about setting +Please refer to [Initial Parameters](/concepts/parameters/#initial_parameters) for more information about setting initial parameters. Lets assume that the initial parameters are: @@ -177,14 +177,14 @@ Lets assume that the initial parameters are: === "Using the API" - Magnus also has [python API](../../interactions) to access parameters. + Magnus also has [python API](/interactions) to access parameters. - Use [get_parameter](../../interactions/get_parameter) to access a parameter at the root level. + Use [get_parameter](/interactions/#magnus.get_parameter) to access a parameter at the root level. You can optionally specify the ```type``` by using ```cast_as``` argument to the API. For example, line 19 would cast ```eggs```parameter into ```EggsModel```. Native python types do not need any explicit ```cast_as``` argument. - Use [set_parameter](../../interactions/set_parameter) to set parameters at the root level. + Use [set_parameter](/interactions/#magnus.set_parameter) to set parameters at the root level. Multiple parameters can be set at the same time, for example, line 26 would set both the ```spam``` and ```eggs``` in a single call. @@ -234,7 +234,7 @@ Lets assume that the initial parameters are: ### Passing data and execution logs -Please refer to [catalog](../catalog) for more details and examples on passing +Please refer to [catalog](/concepts/catalog) for more details and examples on passing data between tasks and the storage of execution logs. --- @@ -261,7 +261,7 @@ The output notebook is also saved in the ```catalog``` for logging and ease of d
- ![Image title](../assets/screenshots/simple_notebook.png){ width="800" height="600"} + ![Image title](/assets/screenshots/simple_notebook.png){ width="800" height="600"}
@@ -290,8 +290,8 @@ the current project are readily available. - ```notebook_output_path```: the location of the executed notebook. Defaults to the notebook name defined in ```command``` with ```_out``` post-fixed. The location should be relative to the project root and also would be stored in catalog in the same location. -- [next](../pipeline/#linking): is required for any step of the pipeline except for success and fail steps. -- [on_failure](../pipeline/#on_failure): Name of the step to execute if the step fails. +- [next](/concepts/pipeline/#linking): is required for any step of the pipeline except for success and fail steps. +- [on_failure](/concepts/pipeline/#on_failure): Name of the step to execute if the step fails. - catalog: Optional required for data access patterns from/to the central storage. ### ploomber arguments @@ -315,7 +315,7 @@ You can set additional arguments or override these by sending an optional dictio ### Accessing parameters -Please refer to [Initial Parameters](../parameters/#initial_parameters) for more information about setting +Please refer to [Initial Parameters](/concepts/parameters/#initial_parameters) for more information about setting initial parameters. Assume that the initial parameters are: @@ -343,7 +343,7 @@ Assume that the initial parameters are: === "Notebook"
- ![Image title](../assets/screenshots/notebook_native_parameters.png){ width="800" height="600"} + ![Image title](/assets/screenshots/notebook_native_parameters.png){ width="800" height="600"}
@@ -359,7 +359,7 @@ Assume that the initial parameters are: For example, the initial parameters will be passed to the notebook as shown below.
- ![Image title](../assets/screenshots/notebook_input_parameters.png){ width="800" height="600"} + ![Image title](/assets/screenshots/notebook_input_parameters.png){ width="800" height="600"}
@@ -378,7 +378,7 @@ Assume that the initial parameters are:
- ![Image title](../assets/screenshots/notebook_output_parameters.png){ width="800" height="600"} + ![Image title](/assets/screenshots/notebook_output_parameters.png){ width="800" height="600"}
@@ -405,7 +405,7 @@ Assume that the initial parameters are:
- ![Image title](../assets/screenshots/notebook_api_parameters.png){ width="800" height="600"} + ![Image title](/assets/screenshots/notebook_api_parameters.png){ width="800" height="600"}
@@ -441,7 +441,7 @@ Assume that the initial parameters are: ### Passing data and execution logs -Please refer to [catalog](../catalog) for more details and examples on passing +Please refer to [catalog](/concepts/catalog) for more details and examples on passing data between tasks and the storage of execution logs. @@ -472,7 +472,7 @@ to execute the command. ### Accessing parameters -Please refer to [Initial Parameters](../parameters/#initial_parameters) for more information about setting +Please refer to [Initial Parameters](/concepts/parameters/#initial_parameters) for more information about setting initial parameters. Assuming the initial parameters are: @@ -513,10 +513,10 @@ lines 33-35. ### Passing data and execution logs -Please refer to [catalog](../catalog) for more details and examples on passing +Please refer to [catalog](/concepts/catalog) for more details and examples on passing data between tasks and the storage of execution logs. ## Experiment tracking -Please refer to [experiment tracking](../experiment-tracking) for more details and examples on experiment tracking. +Please refer to [experiment tracking](/concepts/experiment-tracking) for more details and examples on experiment tracking. diff --git a/docs/concepts/the-big-picture.md b/docs/concepts/the-big-picture.md index dbbf2f90..62bd9208 100644 --- a/docs/concepts/the-big-picture.md +++ b/docs/concepts/the-big-picture.md @@ -2,7 +2,7 @@ Magnus revolves around the concept of pipelines or workflows and tasks that happ --- -A [workflow](../pipeline) is simply a series of steps that you want to execute for a desired outcome. +A [workflow](/concepts/pipeline) is simply a series of steps that you want to execute for a desired outcome. ``` mermaid %%{ init: { 'flowchart': { 'curve': 'linear' } } }%% @@ -11,7 +11,7 @@ flowchart LR step1:::green step1([Step 1]) --> step2:::green step2([Step 2]) --> step3:::green - step3([Step ...]) --> step4:::green + step3([Step .. ]) --> step4:::green step4([Step n]) --> suc([success]):::green classDef green stroke:#0f0 @@ -20,16 +20,16 @@ flowchart LR To define a workflow, we need: -- [List of steps](../pipeline/#steps) -- [starting step](../pipeline/#start_at) +- [List of steps](/concepts/pipeline/#steps) +- [starting step](/concepts/pipeline/#start_at) - Next step - - [In case of success](../pipeline/#linking) - - [In case of failure](../pipeline/#on_failure) + - [In case of success](/concepts/pipeline/#linking) + - [In case of failure](/concepts/pipeline/#on_failure) -- [Terminating](../pipeline/terminating) +- [Terminating](/concepts/pipeline/#terminating) -The workflow can be defined either in ```yaml``` or using the [```python sdk```](../../sdk). +The workflow can be defined either in ```yaml``` or using the [```python sdk```](/sdk). --- @@ -40,18 +40,18 @@ A step in the workflow can be: A step in the workflow that does a logical unit work. - The unit of work can be a [python function](../task/#python_functions), - a [shell script](../task/#shell) or a - [notebook](../task/#notebook). + The unit of work can be a [python function](/concepts/task/#python_functions), + a [shell script](/concepts/task/#shell) or a + [notebook](/concepts/task/#notebook). All the logs, i.e stderr and stdout or executed notebooks are stored - in [catalog](../catalog) for easier access and debugging. + in [catalog](/concepts/catalog) for easier access and debugging. === "stub" - An [abstract step](../stub) that is not yet fully implemented. + An [abstract step](/concepts/stub) that is not yet fully implemented. 
For example in python: @@ -63,7 +63,7 @@ A step in the workflow can be: === "parallel" - A step that has a defined number of [parallel workflows](../parallel) executing + A step that has a defined number of [parallel workflows](/concepts/parallel) executing simultaneously. In the below visualisation, the green lined steps happen in sequence and wait for the previous step to @@ -108,7 +108,7 @@ A step in the workflow can be: === "map" - A step that executes a workflow over an [iterable parameter](../map). + A step that executes a workflow over an [iterable parameter](/concepts/map). The step "chunk files" identifies the number of files to process and computes the start index of every batch of files to process for a chunk size of 10, the stride. @@ -172,19 +172,19 @@ A step in the workflow can be: --- -A [step type of task](../task) is the functional unit of the pipeline. +A [step type of task](/concepts/task) is the functional unit of the pipeline. To be useful, it can: - Access parameters - - Either [defined statically](../parameters/#initial_parameters) at the start of the + - Either [defined statically](/concepts/parameters/#initial_parameters) at the start of the pipeline - - Or by [upstream steps](../parameters/#parameters_flow) + - Or by [upstream steps](/concepts/parameters/#parameters_flow) -- [Publish or retrieve artifacts](../catalog) from/to other steps. -- [Publish metrics](../experiment-tracking) that are interesting. -- Have [access to secrets](../secrets). +- [Publish or retrieve artifacts](/concepts/catalog) from/to other steps. +- [Publish metrics](/concepts/experiment-tracking) that are interesting. +- Have [access to secrets](/concepts/secrets). All the above functionality is possible either via: @@ -193,15 +193,15 @@ All the above functionality is possible either via: - Application native way. - Or via environment variables. -- Or via the [python API](../interactions) which involves ```importing magnus``` in your code. +- Or via the [python API](/interactions) which involves ```importing magnus``` in your code. --- All executions of the pipeline should be: -- [Reproducible](../run-log) for audit and data lineage purposes. +- [Reproducible](/concepts/run-log) for audit and data lineage purposes. - Runnable at local environments for -[debugging failed runs](../run-log/#retrying_failures). +[debugging failed runs](/concepts/run-log/#retrying_failures). --- @@ -212,7 +212,7 @@ Pipelines should be portable between different infrastructure patterns. Infrastructure patterns change all the time and so are the demands from the infrastructure. -We achieve this by [changing configurations](../../configurations/overview), rather than +We achieve this by [changing configurations](/configurations/overview), rather than changing the application code. For example a pipeline should be able to run: diff --git a/docs/configurations/catalog.md b/docs/configurations/catalog.md index 52781ddb..a12d5609 100644 --- a/docs/configurations/catalog.md +++ b/docs/configurations/catalog.md @@ -1,5 +1,5 @@ Catalog provides a way to store and retrieve data generated by the individual steps of the dag to downstream -steps of the dag. Please refer to [concepts](../../concepts/catalog) for more detailed information. +steps of the dag. Please refer to [concepts](/concepts/catalog) for more detailed information. 
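The object-passing flavour of the catalog, described in the concepts page, looks roughly as below; a sketch assuming the ```get_object```/```put_object``` API referenced there, with illustrative names:

```python
# A sketch of sharing a python object between two tasks via the catalog,
# assuming the get_object/put_object API referenced in the concepts page.
# The object must be picklable; "model" is an illustrative name.
from magnus import get_object, put_object

def train():
    model = {"weights": [0.1, 0.2]}  # stand-in for any picklable object
    put_object(model, name="model")

def predict():
    model = get_object(name="model")
    print(model["weights"])
```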
## do-nothing diff --git a/docs/configurations/executors/argo.md b/docs/configurations/executors/argo.md index 08d3401b..bb732e31 100644 --- a/docs/configurations/executors/argo.md +++ b/docs/configurations/executors/argo.md @@ -16,7 +16,7 @@ to get inputs from infrastructure teams or ML engineers in defining the configur ## Configuration Only ```image``` is the required parameter. Please refer to the -[note on containers](../container-environments) on building images. +[note on containers](/configurations/executors/container-environments) on building images. ```yaml linenums="1" @@ -333,7 +333,7 @@ as inputs to the workflow. This allows for changing the parameters at runtime. === "Run Submission"
- ![Image](../../assets/screenshots/argo-expose-parameters.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-expose-parameters.png){ width="800" height="600"}
argo workflows UI exposing the parameters
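Tasks read these runtime-supplied values exactly as they would any other parameter; a minimal sketch, assuming the ```get_parameter``` API referenced in the concepts pages ("spam" is an illustrative parameter name):

```python
# A sketch of reading a parameter overridden at submission time,
# assuming the get_parameter API referenced in the concepts pages.
from magnus import get_parameter

def access():
    spam = get_parameter("spam")  # native python types need no cast_as
    print(spam)
```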
@@ -411,7 +411,7 @@ The parallelism constraint [only applies to the step](https://github.com/argopro === "Pipeline" - This example is the same as [detailed in map](../../../concepts/map). + This example is the same as [detailed in map](/concepts/map). ```yaml linenums="1" hl_lines="22-23 25-36" --8<-- "examples/concepts/map.yaml" @@ -423,7 +423,7 @@ The parallelism constraint [only applies to the step](https://github.com/argopro tasks execute simultaneously.
- ![Image](../../assets/screenshots/argo-parallel-map.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-parallel-map.png){ width="800" height="600"}
argo workflows UI exposing the parameters
@@ -442,7 +442,7 @@ The parallelism constraint [only applies to the step](https://github.com/argopro === "Pipeline" - The pipeline defined here is nearly the same as [detailed in map](../../../concepts/map) with the + The pipeline defined here is nearly the same as [detailed in map](/concepts/map) with the only exception in lines 25-26 which use the ```sequential``` override. ```yaml linenums="1" hl_lines="22-23 25-36" @@ -456,7 +456,7 @@ The parallelism constraint [only applies to the step](https://github.com/argopro instead of parallel as seen in the default.
- ![Image](../../assets/screenshots/argo-sequential-map.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-sequential-map.png){ width="800" height="600"}
argo workflows UI exposing the parameters
@@ -538,7 +538,7 @@ code versioning tools. We recommend using ```secrets_from_k8s``` in the configur Assumed to be present at ```examples/configs/argo-config.yaml``` - The docker image is a [variable](#dynamic_name_of_the_image) and + The docker image is a [variable](/configurations/executors/container-environments/#dynamic_name_of_the_image) and dynamically set during execution. ```yaml linenums="1" hl_lines="4" @@ -547,7 +547,7 @@ code versioning tools. We recommend using ```secrets_from_k8s``` in the configur 1. Use ```argo``` executor type to execute the pipeline. 2. By default, all the tasks are executed in the docker image . Please - refer to [building docker images](#container_environments) + refer to [building docker images](/configurations/executors/container-environments/) 3. Mount the persistent volume ```magnus-volume``` to all the containers as ```/mnt```. 4. Store the run logs in the file-system. As all containers have access to ```magnus-volume``` as ```/mnt```. We use that to mounted folder as run log store. @@ -556,7 +556,7 @@ code versioning tools. We recommend using ```secrets_from_k8s``` in the configur === "python SDK" Running the SDK defined pipelines for any container based executions [happens in - multi-stage process](#container_environments). + multi-stage process](/configurations/executors/container-environments/). 1. Generate the ```yaml``` definition file by: ```MAGNUS_CONFIGURATION_FILE=examples/configs/argo-config.yaml python examples/concepts/simple.py``` @@ -594,12 +594,12 @@ code versioning tools. We recommend using ```secrets_from_k8s``` in the configur === "Screenshots"
- ![Image](../../assets/screenshots/argo-workflows-gant.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-workflows-gant.png){ width="800" height="600"}
argo workflows UI showing the pipeline
- ![Image](../../assets/screenshots/argo-workflows-logs.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-workflows-logs.png){ width="800" height="600"}
argo workflows UI showing the logs
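The multi-stage flow above (an SDK definition transpiled to an argo workflow via a configuration file) can be sketched as below; this assumes the ```Pipeline```/```Task``` SDK referenced in these docs, and the construction details are illustrative rather than authoritative:

```python
# A sketch of an SDK-defined pipeline, assuming the Pipeline/Task SDK
# referenced in these docs; argument names are illustrative.
from magnus import Pipeline, Task

def simple_function():
    print("hello from the container")

task = Task(name="simple", command="examples.concepts.simple.simple_function")
pipeline = Pipeline(steps=[task], start_at=task, add_terminal_nodes=True)

# With MAGNUS_CONFIGURATION_FILE=examples/configs/argo-config.yaml set,
# execute() emits the argo workflow yaml instead of running locally.
pipeline.execute()
```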
@@ -788,7 +788,7 @@ Magnus compiled argo workflows support deeply nested workflows. === "Nested workflow" - This is the same example as shown in [nested](../../../concepts/nesting). + This is the same example as shown in [nested](/concepts/nesting). ```yaml linenums="1" --8<-- "examples/concepts/nesting.yaml" @@ -799,7 +799,7 @@ Magnus compiled argo workflows support deeply nested workflows. Assumed to be present at ```examples/configs/argo-config.yaml``` - The docker image is a [variable](#dynamic_name_of_the_image) and + The docker image is a [variable](/configurations/executors/container-environments/) and dynamically set during execution. ```yaml linenums="1" hl_lines="4" @@ -1628,7 +1628,7 @@ Magnus compiled argo workflows support deeply nested workflows. === "In argo UI"
- ![Image](../../assets/screenshots/argo-nested.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-nested.png){ width="800" height="600"}
argo workflows UI showing the deeply nested workflows.
@@ -1636,15 +1636,15 @@ Magnus compiled argo workflows support deeply nested workflows. ## Kubeflow Kubeflow pipelines compiles workflows defined in SDK to Argo workflows and thereby -has support for uploading argo workflows. Below is a screenshot of the [map](../../../concepts/map) pipeline uploaded to Kubeflow. +has support for uploading argo workflows. Below is a screenshot of the [map](/concepts/map) pipeline uploaded to Kubeflow.
- ![Image](../../assets/screenshots/argo-kubeflow-ui.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-kubeflow-ui.png){ width="800" height="600"}
argo workflows UI showing the map workflow definition.
- ![Image](../../assets/screenshots/argo-kubeflow-exec.png){ width="800" height="600"} + ![Image](/assets/screenshots/argo-kubeflow-exec.png){ width="800" height="600"}
argo workflows UI showing the map workflow execution.
diff --git a/docs/configurations/executors/container-environments.md b/docs/configurations/executors/container-environments.md index 6daec335..32d18f9b 100644 --- a/docs/configurations/executors/container-environments.md +++ b/docs/configurations/executors/container-environments.md @@ -1,10 +1,10 @@ ## Pipeline definition Executing pipelines in containers needs a ```yaml``` based definition of the pipeline which is -referred during the [task execution](../../../concepts/executor/#step_execution). +referred during the [task execution](/concepts/executor/#step_execution). -Any execution of the pipeline [defined by SDK](../../../sdk) generates the pipeline +Any execution of the pipeline [defined by SDK](/sdk) generates the pipeline definition in```yaml``` format for all executors apart from the [```local``` executor](../local). @@ -18,9 +18,9 @@ Follow the below steps to execute the pipeline defined by SDK. 2. Optionally (but highly recommended) version your code using git. 2. Build the docker image with the ```yaml``` file-based definition as part of the image. We recommend tagging the docker image with the short git sha to uniquely identify the docker image (1). -3. Define a [variable to temporarily hold](#dynamic_name_of_the_image) the docker image name in the +3. Define a [variable to temporarily hold](https://docs.python.org/3/library/string.html#template-strings) the docker image name in the pipeline definition, if the docker image name is not known. -4. Execute the pipeline using the [magnus CLI](../../../cli). +4. Execute the pipeline using the [magnus CLI](/usage/#usage). diff --git a/docs/configurations/executors/local-container.md b/docs/configurations/executors/local-container.md index fb4165b7..b40ccaf5 100644 --- a/docs/configurations/executors/local-container.md +++ b/docs/configurations/executors/local-container.md @@ -1,6 +1,6 @@ Execute all the steps of the pipeline in containers. Please refer to the -[note on containers](../container-environments) on building images. +[note on containers](/configurations/executors/container-environments/) on building images. - [x] Provides a way to test the containers and the execution of the pipeline in local environment. - [x] Any failure in cloud native container environments can be replicated in local environments. @@ -33,14 +33,14 @@ config: ``` 1. By default, all tasks are sequentially executed. Provide ```true``` to enable tasks within -[parallel](../../concepts/parallel) or [map](../../concepts/map) to be executed in parallel. +[parallel](/concepts/parallel) or [map](/concepts/map) to be executed in parallel. 2. Set it to false, to debug a failed container. -3. Setting it to true will behave exactly like a [local executor](#local). +3. Setting it to true will behave exactly like a [local executor](/configurations/executors/local/). 4. Pass any environment variables into the container. 5. Please refer to [step overrides](#step_override) for more details. The ```docker_image``` field is required and default image to execute tasks -of the pipeline. Individual [tasks](../../../concepts/task) can +of the pipeline. Individual [tasks](/concepts/task) can [override](#step_override) the global defaults of executor by providing ```overrides``` @@ -63,7 +63,7 @@ the patterns. 
Assumed to be present at ```examples/configs/local-container.yaml```

- The docker image is a [variable](#dynamic_name_of_the_image) and
+ The docker image is a [variable](/configurations/executors/container-environments/#dynamic_name_of_the_image) and
dynamically set during execution.

```yaml linenums="1" hl_lines="4"
@@ -72,7 +72,7 @@ the patterns.

1. Use the local-container executor type to execute the pipeline.
2. By default, all the tasks are executed in the docker image. Please
- refer to [building docker images](#container_environments)
+ refer to [building docker images](/configurations/executors/container-environments/#dynamic_name_of_the_image)
3. Pass any environment variables that are needed for the container.
4. Store the run logs in the file-system. Magnus will handle the access to
them by mounting the file system into the container.

@@ -81,7 +81,7 @@ the patterns.

=== "python sdk"

Running the SDK defined pipelines for any container based executions [happens in
- multi-stage process](#container_environments).
+ a multi-stage process](/configurations/executors/container-environments/).

1. Generate the ```yaml``` definition file by:
```MAGNUS_CONFIGURATION_FILE=examples/configs/local-container.yaml python examples/concepts/simple.py```

@@ -95,7 +95,7 @@ the patterns.
```

1. You can provide a configuration file dynamically by using the environment
- variable ```MAGNUS_CONFIGURATION_FILE```. Please see [SDK for more details](../../sdk).
+ variable ```MAGNUS_CONFIGURATION_FILE```. Please see [SDK for more details](/sdk).

@@ -302,7 +302,7 @@ executor.

As seen in the above example, running the SDK defined pipelines for any container based executions [happens in
- multi-stage process](#container_environments).
+ a multi-stage process](/configurations/executors/container-environments/).

1. Generate the ```yaml``` definition file by:
```MAGNUS_CONFIGURATION_FILE=examples/executors/local-container-override.yaml python examples/executors/step_overrides_container.py```

diff --git a/docs/configurations/executors/local.md b/docs/configurations/executors/local.md
index 86f2afc3..f1299126 100644
--- a/docs/configurations/executors/local.md
+++ b/docs/configurations/executors/local.md
@@ -24,7 +24,7 @@ config:
```
1. By default, all tasks are sequentially executed. Provide ```true``` to enable tasks within
-[parallel](../../concepts/parallel) or [map](../../concepts/map) to be executed in parallel.
+[parallel](/concepts/parallel) or [map](/concepts/map) to be executed in parallel.

diff --git a/docs/configurations/executors/mocked.md b/docs/configurations/executors/mocked.md
index 9bec47e2..84d26f83 100644
--- a/docs/configurations/executors/mocked.md
+++ b/docs/configurations/executors/mocked.md
@@ -23,7 +23,7 @@ to run and the configuration of the command.

#### Command configuration for notebook nodes

```python``` and ```shell``` based tasks have no configuration options apart from the ```command```.
-Notebook nodes have additional configuration options [detailed in concepts](../../../concepts/task/#notebook). Ploomber engine provides [rich options](https://engine.ploomber.io/en/docs/user-guide/debugging/debuglater.html) for debugging failed notebooks.
+Notebook nodes have additional configuration options [detailed in concepts](/concepts/task/#notebook). Ploomber engine provides [rich options](https://engine.ploomber.io/en/docs/user-guide/debugging/debuglater.html) for debugging failed notebooks.

## Example

@@ -211,7 +211,7 @@ take an example pipeline to test the behavior of the traversal.
The below pipeline is designed to follow: ```step 1 >> step 2 >> step 3``` in case of no failures
and ```step 1 >> step 3``` in case of failure. The traversal is
-[shown in concepts](../../../concepts/pipeline/#on_failure).
+[shown in concepts](/concepts/pipeline/#on_failure).

!!! tip "Asserting Run log"

diff --git a/docs/configurations/overview.md b/docs/configurations/overview.md
index 960603f6..892f8c31 100644
--- a/docs/configurations/overview.md
+++ b/docs/configurations/overview.md
@@ -1,15 +1,15 @@
**Magnus** is designed to enable effective collaboration between data scientists/researchers
and infrastructure engineers.

-All the features described in the [concepts](../../concepts/the-big-picture) are
+All the features described in the [concepts](/concepts/the-big-picture) are
aimed at the *research* side of data science projects while configurations
add *scaling* features to them.

Configurations are provided during the execution:

-For ```yaml``` based pipelines, use the ```--config-file, -c``` option in the [magnus CLI](../../usage/#usage).
+For ```yaml``` based pipelines, use the ```--config-file, -c``` option in the [magnus CLI](/usage/#usage).

-For [python SDK](../../sdk/#magnus.Pipeline.execute), use the ```configuration_file``` option or the
+For [python SDK](/sdk/#magnus.Pipeline.execute), use the ```configuration_file``` option or the
environment variable ```MAGNUS_CONFIGURATION_FILE```

## Default configuration
@@ -25,8 +25,12 @@ environment variable ```MAGNUS_CONFIGURATION_FILE```

5. No experiment tracking tools, all interactions with experiment tracking tools are effectively no-ops.
The run log still captures the metrics, but they are not passed to the experiment tracking tools.

-The default configuration for all the pipeline executions runs on the [local compute](../executors/local), using a
-```buffered``` run log store with no catalog or secrets or experiment tracking functionality.
+The default configuration for all the pipeline executions runs on the
+[local compute](/configurations/executors/local), using a
+[buffered run log](/configurations/run-log/#buffered) store with
+[no catalog](/configurations/catalog/#do-nothing) or
+[secrets](/configurations/secrets/#do-nothing) or
+[experiment tracking functionality](/configurations/experiment-tracking/).

diff --git a/docs/configurations/run-log.md b/docs/configurations/run-log.md
index d07a902f..9a728a10 100644
--- a/docs/configurations/run-log.md
+++ b/docs/configurations/run-log.md
@@ -2,7 +2,7 @@ Along with tracking the progress and status of the execution of the pipeline, the run log
also keeps track of parameters, experiment tracking metrics, data flowing through
the pipeline and any reproducibility metrics emitted by the tasks of the pipeline.

-Please refer here for detailed [information about run log](../../../concepts/run-log).
+Please refer here for detailed [information about the run log](/concepts/run-log).

## buffered

@@ -74,7 +74,7 @@ run_log_store:

=== "Run log"

- The structure of the run log is [detailed in concepts](../../../concepts/run-log).
+ The structure of the run log is [detailed in concepts](/concepts/run-log).

```json linenums="1"
{
@@ -276,7 +276,7 @@ run_log_store:

=== "Run log"

- The structure of the run log is [detailed in concepts](../../../concepts/run-log).
+ The structure of the run log is [detailed in concepts](/concepts/run-log).
=== "RunLog.json" diff --git a/docs/configurations/secrets.md b/docs/configurations/secrets.md index a75df36a..d767ab34 100644 --- a/docs/configurations/secrets.md +++ b/docs/configurations/secrets.md @@ -1,7 +1,7 @@ **Magnus** provides an interface to secrets managers -[via the API](../../interactions/#magnus.get_secret). +[via the API](/interactions/#magnus.get_secret). -Please refer to [Secrets in concepts](../../concepts/secrets) for more information. +Please refer to [Secrets in concepts](/concepts/secrets) for more information. ## do-nothing @@ -133,7 +133,7 @@ for shell scripts are allowed. The example is present in ```examples/secrets.py``` - ```python linenums="1" hl_lines="9-10" + ```python linenums="1" hl_lines="12-13" --8<-- "examples/secrets.py" ``` diff --git a/docs/example/dataflow.md b/docs/example/dataflow.md index 9f672d79..039620c5 100644 --- a/docs/example/dataflow.md +++ b/docs/example/dataflow.md @@ -13,12 +13,12 @@ using catalog. This can be controlled either by the configuration or by python A ## Flow of Parameters -The [initial parameters](../../concepts/parameters) of the pipeline can set by using a ```yaml``` file and presented +The [initial parameters](/concepts/parameters) of the pipeline can set by using a ```yaml``` file and presented during execution -```--parameters-file, -parameters``` while using the [magnus CLI](../../usage/#usage) +```--parameters-file, -parameters``` while using the [magnus CLI](/usage/#usage) -or by using ```parameters_file``` with [the sdk](../../sdk/#magnus.Pipeline.execute). +or by using ```parameters_file``` with [the sdk](/sdk/#magnus.Pipeline.execute). === "Initial Parameters" @@ -121,7 +121,7 @@ or by using ```parameters_file``` with [the sdk](../../sdk/#magnus.Pipeline.exec **Magnus** stores all the artifacts/files/logs generated by ```task``` nodes in a central storage called -[catalog](../../concepts/catalog). +[catalog](/concepts/catalog). The catalog is indexed by the ```run_id``` of the pipeline and is unique for every execution of the pipeline. Any ```task``` of the pipeline can interact with the ```catalog``` to get and put artifacts/files diff --git a/docs/example/example.md b/docs/example/example.md index 9e0ccde7..ca52d9c1 100644 --- a/docs/example/example.md +++ b/docs/example/example.md @@ -1,6 +1,6 @@ -Magnus revolves around the concept of [pipelines or workflows](../../concepts/pipeline). +Magnus revolves around the concept of [pipelines or workflows](/concepts/pipeline). Pipelines defined in magnus are translated into other workflow engine definitions like [Argo workflows](https://argoproj.github.io/workflows/) or [AWS step functions](https://aws.amazon.com/step-functions/). @@ -60,7 +60,7 @@ This pipeline can be represented in **magnus** as below: === "Run log" - Please see [Run log](../../concepts/run-log) for more detailed information about the structure. + Please see [Run log](/concepts/run-log) for more detailed information about the structure. ```json linenums="1" { @@ -69,7 +69,7 @@ This pipeline can be represented in **magnus** as below: "use_cached": false, "tag": "", "original_run_id": "", - "status": "SUCCESS", // (2) + "status": "SUCCESS", / (2) "steps": { "Acquire Data": { "name": "Acquire Data", // (3) @@ -331,10 +331,10 @@ This pipeline can be represented in **magnus** as below: Independent of the platform it is run on, -- [x] The [pipeline definition](../../concepts/pipeline) remains the same from an author point of view. 
+- [x] The [pipeline definition](/concepts/pipeline) remains the same from an author point of view. The data scientists are always part of the process and contribute to the development even in production environments. -- [x] The [run log](../../concepts/run-log) remains the same except for the execution configuration enabling users +- [x] The [run log](/concepts/run-log) remains the same except for the execution configuration enabling users to debug the pipeline execution in lower environments for failed executions or to validate the expectation of the execution. @@ -344,7 +344,7 @@ expectation of the execution. ## Example configuration To run the pipeline in different environments, we just provide the -[required configuration](../../configurations/overview). +[required configuration](/configurations/overview). === "Default Configuration" @@ -360,7 +360,7 @@ To run the pipeline in different environments, we just provide the === "Argo Configuration" - To render the pipeline in [argo specification](../../configurations/executors/argo/), mention the + To render the pipeline in [argo specification](/configurations/executors/argo/), mention the configuration during execution. yaml: @@ -370,7 +370,7 @@ To run the pipeline in different environments, we just provide the python: - Please refer to [containerised environments](../../configurations/executors/container-environments/) for more information. + Please refer to [containerised environments](/configurations/executors/container-environments/) for more information. MAGNUS_CONFIGURATION_FILE=examples/configs/argo-config.yaml python examples/contrived.py && magnus execute -f magnus-pipeline.yaml -c examples/configs/argo-config.yaml @@ -380,7 +380,7 @@ To run the pipeline in different environments, we just provide the 1. Use argo workflows as the execution engine to run the pipeline. 2. Run this docker image for every step of the pipeline. Please refer to - [containerised environments](../../configurations/executors/container-environments/) for more details. + [containerised environments](/configurations/executors/container-environments/) for more details. 3. Mount the volume from Kubernetes persistent volumes (magnus-volume) to /mnt directory. 4. Resource constraints for the container runtime. 5. Since every step runs in a container, the run log should be persisted. Here we are using the file-system as our diff --git a/docs/example/experiment-tracking.md b/docs/example/experiment-tracking.md index 66874f5b..e63e04ca 100644 --- a/docs/example/experiment-tracking.md +++ b/docs/example/experiment-tracking.md @@ -1,8 +1,8 @@ Metrics in data science projects summarize important information about the execution and performance of the experiment. -Magnus captures [this information as part of the run log](../../concepts/experiment-tracking) and also provides -an [interface to experiment tracking tools](../../concepts/experiment-tracking/#experiment_tracking_tools) +Magnus captures [this information as part of the run log](/concepts/experiment-tracking) and also provides +an [interface to experiment tracking tools](/concepts/experiment-tracking/#experiment_tracking_tools) like [mlflow](https://mlflow.org/docs/latest/tracking.html) or [Weights and Biases](https://wandb.ai/site/experiment-tracking). @@ -197,6 +197,6 @@ like [mlflow](https://mlflow.org/docs/latest/tracking.html) or The metrics are also sent to mlflow.
- ![Image](../assets/screenshots/mlflow_example.png){ width="800" height="600"} + ![Image](/assets/screenshots/mlflow_example.png){ width="800" height="600"}
mlflow UI for the execution. The run_id remains the same as the run_id of the magnus execution.
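For reference, metrics like the ones shown above are emitted from within a task. A minimal sketch, assuming the ```track_this``` helper from the magnus interactions API (the exact signature may vary between versions):

```python
# my_module.py - a hypothetical task emitting metrics.
from magnus import track_this

def train_model():
    # Each keyword argument is captured as a user defined metric in the
    # run log and forwarded to the configured experiment tracking tool.
    track_this(accuracy=0.95, loss=0.05)
```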
diff --git a/docs/example/reproducibility.md b/docs/example/reproducibility.md
index 0225e5a5..47ef34cb 100644
--- a/docs/example/reproducibility.md
+++ b/docs/example/reproducibility.md
@@ -1,4 +1,4 @@
-Magnus stores a variety of information about the current execution in [run log](../../concepts/run-log).
+Magnus stores a variety of information about the current execution in the [run log](/concepts/run-log).
The run log is internally used for keeping track of the execution (status of different steps, parameters, etc)
but also has rich information for reproducing the state at the time of pipeline execution.

@@ -227,5 +227,5 @@ Below we show an example pipeline and the different layers of the run log.

-This [structure of the run log](../../concepts/run-log) is the same independent of where the pipeline was executed.
+This [structure of the run log](/concepts/run-log) is the same independent of where the pipeline was executed.
This enables you to reproduce a failed execution from complex environments in a local environment for easier debugging.

diff --git a/docs/example/retry-after-failure.md b/docs/example/retry-after-failure.md
index 93838787..69060f29 100644
--- a/docs/example/retry-after-failure.md
+++ b/docs/example/retry-after-failure.md
@@ -1,4 +1,4 @@
-Magnus allows you to [debug and recover](../../concepts/run-log/#retrying_failures) from a
+Magnus allows you to [debug and recover](/concepts/run-log/#retrying_failures) from a
failure during the execution of the pipeline. The pipeline can be restarted in any suitable environment
for debugging.

@@ -585,7 +585,7 @@ Below is an example of retrying a pipeline that failed.
```

-Magnus also supports a [```mocked``` executor](../../configurations/executors/mocked) which can
+Magnus also supports a [```mocked``` executor](/configurations/executors/mocked) which can
patch and mock tasks to isolate and focus on the failed task.
Since python functions and notebooks are run in the same shell, it is possible to use
[python debugger](https://docs.python.org/3/library/pdb.html) and

diff --git a/docs/example/secrets.md b/docs/example/secrets.md
index 7f771eb2..6ecdea06 100644
--- a/docs/example/secrets.md
+++ b/docs/example/secrets.md
@@ -1,5 +1,5 @@
Secrets become required assets as the complexity of the application increases. Magnus provides a
-python API to get secrets from various sources.
+[python API](/interactions/#magnus.get_secret) to get secrets from various sources.

!!! info annotate inline end "from magnus import get_secret"

diff --git a/docs/example/steps.md b/docs/example/steps.md
index 46280fe8..9b2360c3 100644
--- a/docs/example/steps.md
+++ b/docs/example/steps.md
@@ -2,10 +2,10 @@ Magnus provides a rich definition of step types.
-- [stub](../../concepts/stub): A mock step which is handy during designing and debugging pipelines. -- [task](../../concepts/task): To execute python functions, jupyter notebooks, shell scripts. -- [parallel](../../concepts/parallel): To execute many tasks in parallel. -- [map](../../concepts/map): To execute the same task over a list of parameters. (1) +- [stub](/concepts/stub): A mock step which is handy during designing and debugging pipelines. +- [task](/concepts/task): To execute python functions, jupyter notebooks, shell scripts. +- [parallel](/concepts/parallel): To execute many tasks in parallel. +- [map](/concepts/map): To execute the same task over a list of parameters. (1)
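To make the step types above concrete, a minimal sketch of a two step pipeline using the python SDK is shown below (assuming the ```Stub``` and ```Task``` classes exported by the SDK; treat the exact names and signatures as illustrative):

```python
# pipeline.py - a hypothetical two step pipeline: a stub placeholder
# followed by a python task.
from magnus import Pipeline, Stub, Task

def main():
    placeholder = Stub(name="design placeholder")
    train = Task(
        name="train",
        command="my_module.train_model",  # dotted path to a python function
        terminate_with_success=True,
    )
    placeholder >> train  # the stub runs first, then the task

    pipeline = Pipeline(
        steps=[placeholder, train],
        start_at=placeholder,
        add_terminal_nodes=True,  # adds the success and fail nodes
    )
    pipeline.execute()

if __name__ == "__main__":
    main()
```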
@@ -40,12 +40,12 @@ Used as a mock node or a placeholder before the actual implementation (1). ## task -Used to execute a single unit of work. You can use [python](../../concepts/task/#python_functions), -[shell](../../concepts/task/#shell), [notebook](../../concepts/task/#notebook) as command types. +Used to execute a single unit of work. You can use [python](/concepts/task/#python_functions), +[shell](/concepts/task/#shell), [notebook](/concepts/task/#notebook) as command types. !!! note annotate "Execution logs" - You can view the execution logs of the tasks in the [catalog](../../concepts/catalog) without digging through the + You can view the execution logs of the tasks in the [catalog](/concepts/catalog) without digging through the logs from the underlying executor. @@ -63,7 +63,7 @@ Used to execute a single unit of work. You can use [python](../../concepts/task/ --8<-- "examples/python-tasks.yaml" ``` - 1. Note that the ```command``` is the [path to the python function](../../concepts/task/#python_functions). + 1. Note that the ```command``` is the [path to the python function](/concepts/task/#python_functions). 2. ```python``` is default command type, you can use ```shell```, ```notebook``` too. === "python" @@ -72,7 +72,7 @@ Used to execute a single unit of work. You can use [python](../../concepts/task/ --8<-- "examples/python-tasks.py" ``` - 1. Note that the command is the [path to the function](../../concepts/task/#python_functions). + 1. Note that the command is the [path to the function](/concepts/task/#python_functions). 2. There are many ways to define dependencies within nodes, step1 >> step2, step1 << step2 or during the definition of step1, we can define a next step. 3. ```terminate_with_success``` indicates that the dag is completed successfully. You can also use ```terminate_with_failure``` to indicate the dag failed. 4. Add ```success``` and ```fail``` nodes to the dag. diff --git a/docs/examples.md b/docs/examples.md deleted file mode 100644 index c4aeba99..00000000 --- a/docs/examples.md +++ /dev/null @@ -1,1879 +0,0 @@ -# Examples - -## Executing a notebook - -You can execute a Jupyter notebook by: - -```shell -magnus execute_notebook my_notebook.ipynb -``` - -The notebook file should have an extension of ```ipynb```. - -This execution would run on the local machine and the output notebook would be put in the ```.catalog``` folder called -```my_notebook_output.ipynb```. - -To change the compute environment, please provide the relevant configuration file. - - ---- - -## Executing a python function - -You can execute a python function defined in my_module: - -```python -# In my_module.py - -def my_function(): - print('In the function, my_function of my_module') - -``` - -by invoking magnus as follows: - -```shell -magnus execute_function my_module.my_function -``` - -This execution would run on the local machine and the captured output of the function would be added to the catalog -folder, ```.catalog``` in this case, as my_module.my_function.log - -To change the compute environment, please provide the relevant configuration file. - ---- - -## A single node pipeline - - -Assuming you have one simple function call as part of a pipeline defined below: - -```python -# In my_module.py - -def my_function(): - print('In the function, my_function of my_module') -``` - -You can define the single node pipeline either by: - -### YAML definition - -Every pipeline defined via YAML in magnus should have a ```success``` node and ```fail``` node. 
-
-The starting node of the pipeline is denoted by ```start_at``` and every node needs to define the next
-node to traverse during successful execution of the current node using ```next```.
-
-Nodes can optionally mention the node to traverse during failure using ```on_failure```.
-
-The pipeline below contains one node which calls the above function.
-
-```yaml
-dag:
-  description: A single node pipeline
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: success
-      command: my_module.my_function
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-### Python SDK
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command='my_module.my_function')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
-You can execute it via ```python pipeline.py```.
-
----
-
-## Mocking a node in pipeline
-
-In magnus, you can skip the execution of a node or mock it by using a node of type ```as-is```.
-This functionality is useful when you want to focus on designing the flow of code but not the specific implementation.
-
-Example:
-
-```yaml
-dag:
-  description: A single node pipeline with mock
-  start_at: step 1
-  steps:
-    step 1:
-      type: as-is # The function would not execute as this is an as-is node
-      next: success
-      command: my_module.my_function # arbitrary config can be passed
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline, AsIs
-
-def pipeline():
-    first = AsIs(name='step 1', command='my_module.my_function') # The function would not execute
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
----
-
-## Using shell commands as part of the pipeline
-
-A magnus pipeline can have shell commands as part of the pipeline. The only caveat in doing so is that magnus
-would not be able to support returning ```parameters```, ```secrets``` or any of the built-in functions. The cataloging
-functionality of magnus would still work via the configuration file.
-
-Parameters can be accessed by looking for environment variables with a prefix of ```MAGNUS_PRM_```.
-
-Example: Step 1 of the below pipeline would
-
-- Get all the files from the catalog to the ```compute_data_folder```.
-- Execute the command ```python -m my_module.my_function``` in the shell.
-- Put all the files from the ```compute_data_folder``` to the catalog.
-
-```yaml
-dag:
-  description: A single node pipeline with shell
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: success
-      command: python -m my_module.my_function # You can use this to call some executable in the PATH
-      command_type: shell
-      catalog:
-        get:
-          - "*"
-        put:
-          - "*"
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    catalog_config = {
-        'get' : ['*'],
-        'put' : ['*'],
-    }
-    first = Task(name='step 1', command='python -m my_module.my_function', command_type='shell', catalog=catalog_config)
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
----
-## Using python lambda expressions in pipeline
-
-You can use python lambda expressions as a task type. Please note that you cannot have ```_``` or ```__``` as part of
-the expression. This is to prevent malicious code from being passed into the expression. In the example below,
-```step 1``` takes in a parameter ```x``` and returns the integer ```x + 1```.
-
-Example:
-
-```yaml
-dag:
-  description: A single node pipeline with python lambda
-  start_at: step 1
-  steps:
-    step 1:
-      command_type: python-lambda
-      command: "lambda x: {'x': int(x) + 1}"
-      next: success
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command="lambda x: {'x': int(x) + 1}", command_type='python-lambda')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
----
-
-## Using notebook in pipeline
-
-You can use notebooks as a ```command_type``` of a step in the pipeline. Any of the functionality from python functions
-is available via notebooks too.
-
-We use [ploomber](https://ploomber.io/) to inspect the parameters and send them dynamically from the parameter space.
-
-The ```command``` should point to the notebook that you want to use as a task.
-The name of the output notebook can be provided by using ```notebook_output_path```; it defaults to the
-notebook mentioned in the ```command``` section, post-fixed with ```_out```.
-
-
-```yaml
-dag:
-  description: A single node pipeline with notebook
-  start_at: step 1
-  steps:
-    step 1:
-      command_type: notebook
-      command: pre_processing.ipynb
-      next: success
-      notebook_output_path: notebooks/output.ipynb
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command='pre_processing.ipynb', command_type='notebook', notebook_output_path="notebooks/output.ipynb")
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
-The file name should end with ```.ipynb```.
-
----
-
-## A multi node pipeline
-
-A pipeline can have many nodes as part of its execution.
-
-Example:
-
-```python
-# In my_module.py
-
-def first_function():
-    print('In the function, first_function of my_module')
-
-
-def second_function():
-    print('In the function, second_function of my_module')
-
-```
-
-
-The pipeline which calls ```first_function``` of the above module and then ```second_function``` is given below.
-
-```yaml
-dag:
-  description: A multi node pipeline
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: step 2
-      command: my_module.first_function
-      command_type: python
-    step 2:
-      type: task
-      next: success
-      command: my_module.second_function
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command='my_module.first_function', next_node='step 2')
-    second = Task(name='step 2', command='my_module.second_function')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first, second])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
-
----
-
-## Using on-failure to handle errors
-
-You can instruct magnus to traverse to a different node of the dag if the current node fails to execute.
-A non-zero exit status of the python function or shell command is considered a failure.
-
-The default behavior in case of a failure of a node, if no ```on_failure``` is defined, is to
-traverse to the ```fail``` node of the graph and mark the execution of the dag as a failure.
-
-The execution of a dag is considered a failure if and only if the ```fail``` node of the graph is reached.
-
-```python
-# In my_module.py
-
-def first_function():
-    print('In the function, first_function of my_module')
-
-
-def second_function():
-    print('In the function, second_function of my_module')
-
-
-def handle_error():
-    print('Send an email notification')
-    ## Some logic to send error notification
-    ...
-
-```
-
-The pipeline definition to call ```my_module.handle_error``` in case of a failure of any node is defined below.
-
-
-```yaml
-dag:
-  description: A multi node pipeline with on_failure
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: step 2
-      command: my_module.first_function
-      command_type: python
-      on_failure: graceful exit
-    step 2:
-      type: task
-      next: success
-      command: my_module.second_function
-      command_type: python
-      on_failure: graceful exit
-    graceful exit:
-      type: task
-      next: fail
-      command: my_module.handle_error
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command='my_module.first_function', next_node='step 2', on_failure='graceful exit')
-    second = Task(name='step 2', command='my_module.second_function')
-    third = Task(name='graceful exit', command='my_module.handle_error', next_node='fail')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first, second, third])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
----
-## Passing parameters between nodes
-
-There are several ways we can pass parameters between nodes. Please note that this functionality is only for simple
-python data types which are JSON serializable. Use the catalog functionality to pass files across to different
-nodes of the graph.
-
-You can choose any of the methods below to pass the parameters. All are compatible with each other.
-
-The example pipeline to call all the below functions is given here:
-
-
-```yaml
-dag:
-  description: A multi node pipeline to pass parameters
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: step 2
-      command: my_module.first_function
-      command_type: python
-    step 2:
-      type: task
-      next: success
-      command: my_module.second_function
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command='my_module.first_function', next_node='step 2')
-    second = Task(name='step 2', command='my_module.second_function')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first, second])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-```
-
-
-### Pythonically
-
-```python
-# In my_module.py
-
-def first_function():
-    print('In the function, first_function of my_module')
-    return {'a': 4}
-
-
-def second_function(a):
-    print('In the function, second_function of my_module')
-    print(a)
-
-```
-
-In the above code, ```first_function``` is returning a dictionary setting ```a``` to be 4. If the function was called
-as a step in the magnus pipeline, magnus adds the key-value pair of ```a=4``` to the parameter space. Note that
-```first_function``` can return a dictionary containing as many key-value pairs as needed, magnus would add all of them
-to the parameter space.
-
-```second_function``` is expecting a ```named``` argument ```a```. If the function was called as a step in the magnus
-pipeline, magnus would look for a parameter ```a``` in the parameter space and assign it.
-
-Very loosely, the whole process can be thought of as: ```second_function(**first_function())```. Since magnus holds
-the parameter space, the functions need not be consecutive and magnus handles passing only the required arguments into
-the function.
-
-
-### Using in-built functions
-You can also use the built-in functions that magnus provides to ```store``` and ```get``` parameters.
-
-```python
-# In my_module.py
-from magnus import store_parameter, get_parameter
-
-def first_function():
-    print('In the function, first_function of my_module')
-    store_parameter(a=4)
-
-
-def second_function():
-    print('In the function, second_function of my_module')
-    a = get_parameter('a') # Get parameter with name provides only the named parameter.
-    parameters = get_parameter() # Returns a dictionary of all the parameters
-    print(a) # prints 4
-    print(parameters) # prints {'a': 4}
-
-```
-
-### Using environment variables
-The parameters can also be accessed by using environment variables. All magnus specific parameters would be prefixed
-by ```MAGNUS_PRM_```. Any environment variable that is prefixed by ```MAGNUS_PRM_``` is also added to the parameter
-space.
-
-```python
-# In my_module.py
-import os
-
-def first_function():
-    print('In the function, first_function of my_module')
-    os.environ['MAGNUS_PRM_a'] = '4'
-
-
-def second_function():
-    print('In the function, second_function of my_module')
-    a = os.environ['MAGNUS_PRM_a']
-    print(a)
-
-```
-
----
-
-## Passing parameters to the first node of the pipeline
-
-There are several ways to set parameters at the start of the execution of the pipeline. Please choose one that fits
-your situation.
-
-### During execution of pipeline by magnus
-
-The step ```step parameters``` of the below pipeline expects a parameter ```x``` in the lambda expression.
-
-```yaml
-# in getting-started.yaml
-dag:
-  description: Getting started
-  start_at: step parameters
-  steps:
-    step parameters:
-      type: task
-      command_type: python-lambda
-      command: "lambda x: {'x': int(x) + 1}"
-      next: success
-    success:
-      type: success
-    fail:
-      type: fail
-```
-
-or via Python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command="lambda x: {'x': int(x) + 1}", command_type='python-lambda')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-```
-
-You can pass the parameter during the execution of the run like below.
-
-```shell
-magnus execute --file getting-started.yaml --parameters-file parameters.yaml
-```
-
-```yaml
-# in parameters.yaml
-x: 3
-```
-
-### Using environment variables
-
-For the same pipeline defined in ```getting-started.yaml```, you can also pass the parameters as environment variables
-prefixed by ```MAGNUS_PRM_```.
-
-The below command does the same job of passing ```x``` as 3.
-
-```shell
-MAGNUS_PRM_x=3 magnus execute --file getting-started.yaml
-```
-
-You can pass in as many parameters as you want by prefixing them with ```MAGNUS_PRM_```. All parameters would be read
-as ```strings``` and have to be cast appropriately by the code.
-
-This method of sending parameters by environment variables is independent of who does the pipeline execution.
-
----
-## Using the catalog to pass artifacts between nodes
-
-While parameters are used to transfer simple and JSON serializable data types, the catalog can be used to make larger files
-or artifacts available to downstream nodes. A typical configuration of a catalog provider would be:
-
-```yaml
-catalog:
-  type: # defaults to file-system
-  config:
-    compute_data_folder: # defaults to data/
-```
-
-If no config is provided, magnus defaults to ```file-system```.
-
-Logically magnus does the following:
-
-- ```get``` files from the catalog before the execution to a specific ```compute data folder```
-- execute the command
-- ```put``` any files from the ```compute data folder``` back to the catalog.
-
-### Using the configuration.
-
-```yaml
-dag:
-  description: Getting started
-  start_at: step shell make data
-  steps:
-    step shell make data:
-      type: task
-      command_type: shell
-      command: mkdir data ; env >> data/data.txt
-      next: step shell ls data
-      catalog:
-        put:
-          - "*"
-    step shell ls data:
-      type: task
-      command_type: shell
-      command: ls data/
-      next: success
-      catalog:
-        compute_data_folder: data/ # This is the default value too.
-        get:
-          - "*"
-    success:
-      type: success
-    fail:
-      type: fail
-```
-
-or via Python SDK:
-
-```python
-
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    catalog_get_all = {
-        'get' : ['*']
-    }
-
-    catalog_put_all = {
-        'put': ['*']
-    }
-
-    first = Task(name='step shell make data', command='mkdir data ; env >> data/data.txt', command_type='shell',
-                 catalog=catalog_put_all)
-    second = Task(name='step shell ls data', command='ls data/', command_type='shell',
-                  catalog=catalog_get_all)
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first, second])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-
-
-```
-
-
-In the above dag definition, ```step shell make data``` makes a data folder, dumps the environment variables into
-the ```data.txt``` file and instructs the catalog to ```put``` all (i.e '*') files into the catalog for downstream nodes.
-
-The step ```step shell ls data``` instructs the catalog to ```get``` all (i.e '*') files from the catalog, put
-them in the ```compute_data_folder``` which is ```data```, and executes the command to see the contents of the directory.
-
-You can override the ```compute_data_folder``` of a single step to any folder that you want as shown.
-
-Glob patterns are perfectly allowed and you can use them to selectively ```get``` or ```put``` files in the catalog.
-
-### Using the in-built functions
-
-You can interact with the catalog from the python code too if that is convenient.
-
-```python
-# In my_module.py
-from pathlib import Path
-
-from magnus import put_in_catalog, get_from_catalog
-
-def first_function():
-    print('In the function, first_function of my_module')
-    Path('data').mkdir(parents=True, exist_ok=True)
-
-    with open('data/data.txt', 'w') as fw:
-        fw.write('something interesting')
-
-    # filepath is required and can be a glob pattern
-    put_in_catalog(filepath='data/data.txt')
-
-def second_function():
-    print('In the function, second_function of my_module')
-
-    # name is required and can be a glob pattern.
-    # destination_folder is defaulted to the compute_data_folder as defined in the config
-    get_from_catalog(name='data.txt', destination_folder='data/')
-
-```
-
-The python function ```first_function``` makes the ```compute_data_folder``` and instructs the catalog to put the file in the
-catalog. The python function ```second_function``` instructs the catalog to get the file by name ```data.txt``` from
-the catalog and put it in the folder ```data/```. You can use glob patterns both in ```put_in_catalog``` and
-```get_from_catalog```.
-
-The corresponding pipeline definition need not even be aware of the cataloging happening within the functions.
-
-```yaml
-dag:
-  description: A multi node pipeline
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: step 2
-      command: my_module.first_function
-      command_type: python
-    step 2:
-      type: task
-      next: success
-      command: my_module.second_function
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
----
-## Using the catalog to source external data
-
-In magnus, you can only ```get``` from the catalog if the catalog location already exists. Calling ```put```,
-which safely creates the catalog location if it does not exist, before any ```get``` ensures that the catalog
-location is always present.
-
-But there are situations where you want to call ```get``` before any data is ```put``` into the catalog location by the
-steps of the pipeline.
-For example, you may want to source a data file generated by external processes and transform it
-in your pipeline. You can achieve that by the fact that all catalog providers (eg. file-system and extensions) use
-```run_id``` as the directory (or partition) of the catalog.
-
-To source data from external sources for a particular run,
-
-- Create a ```run_id``` that you want to use for pipeline execution.
-- Create the directory (or partition) in the catalog location by that ```run_id```
-- Copy the contents that you want the pipeline steps to access into the catalog location.
-- Run the magnus pipeline by providing the ```run_id``` i.e. ```magnus execute --run-id run_id --file <>```
-
-Since the catalog location already exists, ```get``` from the catalog will source the external data.
-
----
-## Accessing secrets within code.
-
-Secrets are the only service that magnus provides where you need to ```import magnus``` in your source code. This is
-to ensure that the integrity of the secrets is held and that they are handled safely.
-
-A typical configuration of the secrets is:
-
-```yaml
-secrets:
-  type: # defaults to do-nothing
-  config:
-```
-
-By default, magnus chooses a ```do-nothing``` secrets provider which holds no secrets. For local development, the
-```dotenv``` secrets manager is useful and the config is as below.
-
-```yaml
-secrets:
-  type: dotenv
-  config:
-    location: # defaults to .env
-```
-
-Example:
-
-```
-#Inside .env file
-secret_name=secret_value#Any comment that you want to pass
-
-```
-
-Any content after # is ignored and the format is ```key=value``` pairs.
-
-```python
-# In my_module.py
-from magnus import get_secret
-
-def first_function():
-    print('In the function, first_function of my_module')
-    secret_value = get_secret('secret_name')
-    print(secret_value) # Should print secret_value
-
-    secrets = get_secret()
-    print(secrets) # Should print {'secret_name': 'secret_value'}
-```
-
-The pipeline to run the above function as a step is given below.
-
-```yaml
-# in config.yaml
-secrets:
-  type: dotenv
-  config:
-    location: # defaults to .env
-
-# in pipeline.yaml
-dag:
-  description: Demo of secrets
-  start_at: step 1
-  steps:
-    step 1:
-      type: task
-      next: success
-      command: my_module.first_function
-      command_type: python
-    success:
-      type: success
-    failure:
-      type: fail
-```
-
-or via Python SDK:
-
-```python
-#in pipeline.py
-
-from magnus import Task, Pipeline
-
-def pipeline():
-    first = Task(name='step 1', command='my_module.first_function')
-    pipeline = Pipeline(start_at=first, name='my first pipeline')
-    pipeline.construct([first])
-    pipeline.execute()
-
-if __name__ == '__main__':
-    pipeline()
-```
-
----
-## Parallel node
-
-We will be using ```as-is``` nodes as part of the examples to keep it simple but the concepts of nesting/branching
-remain the same even in the case of actual tasks.
- -Example of a parallel node: - -```yaml -# In config.yaml -run_log_store: - type: file-system # Use chunked-fs when using parallel - -# In pipeline.yaml -dag: - description: DAG for testing with as-is and parallel - start_at: step1 - steps: - step1: - type: as-is - next: step2 - step2: - type: parallel - next: success - branches: - branch_1: - start_at: step_1 - steps: - step_1: - type: as-is - next: success - success: - type: success - fail: - type: fail - branch_2: - start_at: step_1 - steps: - step_1: - type: as-is - next: success - success: - type: success - fail: - type: fail - success: - type: success - fail: - type: fail -``` - -You can execute the above dag by: - -```magnus execute --file example-parallel.yaml``` - -The above run should produce a ```run_log``` in the ```.run_log_store``` directory with the ```run_id``` as filename. - -The contents of the log should be similar to this: - -
- Click to show the run log - - -```json - -{ - "run_id": "20220120131257", - "dag_hash": "cf5cc7df88d4af3bc0936a9a8a3c4572ce4e11bc", - "use_cached": false, - "tag": null, - "original_run_id": "", - "status": "SUCCESS", - "steps": { - "step1": { - "name": "step1", - "internal_name": "step1", - "status": "SUCCESS", - "step_type": "as-is", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 13:12:57.999265", - "end_time": "2022-01-20 13:12:57.999287", - "duration": "0:00:00.000022", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - }, - "step2": { - "name": "step2", - "internal_name": "step2", - "status": "SUCCESS", - "step_type": "parallel", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [], - "user_defined_metrics": {}, - "branches": { - "step2.branch_1": { - "internal_name": "step2.branch_1", - "status": "SUCCESS", - "steps": { - "step2.branch_1.step_1": { - "name": "step_1", - "internal_name": "step2.branch_1.step_1", - "status": "SUCCESS", - "step_type": "as-is", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 13:12:58.090461", - "end_time": "2022-01-20 13:12:58.090476", - "duration": "0:00:00.000015", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - }, - "step2.branch_1.success": { - "name": "success", - "internal_name": "step2.branch_1.success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 13:12:58.135551", - "end_time": "2022-01-20 13:12:58.135732", - "duration": "0:00:00.000181", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - } - }, - "step2.branch_2": { - "internal_name": "step2.branch_2", - "status": "SUCCESS", - "steps": { - "step2.branch_2.step_1": { - "name": "step_1", - "internal_name": "step2.branch_2.step_1", - "status": "SUCCESS", - "step_type": "as-is", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 13:12:58.187648", - "end_time": "2022-01-20 13:12:58.187661", - "duration": "0:00:00.000013", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": 
{}, - "branches": {}, - "data_catalog": [] - }, - "step2.branch_2.success": { - "name": "success", - "internal_name": "step2.branch_2.success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 13:12:58.233479", - "end_time": "2022-01-20 13:12:58.233681", - "duration": "0:00:00.000202", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - } - } - }, - "data_catalog": [] - }, - "success": { - "name": "success", - "internal_name": "success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": , - "code_identifier_message": - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 13:12:58.280538", - "end_time": "2022-01-20 13:12:58.280597", - "duration": "0:00:00.000059", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - }, - "parameters": {}, - "run_config": { - "executor": { - "type": "local", - "config": {} - }, - "run_log_store": { - "type": "file-system", - "config": {} - }, - "catalog": { - "type": "file-system", - "config": {} - }, - "secrets": { - "type": "do-nothing", - "config": {} - } - } -} - -``` -
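To sanity-check an execution like the one above programmatically, the run log can be read back as plain JSON. A small sketch, assuming the ```file-system``` run log store which writes ```.run_log_store/<run_id>.json```:

```python
import json
from pathlib import Path

run_id = "20220120131257"  # the run_id from the log above
run_log = json.loads(Path(f".run_log_store/{run_id}.json").read_text())

# Print the status of every top level step, including the parallel node.
for name, step in run_log["steps"].items():
    print(name, step["status"])

assert run_log["status"] == "SUCCESS"
```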
-
-The individual steps of the dag are named in [```dot path convention```](../concepts/run-log/#naming_step_log)
-
-You can nest a ```parallel``` node, a ```dag``` node or a ```map``` node within a parallel node to enable modular dag designs.
-
-### Enabling parallel execution
-
-Though the dag definition defines a ```parallel``` node, the execution of the dag and the parallelism is actually
-controlled by the executor. In ```local``` execution, you can enable parallel branch execution by modifying the config.
-
-```yaml
-executor:
-  type: local
-  config:
-    enable_parallel: True
-```
-
-Point to note:
-
-- Run log stores which use a single file as their log source (eg. file-system) cannot reliably run parallel executions,
-  as race conditions to modify the same file can happen, leaving the run log in an inconsistent state. The logs of the
-  execution would also warn about the same. Partitioned run log stores (eg. db) can be reliable run log stores.
-
----
-## Embedding dag within dag
-
-You can embed dags defined elsewhere into your dag.
-
-For example, we can define a dag which works all by itself in sub-dag.yaml
-
-```yaml
-# in sub-dag.yaml
-dag:
-  description: sub dag
-  start_at: step1
-  steps:
-    step1:
-      type: as-is
-      next: step2
-    step2:
-      type: as-is
-      next: success
-    success:
-      type: success
-    fail:
-      type: fail
-
-```
-
-We can embed this dag into another dag as a node like below.
-
-```yaml
-dag:
-  description: DAG for nested dag
-  start_at: step_dag_within_dag
-  steps:
-    step_dag_within_dag:
-      type: dag
-      dag_definition: sub-dag.yaml # Should be the filepath to the dag you want to embed.
-      next: success
-    success:
-      type: success
-    fail:
-      type: fail
-
-```
-
-Nested dags should allow for a very modular design where individual dags do well defined tasks but the nesting
-can stitch them together to complete the whole task.
-
-As with parallel execution, the individual steps of the dag are named in
-[```dot path convention```](../concepts/run-log/#naming_step_log)
-
----
-## Looping a branch over an iterable parameter
-
-Often, you would need to do the same repetitive tasks over a list, and magnus allows you to do that.
-
-An example of dynamic branch looping is below.
-
-```yaml
-# in map-state.yaml
-dag:
-  description: DAG for map
-  start_at: step1
-  steps:
-    step1:
-      type: task
-      command: "lambda : {'variables' : ['a', 'b', 'c']}"
-      command_type: python-lambda
-      next: step2
-    step2:
-      type: map
-      iterate_on: variables
-      iterate_as: x
-      next: success
-      branch:
-        start_at: step_1
-        steps:
-          step_1:
-            type: task
-            command: "lambda x : {'state_' + str(x) : 5}"
-            command_type: python-lambda
-            next: success
-          success:
-            type: success
-          fail:
-            type: fail
-    success:
-      type: success
-    fail:
-      type: fail
-
-```
-
-In the above dag, step1 sets the parameter ```variables``` to the list ```['a', 'b', 'c']```.
-step2 is a node of type map which will iterate on ```variables``` and execute the ```branch``` defined as part of the
-definition of step2 for every value in the iterable ```variables```.
-
-The ```branch``` definition of step2 creates one more parameter ```state_<x>=5``` for every value ```x``` of the
-iterable, by the lambda expression. You can see these parameters as part of the run log shown below.
- Click to show the run log - -``` json -{ - "run_id": "20220120150813", - "dag_hash": "c0492a644b4f28f8441d669d9f0efb0f6d6be3d3", - "use_cached": false, - "tag": null, - "original_run_id": "", - "status": "SUCCESS", - "steps": { - "step1": { - "name": "step1", - "internal_name": "step1", - "status": "SUCCESS", - "step_type": "task", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.069919", - "end_time": "2022-01-20 15:08:14.070484", - "duration": "0:00:00.000565", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - }, - "step2": { - "name": "step2", - "internal_name": "step2", - "status": "SUCCESS", - "step_type": "map", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [], - "user_defined_metrics": {}, - "branches": { - "step2.a": { - "internal_name": "step2.a", - "status": "SUCCESS", - "steps": { - "step2.a.step_1": { - "name": "step_1", - "internal_name": "step2.a.step_1", - "status": "SUCCESS", - "step_type": "task", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.162440", - "end_time": "2022-01-20 15:08:14.162882", - "duration": "0:00:00.000442", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - }, - "step2.a.success": { - "name": "success", - "internal_name": "step2.a.success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.209895", - "end_time": "2022-01-20 15:08:14.210106", - "duration": "0:00:00.000211", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - } - }, - "step2.b": { - "internal_name": "step2.b", - "status": "SUCCESS", - "steps": { - "step2.b.step_1": { - "name": "step_1", - "internal_name": "step2.b.step_1", - "status": "SUCCESS", - "step_type": "task", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 
15:08:14.258519", - "end_time": "2022-01-20 15:08:14.258982", - "duration": "0:00:00.000463", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - }, - "step2.b.success": { - "name": "success", - "internal_name": "step2.b.success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.305524", - "end_time": "2022-01-20 15:08:14.305754", - "duration": "0:00:00.000230", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - } - }, - "step2.c": { - "internal_name": "step2.c", - "status": "SUCCESS", - "steps": { - "step2.c.step_1": { - "name": "step_1", - "internal_name": "step2.c.step_1", - "status": "SUCCESS", - "step_type": "task", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.353182", - "end_time": "2022-01-20 15:08:14.353603", - "duration": "0:00:00.000421", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - }, - "step2.c.success": { - "name": "success", - "internal_name": "step2.c.success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": "INTENTIONALLY_REMOVED" - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.401043", - "end_time": "2022-01-20 15:08:14.401304", - "duration": "0:00:00.000261", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - } - } - }, - "data_catalog": [] - }, - "success": { - "name": "success", - "internal_name": "success", - "status": "SUCCESS", - "step_type": "success", - "message": "", - "mock": false, - "code_identities": [ - { - "code_identifier": "2a5b33bdf60c4f0d38cae04ab3f988b3d1c6ed59", - "code_identifier_type": "git", - "code_identifier_dependable": false, - "code_identifier_url": "INTENTIONALLY_REMOVED", - "code_identifier_message": `"INTENTIONALLY_REMOVED"` - } - ], - "attempts": [ - { - "attempt_number": 0, - "start_time": "2022-01-20 15:08:14.449759", - "end_time": "2022-01-20 15:08:14.449826", - "duration": "0:00:00.000067", - "status": "SUCCESS", - "message": "" - } - ], - "user_defined_metrics": {}, - "branches": {}, - "data_catalog": [] - } - }, - "parameters": { - "variables": [ - "a", - "b", - "c" - ], - "state_a": 5, - "state_b": 5, - "state_c": 5 - }, - "run_config": { - "executor": { - "type": "local", - "config": {} - }, - "run_log_store": { - "type": "buffered", - "config": {} - }, - "catalog": { - "type": "file-system", - "config": {} - }, - 
"secrets": { - "type": "do-nothing", - "config": {} - } - } -} -``` -
- -The individual steps of the dag are named using the [```dot path convention```](../concepts/run-log/#naming_step_log). - -### Enabling parallel execution - -Though the dag definition contains a ```map``` node whose branches can be executed in parallel, -the execution of the dag and the parallelism are actually -controlled by the executor. In ```local``` execution, you can enable parallel branch execution by modifying the config. - -```yaml -executor: - type: local - config: - enable_parallel: True -``` - -Point to note: - -- Run log stores which use a single file as their log source (eg. file-system) cannot reliably run parallel executions - as race conditions to modify the same file can happen, leaving the run log in an inconsistent state. The logs of the - execution would also warn about this. Partitioned run log stores (eg. a database) are reliable alternatives. - ---- -## Nesting and complex dags - -Magnus does not limit you at all in nesting at any level. You can construct deeply nested levels easily and magnus -would execute them as you designed. - -As a general coding practice, deeply nested branches can be hard to read and maintain. - -***NOTE***: There is a possibility that you can nest the same dag within the dag definition, resulting in an infinite -loop. We are actively finding ways to detect these situations and warn you. - ---- -## Advanced use as-is - -Node type ```as-is``` defined in magnus can be a very powerful tool in some deployment patterns. - -For example, in the below dag definition, the step ```step echo``` does nothing as part of ```local``` execution. - -```yaml -# In config.yaml -executor: - type: demo-renderer - -run_log_store: - type: file-system - -# In pipeline.yaml -dag: - description: Getting started - start_at: step parameters - steps: - step parameters: - type: task - command_type: python-lambda - command: "lambda x: {'x': int(x) + 1}" - next: step shell - step shell: - type: task - command_type: shell - command: mkdir data ; env >> data/data.txt - next: step echo - catalog: - put: - - "*" - step echo: - type: as-is - command_type: shell - config: - render_string: echo hello - next: success - success: - type: success - fail: - type: fail -``` - -But a deployment pattern, like ```demo-renderer```, can use it to inject a command into the bash script. To test it out, -uncomment the config to change the executor to ```demo-renderer``` and the run log store to ```file-system```, and -execute it as below. - -```magnus execute --file getting-started.yaml``` - -This should generate a bash script, shown below, in ```demo-bash.sh```. - -```shell -for ARGUMENT in "${@:2}" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - VALUE=$(echo $ARGUMENT | cut -f2 -d=) - export "MAGNUS_PRM_$KEY"=$VALUE -done -magnus execute_single_node $1 step%parameters --file getting-started.yaml -exit_code=$? -echo $exit_code -if [ $exit_code -ne 0 ]; -then - $(magnus execute_single_node $1 fail --file getting-started.yaml) - exit 1 -fi -magnus execute_single_node $1 step%shell --file getting-started.yaml -exit_code=$? -echo $exit_code -if [ $exit_code -ne 0 ]; -then - $(magnus execute_single_node $1 fail --file getting-started.yaml) - exit 1 -fi -echo hello -exit_code=$? -echo $exit_code -if [ $exit_code -ne 0 ]; -then - $(magnus execute_single_node $1 fail --file getting-started.yaml) - exit 1 -fi -magnus execute_single_node $1 success --file getting-started.yaml -``` - -The shell script is a translation of the dag into a series of bash commands, but notice the command ```echo hello``` as -part of the script. While the ```local``` executor interpreted that node as a stub or a mock node, the -```demo-renderer``` execution used the ```render_string``` variable of the node ```config``` to inject a script.
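To make the injection concrete, here is a small, hypothetical Python sketch of the kind of logic a renderer could apply per node; the dict-based node and the ```render_step``` helper are illustrative assumptions and not magnus API, and the real ```demo-renderer``` handles many more cases.

```python
from typing import Dict, List


def render_step(node: Dict, run_id: str = "$1") -> List[str]:
    """as-is nodes contribute their configured render_string verbatim;
    every other node becomes a magnus execution command plus an error guard."""
    if node["type"] == "as-is":
        command = node.get("config", {}).get("render_string", "")
    else:
        command = f"magnus execute_single_node {run_id} {node['name']} --file getting-started.yaml"
    return [command, "exit_code=$?", "if [ $exit_code -ne 0 ]; then exit 1; fi"]


# The "step echo" node from the dag above renders to "echo hello" plus the guard.
print("\n".join(render_step({"name": "step%echo", "type": "as-is", "config": {"render_string": "echo hello"}})))
```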
This feature is very useful when you want a few steps (for example, email notifications) to run only in -production-like environments but want to mock them during dev/experimental set up. - -***NOTE***: When trying to ```locally``` re-run a dag definition with an ```as-is``` node used to inject scripts, -the run would start from the ```as-is``` step onwards, independent of the source of failure. You can change this -behavior by writing extensions which skip over ```as-is``` nodes during re-run. - ---- -## Controlling the log level of magnus - -The default log level of magnus is WARNING but you can change it at the point of execution to one of -```['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET']``` by using the command line argument --log-level. - -For example: - -```magnus execute --file --log-level DEBUG``` - -would set the magnus log level to DEBUG. This setting only affects magnus logs and will not alter your application log -levels. - ---- -## Order of configurations - -Magnus supports many ways of providing configurations but there is an order of preference. - -Magnus defaults to the following if no config is provided. - -```yaml -executor: - type: local - config: - enable_parallel: "false" - -run_log_store: - type: buffered - -catalog: - type: file-system - config: - compute_data_folder: data/ - catalog_location: .catalog - -secrets: - type: do-nothing - -experiment_tracking: - type: do-nothing - -``` - -But you can over-ride these defaults by providing a ```magnus-config.yaml``` in the source directory. For example, -if the ```magnus-config.yaml``` file has the following contents, even if you do not provide a config in the dag -definition file, these would be taken as the default service providers. - -```yaml -executor: - type: local - config: - enable_parallel: True - -run_log_store: - type: file-system - -catalog: - type: file-system - config: - compute_data_folder: data/ # default - catalog_location: .catalog # default - -secrets: - type: dotenv - config: - location: .env # default -``` - -Finally, you can also over-ride the configurations set in the dag definition file by providing a custom configuration -file containing only the configurations. - -For example, you can provide a dag definition file as above with a ```do-nothing``` secrets handler but, by providing -the below configuration file at run time, over-ride it to ```dotenv```. - -```yaml -#in prod-configuration.yaml -secrets: - type: dotenv - -``` - -The command to execute while providing the configuration file: - -```magnus execute --file --config-file prod-configuration.yaml``` - - - -The design thought is to enable switching between different configurations by different actors involved in the data science -workflow. The engineering team could provide a ```magnus-config.yaml``` that serves as the default for the team or project -during the dev/experimental phase of the work and over-ride the configuration during production deployment. diff --git a/docs/extensions.md b/docs/extensions.md index 6821aeab..0b932ea0 100644 --- a/docs/extensions.md +++ b/docs/extensions.md @@ -2,10 +2,10 @@ Magnus is built around the idea to decouple the pipeline definition and pipeline execution. 
-[All the concepts](../concepts/the-big-picture/) are defined with this principle and therefore +[All the concepts](/concepts/the-big-picture/) are defined with this principle and therefore are extendible as long as the API is satisfied. -We internally use [stevedore](https://pypi.org/project/stevedore/) to manage extensions. +We internally use [stevedore](https://pypi.org/project/stevedore/) to manage extensions. Our [pyproject.toml](https://github.com/AstraZeneca/magnus-core/blob/main/pyproject.toml) has plugin space for all the concepts. @@ -80,9 +80,11 @@ are extended from pydantic BaseModel. ## Executor -Examples: [local](../../configurations/executors/local), -[local-container](../../configurations/executors/local-container), -[argo](../../configurations/executors/argo) +Register to namespace: [tool.poetry.plugins."executor"] + +Examples: [local](/configurations/executors/local), +[local-container](/configurations/executors/local-container), +[argo](/configurations/executors/argo) ::: magnus.executor.BaseExecutor options: @@ -95,9 +97,11 @@ Examples: [local](../../configurations/executors/local), ## Run Log -Examples: [buffered](../../configurations/run-log/#buffered), -[file-system](../../configurations/run-log/#file-system), - [chunked-fs](../../configurations/run-log/#chunked-fs) +Register to namespace: [tool.poetry.plugins."run_log_store"] + +Examples: [buffered](/configurations/run-log/#buffered), +[file-system](/configurations/run-log/#file-system), + [chunked-fs](/configurations/run-log/#chunked-fs) ::: magnus.datastore.BaseRunLogStore options: @@ -113,9 +117,11 @@ The ```RunLog``` is a nested pydantic model and is located in ```magnus.datastor ## Catalog +Register to namespace: [tool.poetry.plugins."catalog"] + Example: -[do-nothing](../../configurations/catalog/#do-nothing), - [file-system](../../configurations/catalog/#file-system) +[do-nothing](/configurations/catalog/#do-nothing), + [file-system](/configurations/catalog/#file-system) ::: magnus.catalog.BaseCatalog options: @@ -128,10 +134,12 @@ Example: ## Secrets +Register to namespace: [tool.poetry.plugins."secrets"] + Example: -[do-nothing](../../configurations/secrets/#do-nothing), - [env-secrets-manager](../../configurations/secrets/#environment_secret_manager), - [dotenv](../../configurations/secrets/#dotenv) +[do-nothing](/configurations/secrets/#do-nothing), + [env-secrets-manager](/configurations/secrets/#environment_secret_manager), + [dotenv](/configurations/secrets/#dotenv) ::: magnus.secrets.BaseSecrets options: @@ -144,8 +152,10 @@ Example: ## Experiment tracking +Register to namespace: [tool.poetry.plugins."experiment_tracker"] + Example: -[do-nothing](../../configurations/experiment-tracking), ```mlflow``` +[do-nothing](/configurations/experiment-tracking), ```mlflow``` ::: magnus.experiment_tracker.BaseExperimentTracker options: @@ -157,12 +167,13 @@ Example: ## Nodes +Register to namespace: [tool.poetry.plugins."nodes"] Example: -[task](../../concepts/task), -[stub](../../concepts/stub), -[parallel](../../concepts/parallel), -[map](../../concepts/map) +[task](/concepts/task), +[stub](/concepts/stub), +[parallel](/concepts/parallel), +[map](/concepts/map) ::: magnus.nodes.BaseNode options: @@ -176,10 +187,12 @@ Example: ## Tasks +Register to namespace: [tool.poetry.plugins."tasks"] + Example: -[python](../../concepts/task/#python_functions), -[shell](../../concepts/task/#shell), +[python](/concepts/task/#python_functions), +[shell](/concepts/task/#shell), 
+[notebook](/concepts/task/#notebook) ::: magnus.tasks.BaseTaskType options: @@ -191,3 +204,9 @@ Example: ## Roadmap + +- AWS environments using Sagemaker pipelines or AWS step functions. +- HPC environment using SLURM executor. +- Database based Run log store. +- Better integrations with experiment tracking tools. +- Azure ML environments. diff --git a/docs/how-do-i.md b/docs/how-do-i.md deleted file mode 100644 index 7c32042a..00000000 --- a/docs/how-do-i.md +++ /dev/null @@ -1,41 +0,0 @@ -# How do I - -## Pass parameters between steps? - ---8<-- -docs/concepts/nodes.md:how-do-i-pass-simple ---8<-- - -## Pass data files between steps? - -In magnus, data files are passed to downstream steps using the concept of [catalog](../concepts/catalog). The catalog -settings and behavior can be completely controlled by the pipeline definition but can also be controlled via code if -it's convenient. - ---8<-- -docs/concepts/catalog.md:how-do-i-pass-data ---8<-- - - -## Pass data objects between steps? - -In magnus, data are passed to downstream steps using the concept of [catalog](../concepts/catalog). While this is -good for files, it is inconvenient to dump and load the object into files for the cataloging to happen. Magnus provides -utility functions to make it easier. - ---8<-- -docs/concepts/catalog.md:how-do-i-pass-objects ---8<-- - -## Define variables? - ---8<-- -docs/concepts/dag.md:how-do-i-parameterize ---8<-- - - -## Track experiments? - ---8<-- -docs/concepts/experiment-tracking.md:how-do-i-track ---8<-- diff --git a/docs/index.md b/docs/index.md index 2f80ac93..90b7df8f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,32 +13,32 @@ sidebarDepth: 0 Magnus is a simplified workflow definition language that helps in: - **Streamlined Design Process:** Magnus enables users to efficiently plan their pipelines with -[stubbed nodes](../concepts/stub), along with offering support for various structures such as -[tasks](../concepts/task), [parallel branches](../concepts/parallel), and [loops or map branches](../concepts/map) -in both [yaml](../concepts/pipeline) or a [python SDK](../sdk) for maximum flexibility. +[stubbed nodes](concepts/stub), along with offering support for various structures such as +[tasks](concepts/task), [parallel branches](concepts/parallel), and [loops or map branches](concepts/map) +in both [yaml](concepts/pipeline) and the [python SDK](sdk) for maximum flexibility. - **Incremental Development:** Build your pipeline piece by piece with Magnus, which allows for the -implementation of tasks as [python functions](../concepts/task/#python_functions), -[notebooks](../concepts/task/#notebooks), or [shell scripts](../concepts/task/#shell), +implementation of tasks as [python functions](concepts/task/#python_functions), +[notebooks](concepts/task/#notebooks), or [shell scripts](concepts/task/#shell), adapting to the developer's preferred tools and methods. - **Robust Testing:** Ensure your pipeline performs as expected with the ability to test using sampled data. Magnus -also provides the capability to [mock and patch tasks](../configurations/executors/mocked) +also provides the capability to [mock and patch tasks](configurations/executors/mocked) for thorough evaluation before full-scale deployment. - **Seamless Deployment:** Transition from the development stage to production with ease. 
-Magnus simplifies the process by requiring [only configuration changes](../configurations/overview) -to adapt to different environments, including support for [argo workflows](../configurations/executors/argo). +Magnus simplifies the process by requiring [only configuration changes](configurations/overview) +to adapt to different environments, including support for [argo workflows](configurations/executors/argo). - **Efficient Debugging:** Quickly identify and resolve issues in pipeline execution with Magnus's local -debugging features. Retrieve data from failed tasks and [retry failures](../concepts/run-log/#retrying_failures) +debugging features. Retrieve data from failed tasks and [retry failures](concepts/run-log/#retrying_failures) using your chosen debugging tools to maintain a smooth development experience. Along with the developer friendly features, magnus also acts as an interface to production grade concepts -such as [data catalog](../concepts/catalog), [reproducibility](../concepts/run-log), -[experiment tracking](../concepts/experiment-tracking) -and secure [access to secrets](../concepts/secrets). +such as [data catalog](concepts/catalog), [reproducibility](concepts/run-log), +[experiment tracking](concepts/experiment-tracking) +and secure [access to secrets](concepts/secrets). ## Motivation diff --git a/magnus/extensions/catalog/file_system/integration.py b/docs/roadmap.md similarity index 100% rename from magnus/extensions/catalog/file_system/integration.py rename to docs/roadmap.md diff --git a/docs/usage.md b/docs/usage.md index 375ee2a5..44b553e1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -40,14 +40,14 @@ pip install "magnus[mlflow]" ## Usage -Pipelines defined in **magnus** can be either via [python sdk](../../sdk) or ```yaml``` based definitions. +Pipelines in **magnus** can be defined either via the [python sdk](/sdk) or via ```yaml``` based definitions. To execute a pipeline, defined in ```yaml```, use the **magnus** cli. 
The options are detailed below: - ```-f, --file``` (str): The pipeline definition file, defaults to pipeline.yaml -- ```-c, --config-file``` (str): [config file](../../configurations/overview) to be used for the run [default: None] -- ```-p, --parameters-file``` (str): [Parameters](../../concepts/parameters) accessible by the application [default: None] +- ```-c, --config-file``` (str): [config file](/configurations/overview) to be used for the run [default: None] +- ```-p, --parameters-file``` (str): [Parameters](/concepts/parameters) accessible by the application [default: None] - ```--log-level``` : The log level, one of ```INFO | DEBUG | WARNING| ERROR| FATAL``` [default: INFO] - ```--tag``` (str): A tag attached to the run[default: ] - ```--run-id``` (str): An optional run_id, one would be generated if not provided diff --git a/examples/concepts/catalog_object.py b/examples/concepts/catalog_object.py index b605cf94..60b96001 100644 --- a/examples/concepts/catalog_object.py +++ b/examples/concepts/catalog_object.py @@ -37,6 +37,8 @@ def retrieve_object(): data_model = get_object("everything_model") + assert data_model == EverythingModel(spam="Hello", eggs=EggsModel(ham="Yes, please!!")) + print(data_model) ">>>spam='Hello' eggs=EggsModel(ham='Yes, please!!')" diff --git a/examples/concepts/nesting.py b/examples/concepts/nesting.py index 9065bed3..8e2b0fea 100644 --- a/examples/concepts/nesting.py +++ b/examples/concepts/nesting.py @@ -31,7 +31,7 @@ def main(): inner_most_map = Map( name="inner most", branch=stubbed_pipeline, - iterate_on="array", + iterate_on="array", # Parameter defined in line #20 iterate_as="y", terminate_with_success=True, ) diff --git a/examples/concepts/notebook_api_parameters_out.ipynb b/examples/concepts/notebook_api_parameters_out.ipynb index 38c9c2c7..0ec96dbf 100644 --- a/examples/concepts/notebook_api_parameters_out.ipynb +++ b/examples/concepts/notebook_api_parameters_out.ipynb @@ -3,11 +3,11 @@ { "cell_type": "code", "execution_count": 1, - "id": "450e93e2", + "id": "349718a9", "metadata": { "ploomber": { - "timestamp_end": 1707694274.506838, - "timestamp_start": 1707694274.506497 + "timestamp_end": 1707888032.786035, + "timestamp_start": 1707888032.785661 }, "tags": [ "injected-parameters" @@ -26,8 +26,8 @@ "id": "4377a9c8", "metadata": { "ploomber": { - "timestamp_end": 1707694274.507047, - "timestamp_start": 1707694274.506862 + "timestamp_end": 1707888032.786291, + "timestamp_start": 1707888032.786067 } }, "outputs": [], @@ -41,8 +41,8 @@ "id": "3e98e89e-765c-42d4-81ea-c371c2eab14d", "metadata": { "ploomber": { - "timestamp_end": 1707694274.508298, - "timestamp_start": 1707694274.507062 + "timestamp_end": 1707888032.787494, + "timestamp_start": 1707888032.78631 } }, "outputs": [], @@ -64,8 +64,8 @@ "id": "0e04f11a", "metadata": { "ploomber": { - "timestamp_end": 1707694274.508544, - "timestamp_start": 1707694274.508314 + "timestamp_end": 1707888032.787735, + "timestamp_start": 1707888032.787511 } }, "outputs": [], @@ -80,8 +80,8 @@ "id": "9f1cbac6-cada-42b0-8fb1-ddb25a88836c", "metadata": { "ploomber": { - "timestamp_end": 1707694274.509087, - "timestamp_start": 1707694274.508558 + "timestamp_end": 1707888032.788255, + "timestamp_start": 1707888032.787749 } }, "outputs": [ diff --git a/examples/concepts/notebook_env_parameters_out.ipynb b/examples/concepts/notebook_env_parameters_out.ipynb index cae86f63..4b90285f 100644 --- a/examples/concepts/notebook_env_parameters_out.ipynb +++ b/examples/concepts/notebook_env_parameters_out.ipynb @@ -3,11 +3,11 @@ 
{ "cell_type": "code", "execution_count": 1, - "id": "f208a69d", + "id": "72c0fad9", "metadata": { "ploomber": { - "timestamp_end": 1707694274.798098, - "timestamp_start": 1707694274.797831 + "timestamp_end": 1707888033.079694, + "timestamp_start": 1707888033.079387 }, "tags": [ "injected-parameters" @@ -26,8 +26,8 @@ "id": "4377a9c8", "metadata": { "ploomber": { - "timestamp_end": 1707694274.798291, - "timestamp_start": 1707694274.79812 + "timestamp_end": 1707888033.079896, + "timestamp_start": 1707888033.079714 } }, "outputs": [], @@ -43,8 +43,8 @@ "id": "3e98e89e-765c-42d4-81ea-c371c2eab14d", "metadata": { "ploomber": { - "timestamp_end": 1707694274.799375, - "timestamp_start": 1707694274.798304 + "timestamp_end": 1707888033.081091, + "timestamp_start": 1707888033.07991 } }, "outputs": [], @@ -64,8 +64,8 @@ "id": "0e04f11a", "metadata": { "ploomber": { - "timestamp_end": 1707694274.799643, - "timestamp_start": 1707694274.79939 + "timestamp_end": 1707888033.081326, + "timestamp_start": 1707888033.081107 } }, "outputs": [], @@ -80,8 +80,8 @@ "id": "9f1cbac6-cada-42b0-8fb1-ddb25a88836c", "metadata": { "ploomber": { - "timestamp_end": 1707694274.800084, - "timestamp_start": 1707694274.799656 + "timestamp_end": 1707888033.082294, + "timestamp_start": 1707888033.081339 } }, "outputs": [ diff --git a/examples/concepts/notebook_native_parameters_out.ipynb b/examples/concepts/notebook_native_parameters_out.ipynb index f3225810..b3626246 100644 --- a/examples/concepts/notebook_native_parameters_out.ipynb +++ b/examples/concepts/notebook_native_parameters_out.ipynb @@ -6,8 +6,8 @@ "id": "3e98e89e-765c-42d4-81ea-c371c2eab14d", "metadata": { "ploomber": { - "timestamp_end": 1707694274.981598, - "timestamp_start": 1707694274.980265 + "timestamp_end": 1707888033.273499, + "timestamp_start": 1707888033.27216 } }, "outputs": [], @@ -29,8 +29,8 @@ "id": "e7f0aab2", "metadata": { "ploomber": { - "timestamp_end": 1707694274.981757, - "timestamp_start": 1707694274.981621 + "timestamp_end": 1707888033.273656, + "timestamp_start": 1707888033.273521 }, "tags": [ "parameters" @@ -46,11 +46,11 @@ { "cell_type": "code", "execution_count": 3, - "id": "07436d18", + "id": "894a6034", "metadata": { "ploomber": { - "timestamp_end": 1707694274.981902, - "timestamp_start": 1707694274.981771 + "timestamp_end": 1707888033.273794, + "timestamp_start": 1707888033.273669 }, "tags": [ "injected-parameters" @@ -69,8 +69,8 @@ "id": "0e04f11a", "metadata": { "ploomber": { - "timestamp_end": 1707694274.982051, - "timestamp_start": 1707694274.981915 + "timestamp_end": 1707888033.273938, + "timestamp_start": 1707888033.273806 } }, "outputs": [], @@ -84,8 +84,8 @@ "id": "9f1cbac6-cada-42b0-8fb1-ddb25a88836c", "metadata": { "ploomber": { - "timestamp_end": 1707694274.982442, - "timestamp_start": 1707694274.982064 + "timestamp_end": 1707888033.274332, + "timestamp_start": 1707888033.273949 } }, "outputs": [ @@ -124,8 +124,8 @@ "id": "a58a4492", "metadata": { "ploomber": { - "timestamp_end": 1707694274.982702, - "timestamp_start": 1707694274.982571 + "timestamp_end": 1707888033.274591, + "timestamp_start": 1707888033.27446 }, "tags": [ "magnus_output" diff --git a/examples/concepts/simple_notebook_out.ipynb b/examples/concepts/simple_notebook_out.ipynb index 60c11626..371d5071 100644 --- a/examples/concepts/simple_notebook_out.ipynb +++ b/examples/concepts/simple_notebook_out.ipynb @@ -3,11 +3,11 @@ { "cell_type": "code", "execution_count": 1, - "id": "905e14a3", + "id": "02d63323", "metadata": { "ploomber": { - "timestamp_end": 
1707694275.830048, - "timestamp_start": 1707694275.829869 + "timestamp_end": 1707888034.186718, + "timestamp_start": 1707888034.186525 }, "tags": [ "injected-parameters" @@ -24,8 +24,8 @@ "id": "3e98e89e-765c-42d4-81ea-c371c2eab14d", "metadata": { "ploomber": { - "timestamp_end": 1707694275.830239, - "timestamp_start": 1707694275.830066 + "timestamp_end": 1707888034.186921, + "timestamp_start": 1707888034.18674 } }, "outputs": [], @@ -40,8 +40,8 @@ "id": "9f1cbac6-cada-42b0-8fb1-ddb25a88836c", "metadata": { "ploomber": { - "timestamp_end": 1707694275.830373, - "timestamp_start": 1707694275.830252 + "timestamp_end": 1707888034.187058, + "timestamp_start": 1707888034.186934 } }, "outputs": [], @@ -56,8 +56,8 @@ "id": "9dcadc93-aa77-4a0a-9465-2e33eef4da44", "metadata": { "ploomber": { - "timestamp_end": 1707694275.830506, - "timestamp_start": 1707694275.830386 + "timestamp_end": 1707888034.187196, + "timestamp_start": 1707888034.18707 } }, "outputs": [], @@ -71,8 +71,8 @@ "id": "7b872cdf-820b-47b5-8f22-15c4b69c8637", "metadata": { "ploomber": { - "timestamp_end": 1707694275.83063, - "timestamp_start": 1707694275.830519 + "timestamp_end": 1707888034.187316, + "timestamp_start": 1707888034.187208 } }, "outputs": [], diff --git a/examples/python-tasks-argo.py b/examples/python-tasks-argo.py index 39796a76..dcb48122 100644 --- a/examples/python-tasks-argo.py +++ b/examples/python-tasks-argo.py @@ -8,7 +8,7 @@ You can run this pipeline by: python examples/python-tasks.py """ -# TODO: This need not exist and can be replaced by configuration file by environment variable. + from magnus import Pipeline, Task diff --git a/magnus/__init__.py b/magnus/__init__.py index 0b2148cd..98c1aee8 100644 --- a/magnus/__init__.py +++ b/magnus/__init__.py @@ -31,6 +31,4 @@ # TODO: Implement Sagemaker pipelines as a executor. -# TODO: Bring this in the mix: https://github.com/mkdocs/mkdocs-click - # TODO: Think of way of generating dag hash without executor configuration diff --git a/magnus/cli.py b/magnus/cli.py index da9a81a7..be04f932 100644 --- a/magnus/cli.py +++ b/magnus/cli.py @@ -72,53 +72,6 @@ def execute(file, config_file, parameters_file, log_level, tag, run_id, use_cach ) -@cli.command("execute_step", short_help="Execute a single step of the pipeline") -@click.argument("step_name") -@click.option("-f", "--file", default="pipeline.yaml", help="The pipeline definition file", show_default=True) -@click.option( - "-c", "--config-file", default=None, help="config file, in yaml, to be used for the run", show_default=True -) -@click.option( - "-p", - "--parameters-file", - default=None, - help="Parameters, in yaml, accessible by the application", - show_default=True, -) -@click.option( - "--log-level", - default=defaults.LOG_LEVEL, - help="The log level", - show_default=True, - type=click.Choice(["INFO", "DEBUG", "WARNING", "ERROR", "FATAL"]), -) -@click.option("--tag", help="A tag attached to the run") -@click.option("--run-id", help="An optional run_id, one would be generated if not provided") -@click.option("--use-cached", help="Provide the previous run_id to re-run.", show_default=True) -def execute_step(step_name, file, config_file, parameters_file, log_level, tag, run_id, use_cached): # pragma: no cover - """ - External entry point to executing a single step of the pipeline. - - This command is helpful to run only one step of the pipeline in isolation. - Only the steps of the parent dag could be invoked using this method. 
- - You can re-run an older run by providing the run_id of the older run in --use-cached. - Ensure that the catalogs and run logs are accessible by the present configuration. - - When running map states, ensure that the parameter to iterate on is available in parameter space. - """ - logger.setLevel(log_level) - entrypoints.execute_single_step( - configuration_file=config_file, - pipeline_file=file, - step_name=step_name, - tag=tag, - run_id=run_id, - parameters_file=parameters_file, - use_cached=use_cached, - ) - - @cli.command("execute_single_node", short_help="Internal entry point to execute a single node", hidden=True) @click.argument("run_id") @click.argument("step_name") @@ -269,86 +222,6 @@ def execute_function( ) -@cli.command("execute_container", short_help="Entry point to execute a container") -@click.argument("image") -@click.option("--entrypoint", default=defaults.ENTRYPOINT.USER.value, hidden=True) -@click.option("--command", default="", help="The command to execute. Defaults to CMD of image") -@click.option( - "-c", "--config-file", default=None, help="config file, in yaml, to be used for the run", show_default=True -) -@click.option( - "-p", - "--parameters-file", - default=None, - help="Parameters, in yaml, accessible by the application", - show_default=True, -) -@click.option( - "--log-level", - default=defaults.LOG_LEVEL, - help="The log level", - show_default=True, - type=click.Choice(["INFO", "DEBUG", "WARNING", "ERROR", "FATAL"]), -) -@click.option( - "--context-path", - default=defaults.DEFAULT_CONTAINER_CONTEXT_PATH, - help="The context path for data and parameter files", -) -@click.option( - "--data-folder", - default=defaults.DEFAULT_CONTAINER_DATA_PATH, - help="The catalog data folder relative to context", -) -@click.option( - "--output-parameters-file", default=defaults.DEFAULT_CONTAINER_OUTPUT_PARAMETERS, help="The output parameters file" -) -@click.option("--experiment-tracking-file", default="", help="The output experiment tracking file") -@click.option("--put-in-catalog", "-put", default=None, multiple=True, help="The data to put from the catalog") -@click.option("--expose-secret", default=None, multiple=True, help="The secret to expose to the container") -@click.option("--tag", help="A tag attached to the run") -@click.option("--run-id", help="An optional run_id, one would be generated if not provided") -def execute_container( - image, - entrypoint, - command, - config_file, - parameters_file, - log_level, - context_path, - data_folder, - output_parameters_file, - experiment_tracking_file, - put_in_catalog, - expose_secret, - tag, - run_id, -): - """ - External entry point to execute a container in isolation. - - The container would be executed in the environment defined by the config file or default if none. - The execution plan is unchained. 
- """ - logger.setLevel(log_level) - catalog_config = {"compute_data_folder": data_folder, "put": list(put_in_catalog) if put_in_catalog else None} - expose_secrets = list(expose_secret) if expose_secret else [] - entrypoints.execute_container( - image=image, - entrypoint=entrypoint, - command=command, - configuration_file=config_file, - parameters_file=parameters_file, - context_path=context_path, - catalog_config=catalog_config, - output_parameters_file=output_parameters_file, - experiment_tracking_file=experiment_tracking_file, - expose_secrets=expose_secrets, - tag=tag, - run_id=run_id, - ) - - @cli.command("fan", short_help="Internal entry point to fan in or out a composite node", hidden=True) @click.argument("run_id") @click.argument("step_name") @@ -394,38 +267,6 @@ def fan(run_id, step_name, mode, map_variable, file, config_file, parameters_fil ) -@cli.command("wrap_around_container", short_help="Internal entry point to sync data/parameters in and out", hidden=True) -@click.argument("run_id") -@click.argument("step_identifier") -@click.option("--map-variable", default="", help="The map variable dictionary in str", show_default=True) -@click.option( - "-m", "--mode", help="pre or post execution of the container", required=True, type=click.Choice(["pre", "post"]) -) -def wrap_around_container(run_id: str, step_identifier: str, map_variable: str, mode: str): - """ - Internal entrypoint for magnus to sync data/parameters in and out. - - Only 3rd party orchestrators using containers as command types should use this entry point. - - mode: - pre would be called prior the execution of the container. - - Create the step log - - It should read the step config from environmental variables and resolve it with the executor config. - - sync catalog/parameters and send it in. - post would be called after the execution of the container. - - Update the step log - - Sync back the catalog/parameters and send it to central storage. - - - Args: - run_id (str): The run_id to identify parameters/run log/catalog information - step_identifier (str): A unique identifier to retrieve the step configuration - mode (str): Pre or post processing of the container execution - """ - # TODO: Needs to be added but not prioritizing. 
- # Will be added after we merge magnus and magnus extensions - - # Needed for the binary creation if __name__ == "__main__": cli() diff --git a/magnus/entrypoints.py b/magnus/entrypoints.py index 00926ac9..89791f26 100644 --- a/magnus/entrypoints.py +++ b/magnus/entrypoints.py @@ -1,11 +1,11 @@ import json import logging -from typing import List, Optional, cast +from typing import Optional, cast from rich import print import magnus.context as context -from magnus import defaults, exceptions, graph, utils +from magnus import defaults, graph, utils from magnus.defaults import MagnusConfig, ServiceConfig logger = logging.getLogger(defaults.LOGGER_NAME) @@ -115,7 +115,6 @@ def prepare_configurations( if pipeline_file: # There are use cases where we are only preparing the executor pipeline_config = utils.load_yaml(pipeline_file) - # pipeline_config = utils.apply_variables(pipeline_config, variables=variables) logger.info("The input pipeline:") logger.info(json.dumps(pipeline_config, indent=4)) @@ -123,7 +122,6 @@ def prepare_configurations( # Create the graph dag_config = pipeline_config["dag"] dag_hash = utils.get_dag_hash(dag_config) - # TODO: Dag nodes should not self refer themselves dag = graph.create_graph(dag_config) run_context.pipeline_file = pipeline_file @@ -189,61 +187,6 @@ def execute( executor.send_return_code() -def execute_single_step( - configuration_file: str, - pipeline_file: str, - step_name: str, - run_id: str, - tag: str = "", - parameters_file: str = "", - use_cached: str = "", -): - """ - TODO: Remove this!! - The entry point into executing a single step of magnus. - - It should have similar set up of configurations to execute because orchestrator modes can initiate the execution. - - Args: - variables_file (str): The variables file, if used or None - step_name : The name of the step to execute in dot path convention - pipeline_file (str): The config/dag file - run_id (str): The run id of the run. - tag (str): If a tag is provided at the run time - parameters_file (str): The parameters being sent in to the application - - """ - run_id = utils.generate_run_id(run_id=run_id) - - run_context = prepare_configurations( - configuration_file=configuration_file, - pipeline_file=pipeline_file, - run_id=run_id, - tag=tag, - use_cached="", - parameters_file=parameters_file, - ) - print("Working with context:") - print(run_context) - - executor = run_context.executor - run_context.execution_plan = defaults.EXECUTION_PLAN.CHAINED.value - utils.set_magnus_environment_variables(run_id=run_id, configuration_file=configuration_file, tag=tag) - try: - _ = run_context.dag.get_node_by_name(step_name) # type: ignore - except exceptions.NodeNotFoundError as e: - msg = f"The node by name {step_name} is not found in the graph. 
Please provide a valid node name" - raise Exception(msg) from e - - executor._single_step = step_name - executor.prepare_for_graph_execution() - - logger.info("Executing the graph") - executor.execute_graph(dag=run_context.dag) # type: ignore - - executor.send_return_code() - - def execute_single_node( configuration_file: str, pipeline_file: str, @@ -477,68 +420,6 @@ def execute_function( executor.send_return_code() -def execute_container( - image: str, - entrypoint: str, - command: str = "", - configuration_file="", - parameters_file: str = "", - context_path: str = defaults.DEFAULT_CONTAINER_CONTEXT_PATH, - catalog_config: Optional[dict] = None, - output_parameters_file: str = defaults.DEFAULT_CONTAINER_OUTPUT_PARAMETERS, - experiment_tracking_file: str = "", - expose_secrets: Optional[List[str]] = None, - tag: str = "", - run_id: str = "", -): - """ - The entry point to magnus execution of a container. - This method, as designed, should only be used by interactive computes. - """ - run_id = utils.generate_run_id(run_id=run_id) - - run_context = prepare_configurations( - configuration_file=configuration_file, - run_id=run_id, - tag=tag, - parameters_file=parameters_file, - ) - executor = run_context.executor - - run_context.execution_plan = defaults.EXECUTION_PLAN.UNCHAINED.value - utils.set_magnus_environment_variables(run_id=run_id, configuration_file=configuration_file, tag=tag) - - print("Working with context:") - print(run_context) - - # Prepare the graph with a single node - step_config = { - "image": image, - "context_path": context_path, - "command": command, - "data_folder": defaults.DEFAULT_CONTAINER_DATA_PATH, - "output_parameters_file": output_parameters_file, - "secrets": expose_secrets, - "experiment_tracking_file": experiment_tracking_file, - "command_type": "container", - "type": "task", - "next": "success", - "catalog": catalog_config, - } - node = graph.create_node(name="executing job", step_config=step_config) - - if entrypoint == defaults.ENTRYPOINT.USER.value: - # Prepare for graph execution - executor.prepare_for_graph_execution() - - logger.info("Executing the job from the user. We are still in the caller's compute environment") - executor.execute_job(node=node) - else: - raise ValueError(f"Invalid entrypoint {entrypoint}") - - executor.send_return_code() - - def fan( configuration_file: str, pipeline_file: str, @@ -598,26 +479,6 @@ def fan( raise ValueError(f"Invalid mode {mode}") -def wrap_around_container(run_id: str, step_identifier: str, map_variable: str, mode: str): - """ - This function provides a pre and post processing steps for magnus to execute a container in non-interactive mode. - - Expectations: - It is expected that the config is available as a JSON string in the environment. - It is also expected that step_identifiers (key: step_identifier, value: step_config) is available as - a JSON string in the environment. - - We prepare configurations with the config variable from the environment, the dag is empty. - - - Args: - run_id (str): _description_ - step_identifier (str): _description_ - map_variable (str): _description_ - mode (str): _description_ - """ - - if __name__ == "__main__": # This is only for perf testing purposes. 
prepare_configurations(run_id="abc", pipeline_file="example/mocking.yaml") diff --git a/magnus/executor.py b/magnus/executor.py index e1f4644e..56a77241 100644 --- a/magnus/executor.py +++ b/magnus/executor.py @@ -48,20 +48,6 @@ class BaseExecutor(ABC, BaseModel): def _context(self): return context.run_context - @property - def step_decorator_run_id(self): - """ - TODO: Experimental feature, design is not mature yet. - - This function is used by the decorator function. - The design idea is we can over-ride this method in different implementations to retrieve the run_id. - But is it really intrusive to ask to set the environmental variable MAGNUS_RUN_ID? - - Returns: - _type_: _description_ - """ - return os.environ.get("MAGNUS_RUN_ID", None) - def _is_parallel_execution(self) -> bool: """ Controls the parallelization of branches in map and parallel state. diff --git a/magnus/experiment_tracker.py b/magnus/experiment_tracker.py index 6518bd00..c4d39436 100644 --- a/magnus/experiment_tracker.py +++ b/magnus/experiment_tracker.py @@ -29,7 +29,7 @@ def retrieve_step_details(key: str) -> Tuple[str, int]: def get_tracked_data() -> Dict[str, Any]: - tracked_data = defaultdict(dict) + tracked_data: Dict[str, Any] = defaultdict(dict) for env_var, value in os.environ.items(): if env_var.startswith(defaults.TRACK_PREFIX): key, step = retrieve_step_details(env_var) diff --git a/magnus/extensions/executor/argo/implementation.py b/magnus/extensions/executor/argo/implementation.py index 0210c7b4..2065381e 100644 --- a/magnus/extensions/executor/argo/implementation.py +++ b/magnus/extensions/executor/argo/implementation.py @@ -17,6 +17,7 @@ from magnus.extensions.executor import GenericExecutor from magnus.extensions.nodes import DagNode, MapNode, ParallelNode from magnus.graph import Graph, create_node, search_node_by_internal_name +from magnus.integration import BaseIntegration from magnus.nodes import BaseNode logger = logging.getLogger(defaults.NAME) @@ -211,7 +212,7 @@ class TemplateDefaults(BaseModel): description="Max run time of a step", ) - @computed_field + @computed_field # type: ignore @property def timeout(self) -> str: return f"{self.max_step_duration + 60*60}s" @@ -591,20 +592,6 @@ def validate_parallelism(cls, parallelism: Optional[int]) -> Optional[int]: raise ValueError("Parallelism must be a positive integer greater than 0") return parallelism - # @computed_field - # @property - # def podSpecPatch(self) -> str: - # return json.dumps( - # { - # "containers": [ - # { - # "name": "main", - # "resources": self.resources.model_dump_json(exclude_none=True), - # } - # ] - # } - # ) - @computed_field # type: ignore @property def volumes(self) -> List[Volume]: @@ -736,7 +723,7 @@ def validate_parallelism(cls, parallelism: Optional[int]) -> Optional[int]: raise ValueError("Parallelism must be a positive integer greater than 0") return parallelism - @computed_field + @computed_field # type: ignore @property def step_timeout(self) -> int: """ @@ -1142,3 +1129,54 @@ def send_return_code(self, stage="traversal"): run_log = self._context.run_log_store.get_run_log_by_id(run_id=run_id, full=False) if run_log.status == defaults.FAIL: raise exceptions.ExecutionFailedError(run_id) + + +class FileSystemRunLogStore(BaseIntegration): + """ + Only local execution mode is possible for Buffered Run Log store + """ + + executor_type = "argo" + service_type = "run_log_store" # One of secret, catalog, datastore + service_provider = "file-system" # The actual implementation of the service + + def validate(self, 
**kwargs): + msg = ( + "Argo cannot work with the file-system run log store " + "unless you have made a mechanism to use volume mounts. " + "Using this run log store if the pipeline has concurrent tasks might lead to unexpected results." + ) + logger.warning(msg) + + +class ChunkedFileSystemRunLogStore(BaseIntegration): + """ + Integration between the argo executor and the chunked file-system run log store. + """ + + executor_type = "argo" + service_type = "run_log_store" # One of secret, catalog, datastore + service_provider = "chunked-fs" # The actual implementation of the service + + def validate(self, **kwargs): + msg = ( + "Argo cannot work with the chunked file-system run log store " + "unless you have made a mechanism to use volume mounts." + ) + logger.warning(msg) + + +class FileSystemCatalog(BaseIntegration): + """ + Integration between the argo executor and the file-system catalog. + """ + + executor_type = "argo" + service_type = "catalog" # One of secret, catalog, datastore + service_provider = "file-system" # The actual implementation of the service + + def validate(self, **kwargs): + msg = ( + "Argo cannot work with the file-system catalog unless you have made a mechanism to use volume mounts" + ) + logger.warning(msg) diff --git a/magnus/extensions/executor/argo/integration.py b/magnus/extensions/executor/argo/integration.py deleted file mode 100644 index d8176566..00000000 --- a/magnus/extensions/executor/argo/integration.py +++ /dev/null @@ -1,55 +0,0 @@ -import logging - -from magnus import defaults -from magnus.integration import BaseIntegration - -logger = logging.getLogger(defaults.NAME) - - -class FileSystemRunLogStore(BaseIntegration): - """ - Only local execution mode is possible for Buffered Run Log store - """ - - executor_type = "argo" - service_type = "run_log_store" # One of secret, catalog, datastore - service_provider = "file-system" # The actual implementation of the service - - def validate(self, **kwargs): - msg = ( - "Argo cannot run work with file-system run log store. Unless you have made a mechanism to use volume mounts" - ) - logger.warning(msg) - - -class ChunkedFileSystemRunLogStore(BaseIntegration): - """ - Only local execution mode is possible for Buffered Run Log store - """ - - executor_type = "argo" - service_type = "run_log_store" # One of secret, catalog, datastore - service_provider = "chunked-fs" # The actual implementation of the service - - def validate(self, **kwargs): - msg = ( - "Argo cannot run work with chunked file-system run log store. " - "Unless you have made a mechanism to use volume mounts" - ) - logger.warning(msg) - - -class FileSystemCatalog(BaseIntegration): - """ - Only local execution mode is possible for Buffered Run Log store - """ - - executor_type = "argo" - service_type = "catalog" # One of secret, catalog, datastore - service_provider = "file-system" # The actual implementation of the service - - def validate(self, **kwargs): - msg = ( - "Argo cannot run work with file-system run log store. 
Unless you have made a mechanism to use volume mounts" - ) - logger.warning(msg) diff --git a/magnus/extensions/executor/demo_renderer/__init__.py b/magnus/extensions/executor/demo_renderer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/magnus/extensions/executor/demo_renderer/implementation.py b/magnus/extensions/executor/demo_renderer/implementation.py deleted file mode 100644 index 8982911f..00000000 --- a/magnus/extensions/executor/demo_renderer/implementation.py +++ /dev/null @@ -1,126 +0,0 @@ -# pragma: no cover -import logging -import re - -from magnus import defaults, utils -from magnus.defaults import TypeMapVariable -from magnus.extensions.executor import GenericExecutor -from magnus.extensions.nodes import StubNode -from magnus.graph import Graph -from magnus.nodes import BaseNode - -logger = logging.getLogger(defaults.LOGGER_NAME) - - -class DemoRenderer(GenericExecutor): - """ - This renderer is an example of how you can render required job specifications as per your orchestration tool. - - BaseExecutor implements many of the functionalities that are common and can be safe defaults. - In this renderer example: We just render a bash script that sequentially calls the steps. - We do not handle composite steps in this execution type. - - Example config: - executor: - type: demo-renderer - """ - - service_name: str = "demo-renderer" - - def execute_node(self, node: BaseNode, map_variable: TypeMapVariable = None, **kwargs): - """ - This method does the actual execution of a task, as-is, success or fail node. - """ - self._set_up_run_log(exists_ok=True) - # Need to set up the step log for the node as the entry point is different - step_log = self._context.run_log_store.create_step_log(node.name, node._get_step_log_name(map_variable)) - - self.add_code_identities(node=node, step_log=step_log) - - step_log.step_type = node.node_type - step_log.status = defaults.PROCESSING - self._context.run_log_store.add_step_log(step_log, self._context.run_id) - - super()._execute_node(node, map_variable=map_variable, **kwargs) - - step_log = self._context.run_log_store.get_step_log(node._get_step_log_name(map_variable), self._context.run_id) - if step_log.status == defaults.FAIL: - raise Exception(f"Step {node.name} failed") - - def send_return_code(self, stage="traversal"): - """ - Convenience function used by pipeline to send return code to the caller of the cli - - Raises: - Exception: If the pipeline execution failed - """ - if stage != "traversal": # traversal does no actual execution, so return code is pointless - run_id = self._context.run_id - - run_log = self._context.run_log_store.get_run_log_by_id(run_id=run_id, full=False) - if run_log.status == defaults.FAIL: - raise Exception("Pipeline execution failed") - - def execute_graph(self, dag: Graph, map_variable: TypeMapVariable = None, **kwargs): - """ - Iterate through the graph and frame the bash script. - - For more complex outputs, dataclasses might be a better option. - - NOTE: This method should be over-written to write the exact specification to the compute engine. 
- - """ - current_node = dag.start_at - previous_node = None - logger.info(f"Rendering job started at {current_node}") - bash_script_lines = [] - - while True: - working_on = dag.get_node_by_name(current_node) - - if working_on.is_composite: - raise NotImplementedError("In this demo version, composite nodes are not implemented") - - if working_on.node_type == StubNode.node_type: - raise NotImplementedError("In this demo version, AsIs nodes are not implemented") - - if previous_node == current_node: - raise Exception("Potentially running in a infinite loop") - - previous_node = current_node - - logger.info(f"Creating execution log for {working_on}") - - _execute_node_command = utils.get_node_execution_command(working_on, over_write_run_id="$1") - re.sub("[^A-Za-z0-9]+", "", f"{current_node}_job_id") - fail_node_command = utils.get_node_execution_command(dag.get_fail_node(), over_write_run_id="$1") - - if working_on.node_type not in ["success", "fail"]: - bash_script_lines.append(f"{_execute_node_command}\n") - - bash_script_lines.append("exit_code=$?\necho $exit_code\n") - # Write failure node - bash_script_lines.append( - ("if [ $exit_code -ne 0 ];\nthen\n" f"\t $({fail_node_command})\n" "\texit 1\n" "fi\n") - ) - - if working_on.node_type == "success": - bash_script_lines.append(f"{_execute_node_command}") - if working_on.node_type in ["success", "fail"]: - break - - current_node = working_on._get_next_node() - - with open("demo-bash.sh", "w", encoding="utf-8") as fw: - fw.writelines(bash_script_lines) - - msg = ( - "demo-bash.sh for running the pipeline is written. To execute it \n" - "1). Activate the environment:\n" - "\t for example poetry shell or pipenv shell etc\n" - "2). Make the shell script executable.\n" - "\t chmod 755 demo-bash.sh\n" - "3). Run the script by: source demo-bash.sh \n" - "\t The first argument to the script is the run id you want for the run." 
- ) - logger.info(msg) diff --git a/magnus/extensions/executor/local_container/implementation.py b/magnus/extensions/executor/local_container/implementation.py index 3c521a97..9e38e27a 100644 --- a/magnus/extensions/executor/local_container/implementation.py +++ b/magnus/extensions/executor/local_container/implementation.py @@ -314,3 +314,54 @@ def configure_for_execution(self, **kwargs): self.service = cast(FileSystemCatalog, self.service) self.service.catalog_location = self.executor._container_catalog_location + + +class LocalContainerComputeDotEnvSecrets(BaseIntegration): + """ + Integration between local container and dot env secrets + """ + + executor_type = "local-container" + service_type = "secrets" # One of secret, catalog, datastore + service_provider = "dotenv" # The actual implementation of the service + + def validate(self, **kwargs): + logger.warning("Using dot env for non local deployments is not ideal, consider options") + + def configure_for_traversal(self, **kwargs): + from magnus.extensions.secrets.dotenv.implementation import DotEnvSecrets + + self.executor = cast(LocalContainerExecutor, self.executor) + self.service = cast(DotEnvSecrets, self.service) + + secrets_location = self.service.secrets_location + self.executor._volumes[str(Path(secrets_location).resolve())] = { + "bind": f"{self.executor._container_secrets_location}", + "mode": "ro", + } + + def configure_for_execution(self, **kwargs): + from magnus.extensions.secrets.dotenv.implementation import DotEnvSecrets + + self.executor = cast(LocalContainerExecutor, self.executor) + self.service = cast(DotEnvSecrets, self.service) + + self.service.location = self.executor._container_secrets_location + + +class LocalContainerComputeEnvSecretsManager(BaseIntegration): + """ + Integration between local container and env secrets manager + """ + + executor_type = "local-container" + service_type = "secrets" # One of secret, catalog, datastore + service_provider = "env-secrets-manager" # The actual implementation of the service + + def validate(self, **kwargs): + msg = ( + "Local container executions cannot be used with environment secrets manager. " + "Please use a supported secrets manager" + ) + logger.exception(msg) + raise Exception(msg) diff --git a/magnus/extensions/executor/mocked/implementation.py b/magnus/extensions/executor/mocked/implementation.py index 9b4aaaf5..b7e9d4f6 100644 --- a/magnus/extensions/executor/mocked/implementation.py +++ b/magnus/extensions/executor/mocked/implementation.py @@ -8,6 +8,7 @@ from magnus.defaults import TypeMapVariable from magnus.extensions.executor import GenericExecutor from magnus.extensions.nodes import TaskNode +from magnus.integration import BaseIntegration from magnus.nodes import BaseNode from magnus.tasks import BaseTaskType @@ -25,6 +26,8 @@ class EasyModel(model): # type: ignore class MockedExecutor(GenericExecutor): service_name: str = "mocked" + enable_parallel: bool = defaults.ENABLE_PARALLEL + patches: Dict[str, Any] = Field(default_factory=dict) @property @@ -185,3 +188,33 @@ def execute_node(self, node: BaseNode, map_variable: TypeMapVariable = None, **k map_variable (dict[str, str], optional): _description_. Defaults to None. 
""" self._execute_node(node=node, map_variable=map_variable, **kwargs) + + +class LocalContainerComputeFileSystemRunLogstore(BaseIntegration): + """ + Integration between local container and file system run log store + """ + + executor_type = "local-container" + service_type = "run_log_store" # One of secret, catalog, datastore + service_provider = "file-system" # The actual implementation of the service + + def validate(self, **kwargs): + if self.executor._is_parallel_execution(): # pragma: no branch + msg = "Mocked executor does not support parallel execution. " + logger.warning(msg) + + +class LocalContainerComputeChunkedFSRunLogstore(BaseIntegration): + """ + Integration between local container and file system run log store + """ + + executor_type = "local-container" + service_type = "run_log_store" # One of secret, catalog, datastore + service_provider = "chunked-fs" # The actual implementation of the service + + def validate(self, **kwargs): + if self.executor._is_parallel_execution(): # pragma: no branch + msg = "Mocked executor does not support parallel execution. " + logger.warning(msg) diff --git a/magnus/extensions/run_log_store/file_system/integration.py b/magnus/extensions/run_log_store/file_system/integration.py deleted file mode 100644 index d52e4ac2..00000000 --- a/magnus/extensions/run_log_store/file_system/integration.py +++ /dev/null @@ -1,25 +0,0 @@ -import logging - -from magnus import defaults -from magnus.integration import BaseIntegration - -logger = logging.getLogger(defaults.LOGGER_NAME) - - -class LocalComputeFileSystemRunLogStore(BaseIntegration): - """ - Local compute and File system run log store - """ - - executor_type = "local" - service_type = "run_log_store" # One of secret, catalog, datastore - service_provider = "file-system" # The actual implementation of the service - - def validate(self, **kwargs): - if self.executor._is_parallel_execution(): # pragma: no branch - msg = ( - "Run log generated by file-system run log store are not thread safe. " - "Inconsistent results are possible because of race conditions to write to the same file.\n" - "Consider using partitioned run log store like database for consistent results." 
- ) - logger.warning(msg) diff --git a/magnus/extensions/secrets/dotenv/integration.py b/magnus/extensions/secrets/dotenv/integration.py deleted file mode 100644 index 33f18298..00000000 --- a/magnus/extensions/secrets/dotenv/integration.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging -from pathlib import Path -from typing import cast - -from magnus import defaults -from magnus.extensions.executor.local_container.implementation import LocalContainerExecutor -from magnus.integration import BaseIntegration - -from .implementation import DotEnvSecrets - -logger = logging.getLogger(defaults.LOGGER_NAME) - - -class LocalContainerComputeDotEnvSecrets(BaseIntegration): - """ - Integration between local container and dot env secrets - """ - - executor_type = "local-container" - service_type = "secrets" # One of secret, catalog, datastore - service_provider = "dotenv" # The actual implementation of the service - - def validate(self, **kwargs): - logger.warning("Using dot env for non local deployments is not ideal, consider options") - - def configure_for_traversal(self, **kwargs): - self.executor = cast(LocalContainerExecutor, self.executor) - self.service = cast(DotEnvSecrets, self.service) - - secrets_location = self.service.secrets_location - self.executor._volumes[str(Path(secrets_location).resolve())] = { - "bind": f"{self.executor._container_secrets_location}", - "mode": "ro", - } - - def configure_for_execution(self, **kwargs): - self.executor = cast(LocalContainerExecutor, self.executor) - self.service = cast(DotEnvSecrets, self.service) - - self.service.location = self.executor._container_secrets_location diff --git a/magnus/extensions/secrets/env_secrets/integration.py b/magnus/extensions/secrets/env_secrets/integration.py deleted file mode 100644 index 8359bfc0..00000000 --- a/magnus/extensions/secrets/env_secrets/integration.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -from magnus import defaults -from magnus.integration import BaseIntegration - -logger = logging.getLogger(defaults.LOGGER_NAME) - - -class LocalContainerComputeEnvSecretsManager(BaseIntegration): - """ - Integration between local container and env secrets manager - """ - - executor_type = "local-container" - service_type = "secrets" # One of secret, catalog, datastore - service_provider = "env-secrets-manager" # The actual implementation of the service - - def validate(self, **kwargs): - msg = ( - "Local container executions cannot be used with environment secrets manager. " - "Please use a supported secrets manager" - ) - logger.exception(msg) - raise Exception(msg) diff --git a/magnus/nodes.py b/magnus/nodes.py index 756cf8da..e28b06fd 100644 --- a/magnus/nodes.py +++ b/magnus/nodes.py @@ -251,7 +251,7 @@ def _get_neighbors(self) -> List[str]: return neighbors @abstractmethod - def _get_executor_config(self, executor_type: str) -> dict: + def _get_executor_config(self, executor_type: str) -> str: """ Return the executor config of the node, if defined, or empty dict @@ -360,10 +360,10 @@ def parse_from_config(cls, config: Dict[str, Any]) -> "BaseNode": # --8<-- [end:docs] class TraversalNode(BaseNode): next_node: str = Field(serialization_alias="next") - on_failure: Optional[str] = Field(default="") + on_failure: str = Field(default="") overrides: Dict[str, str] = Field(default_factory=dict) - def _get_on_failure_node(self) -> Optional[str]: + def _get_on_failure_node(self) -> str: """ If the node defines a on_failure node in the config, return this or None. 
@@ -469,7 +469,7 @@ def _get_catalog_settings(self) -> Dict[str, Any]:
     def _get_branch_by_name(self, branch_name: str):
         raise exceptions.TerminalNodeError()
 
-    def _get_executor_config(self, executor_type) -> dict:
+    def _get_executor_config(self, executor_type) -> str:
         raise exceptions.TerminalNodeError()
 
     def _get_max_attempts(self) -> int:
diff --git a/magnus/parameters.py b/magnus/parameters.py
index efb5769e..59953dca 100644
--- a/magnus/parameters.py
+++ b/magnus/parameters.py
@@ -165,7 +165,7 @@ def filter_arguments_for_func(
                 unassigned_params = unassigned_params.difference(bound_model.model_fields.keys())
             else:
                 # simple python data type.
-                bound_args[name] = cast_parameters_as_type(params[name], value.annotation)
+                bound_args[name] = cast_parameters_as_type(params[name], value.annotation)  # type: ignore
 
         unassigned_params.remove(name)
diff --git a/magnus/sdk.py b/magnus/sdk.py
index e205e52c..61be2e60 100644
--- a/magnus/sdk.py
+++ b/magnus/sdk.py
@@ -477,89 +477,3 @@ def execute(
         run_context.executor.execute_graph(dag=run_context.dag)
 
         return run_context.run_log_store.get_run_log_by_id(run_id=run_context.run_id)
-
-
-# class step(object):
-
-#     def __init__(
-#         self, name: Union[str, FunctionType],
-#         catalog_config: dict = None, magnus_config: str = None,
-#         parameters_file: str = None):
-#         """
-#         This decorator could be used to make the function within the scope of magnus.
-
-#         Since we are not orchestrating, it is expected that resource management happens outside this scope.
-
-#         Args:
-#             name (str, callable): The name of the step. The step log would have the same name
-#             catalog_config (dict): The configuration of the catalog per step.
-#             magnus_config (str): The name of the file having the magnus config, defaults to None.
-#         """
-#         if isinstance(name, FunctionType):
-#             name = name()
-
-#         self.name = name
-#         self.catalog_config = catalog_config
-#         self.active = True  # Check if we are executing the function via pipeline
-
-#         if pipeline.global_executor \
-#                 and pipeline.global_executor.execution_plan == defaults.EXECUTION_PLAN.CHAINED.value:
-#             self.active = False
-#             return
-
-#         self.executor = pipeline.prepare_configurations(
-#             configuration_file=magnus_config, parameters_file=parameters_file)
-
-#         self.executor.execution_plan = defaults.EXECUTION_PLAN.UNCHAINED.value
-#         run_id = self.executor.step_decorator_run_id
-#         if not run_id:
-#             msg = (
-#                 f'Step decorator expects run id from environment.'
-#             )
-#             raise Exception(msg)
-
-#         self.executor.run_id = run_id
-#         utils.set_magnus_environment_variables(run_id=run_id, configuration_file=magnus_config, tag=get_tag())
-
-#         try:
-#             # Try to get it if previous steps have created it
-#             # TODO: Can call the set_up_runlog now.
-#             run_log = self.executor.run_log_store.get_run_log_by_id(self.executor.run_id)
-#             if run_log.status in [defaults.FAIL, defaults.SUCCESS]:  # TODO: Remove this in preference to defaults
-#                 """
-#                 This check is mostly useless as we do not know when the graph ends as they are created dynamically.
-#                 This only prevents from using a run_id which has reached a final state.
-#                 #TODO: There is a need to create a status called step_success
-#                 """
-#                 msg = (
-#                     f'The run_log for run_id: {run_id} already exists and is in {run_log.status} state.'
-#                     ' Make sure that this was not run before.'
-#                 )
-#                 raise Exception(msg)
-#         except exceptions.RunLogNotFoundError:
-#             # Create one if they are not created
-#             self.executor._set_up_run_log()
-
-#     def __call__(self, func):
-#         """
-#         The function is converted into a node and called via the magnus framework.
-#         """
-#         @functools.wraps(func)
-#         def wrapped_f(*args, **kwargs):
-#             if not self.active:
-#                 # If we are not running via decorator, execute the function
-#                 return func(*args, **kwargs)
-
-#             step_config = {
-#                 'command': func,
-#                 'command_type': 'python-function',
-#                 'type': 'task',
-#                 'next': 'not defined',
-#                 'catalog': self.catalog_config
-#             }
-#             node = graph.create_node(name=self.name, step_config=step_config)
-#             self.executor.execute_from_graph(node=node)
-#             run_log = self.executor.run_log_store.get_run_log_by_id(run_id=self.executor.run_id, full=False)
-#             # TODO: If the previous step succeeded, make the status of the run log step_success
-#             print(json.dumps(run_log.dict(), indent=4))
-#         return wrapped_f
diff --git a/magnus/tasks.py b/magnus/tasks.py
index 27b2cfcc..6fee16e9 100644
--- a/magnus/tasks.py
+++ b/magnus/tasks.py
@@ -5,13 +5,9 @@
 import json
 import logging
 import os
-import shlex
-import shutil
 import subprocess
 import sys
-import tempfile
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, Tuple
 
 from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator
 from pydantic._internal._model_construction import ModelMetaclass
@@ -27,8 +23,6 @@
 # TODO: Can we add memory peak, cpu usage, etc. to the metrics?
 
-# --8<-- [start:docs]
-
 
 class BaseTaskType(BaseModel):
     """A base task class which does the execution of command defined by the user."""
@@ -128,9 +122,6 @@ def output_to_file(self, map_variable: TypeMapVariable = None):
             os.remove(log_file.name)
 
 
-# --8<-- [end:docs]
-
-
 class EasyModel(BaseModel):
     model_config = ConfigDict(extra="allow")
 
@@ -195,59 +186,6 @@ def execute_command(self, map_variable: TypeMapVariable = None, **kwargs):
         self._set_parameters(user_set_parameters)
 
 
-class PythonLambdaTaskType(BaseTaskType):  # pylint: disable=too-few-public-methods
-    """The task class for python-lambda command."""
-
-    task_type: str = Field(default="python-lambda", serialization_alias="command_type")
-    command: str
-
-    @field_validator("command")
-    @classmethod
-    def validate_command(cls, command: str):
-        if not command:
-            raise Exception("Command cannot be empty for shell task")
-
-        return command
-
-    def execute_command(self, map_variable: TypeMapVariable = None, **kwargs):
-        """Execute the lambda function as defined by the command.
-
-        Args:
-            map_variable (dict, optional): If the node is part of an internal branch. Defaults to None.
-
-        Raises:
-            Exception: If the lambda function has _ or __ in it that can cause issues.
-        """
-        if "_" in self.command or "__" in self.command:
-            msg = (
-                f"Command given to {self.task_type} cannot have _ or __ in them. "
-                "The string is supposed to be for simple expressions only."
-            )
-            raise Exception(msg)
-
-        f = eval(self.command)
-
-        params = self._get_parameters()
-        filtered_parameters = parameters.filter_arguments_for_func(f, params, map_variable)
-
-        if map_variable:
-            os.environ[defaults.PARAMETER_PREFIX + "MAP_VARIABLE"] = json.dumps(map_variable)
-
-        logger.info(f"Calling lambda function: {self.command} with {filtered_parameters}")
-        try:
-            user_set_parameters = f(**filtered_parameters)
-        except Exception as _e:
-            msg = f"Call to the function {self.command} with {filtered_parameters} did not succeed.\n"
-            logger.exception(msg)
-            logger.exception(_e)
-            raise
-
-        if map_variable:
-            del os.environ[defaults.PARAMETER_PREFIX + "MAP_VARIABLE"]
-
-        self._set_parameters(user_set_parameters)
-
-
 class NotebookTaskType(BaseTaskType):
     """The task class for Notebook based execution."""
 
@@ -423,156 +361,6 @@ def execute_command(self, map_variable: TypeMapVariable = None, **kwargs):
         )
 
 
-class ContainerTaskType(BaseTaskType):
-    """
-    The task class for container based execution.
-    """
-
-    task_type: str = Field(default="container", serialization_alias="command_type")
-    image: str
-    context_path: str = defaults.DEFAULT_CONTAINER_CONTEXT_PATH
-    command: str = ""  # Would be defaulted to the entrypoint of the container
-    data_folder: str = defaults.DEFAULT_CONTAINER_DATA_PATH  # Would be relative to the context_path
-    output_parameters_file: str = defaults.DEFAULT_CONTAINER_OUTPUT_PARAMETERS  # would be relative to the context_path
-    secrets: List[str] = []
-    experiment_tracking_file: str = ""
-
-    _temp_dir: str = ""
-
-    def get_cli_options(self) -> Tuple[str, dict]:
-        return "container", {
-            "image": self.image,
-            "context-path": self.context_path,
-            "command": self.command,
-            "data-folder": self.data_folder,
-            "output-parameters_file": self.output_parameters_file,
-            "secrets": self.secrets,
-            "experiment-tracking-file": self.experiment_tracking_file,
-        }
-
-    def execute_command(self, map_variable: TypeMapVariable = None, **kwargs):
-        # Conditional import
-        from magnus import track_this
-        from magnus.context import run_context
-
-        try:
-            import docker  # pylint: disable=C0415
-
-            client = docker.from_env()
-            api_client = docker.APIClient()
-        except ImportError as e:
-            msg = "Task type of container requires docker to be installed. Please install via optional: docker"
-            logger.exception(msg)
-            raise Exception(msg) from e
-        except Exception as ex:
-            logger.exception("Could not get access to docker")
-            raise Exception("Could not get the docker socket file, do you have docker installed?") from ex
-
-        container_env_variables = {}
-
-        for key, value in self._get_parameters().items():
-            container_env_variables[defaults.PARAMETER_PREFIX + key] = value
-
-        if map_variable:
-            container_env_variables[defaults.PARAMETER_PREFIX + "MAP_VARIABLE"] = json.dumps(map_variable)
-
-        for secret_name in self.secrets:
-            secret_value = run_context.secrets_handler.get(secret_name)
-            container_env_variables[secret_name] = secret_value
-
-        mount_volumes = self.get_mount_volumes()
-
-        executor_config = run_context.executor._resolve_executor_config(run_context.executor._context_node)
-        optional_docker_args = executor_config.get("optional_docker_args", {})
-
-        try:
-            container = client.containers.create(
-                self.image,
-                command=shlex.split(self.command),
-                auto_remove=False,
-                network_mode="host",
-                environment=container_env_variables,
-                volumes=mount_volumes,
-                **optional_docker_args,
-            )
-
-            container.start()
-            stream = api_client.logs(container=container.id, timestamps=True, stream=True, follow=True)
-            while True:
-                try:
-                    output = next(stream).decode("utf-8")
-                    output = output.strip("\r\n")
-                    logger.info(output)
-                except StopIteration:
-                    logger.info("Docker Run completed")
-                    break
-
-            exit_status = api_client.inspect_container(container.id)["State"]["ExitCode"]
-            container.remove(force=True)
-
-            if exit_status != 0:
-                msg = (
-                    f"Docker command failed with exit code {exit_status}."
-                    "Hint: When chaining multiple commands, use sh -c"
-                )
-                raise Exception(msg)
-
-            container_return_parameters = {}
-            experiment_tracking_variables = {}
-            if self._temp_dir:
-                parameters_file = Path(self._temp_dir) / self.output_parameters_file
-                if parameters_file.is_file():
-                    with open(parameters_file, "r") as f:
-                        container_return_parameters = json.load(f)
-
-                experiment_tracking_file = Path(self._temp_dir) / self.experiment_tracking_file
-                if experiment_tracking_file.is_file():
-                    with open(experiment_tracking_file, "r") as f:
-                        experiment_tracking_variables = json.load(f)
-
-            self._set_parameters(container_return_parameters)  # type: ignore # TODO: Not fixing this for now.
-            track_this(**experiment_tracking_variables)
-
-        except Exception as _e:
-            logger.exception("Problems with spinning up the container")
-            raise _e
-        finally:
-            if self._temp_dir:
-                shutil.rmtree(self._temp_dir)
-
-    def get_mount_volumes(self) -> dict:
-        """
-        Get the required mount volumes from the configuration.
-        We need to mount both the catalog and the parameter.json files.
-
-        Returns:
-            dict: The mount volumes in the format that docker expects.
-        """
-        from magnus.context import run_context
-
-        compute_data_folder = run_context.executor.get_effective_compute_data_folder()
-        mount_volumes = {}
-
-        # Create temporary directory for parameters.json and map it to context_path
-        self._temp_dir = tempfile.mkdtemp()
-        mount_volumes[str(Path(self._temp_dir).resolve())] = {
-            "bind": f"{str(Path(self.context_path).resolve())}/",
-            "mode": "rw",
-        }
-        logger.info(f"Mounting {str(Path(self._temp_dir).resolve())} to {str(Path(self.context_path).resolve())}/")
-
-        # Map the data folder to context_path/data_folder
-        if compute_data_folder:
-            path_to_data = Path(self.context_path) / self.data_folder
-            mount_volumes[str(Path(compute_data_folder).resolve())] = {
-                "bind": f"{str(path_to_data)}/",
-                "mode": "rw",
-            }
-            logger.info(f"Mounting {compute_data_folder} to {str(path_to_data)}/")
-
-        return mount_volumes
-
-
 def create_task(kwargs_for_init) -> BaseTaskType:
     """
     Creates a task object from the command configuration.
diff --git a/mkdocs.yml b/mkdocs.yml
index 332c5360..5482f627 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -147,11 +147,7 @@ nav:
       - "Catalog": "configurations/catalog.md"
      - "Secrets": "configurations/secrets.md"
       - "Experiment tracking": "configurations/experiment-tracking.md"
-
-  # - "Command Line":
-  #     - "command-line.md"
   - "Python API": "interactions.md"
   - "Python SDK": "sdk.md"
-  # - "Examples":
-  #     - "examples.md"
   - "Extensions": "extensions.md"
+  - "Roadmap": "roadmap.md"
diff --git a/pyproject.toml b/pyproject.toml
index e084516b..3113261c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,7 +66,6 @@ magnus= 'magnus.cli:cli'
 [tool.poetry.plugins."executor"]
 "local" = "magnus.extensions.executor.local.implementation:LocalExecutor"
 "local-container" = "magnus.extensions.executor.local_container.implementation:LocalContainerExecutor"
-"demo-renderer" = "magnus.extensions.executor.demo_renderer.implementation:DemoRenderer"
 "argo" = "magnus.extensions.executor.argo.implementation:ArgoExecutor"
 "mocked" = "magnus.extensions.executor.mocked.implementation:MockedExecutor"
 
@@ -104,10 +103,8 @@ magnus= 'magnus.cli:cli'
 # Plugins for Tasks
 [tool.poetry.plugins."tasks"]
 "python" = "magnus.tasks:PythonTaskType"
-"python-lambda" = "magnus.tasks:PythonLambdaTaskType"
 "shell" = "magnus.tasks:ShellTaskType"
 "notebook" = "magnus.tasks:NotebookTaskType"
-"container" = "magnus.tasks:ContainerTaskType"
 
 # Plugins for Nodes
 [tool.poetry.plugins."nodes"]
diff --git a/tests/magnus/extensions/executor/test_argo_executor.py b/tests/magnus/extensions/executor/test_argo_executor.py
index 64d64239..3298e46a 100644
--- a/tests/magnus/extensions/executor/test_argo_executor.py
+++ b/tests/magnus/extensions/executor/test_argo_executor.py
@@ -24,7 +24,7 @@ def test_secret_env_renders_properly():
 def test_retry_serialize_makes_limit_str():
     retry = implementation.Retry(limit=10)
 
-    assert retry.model_dump(by_alias=True) == {"limit": "10", "retryPolicy": "Always"}
+    assert retry.model_dump(by_alias=True)["limit"] == "10"
 
 
 def test_limit_renders_gpu_when_available():
@@ -43,19 +43,6 @@ def test_limit_ignores_gpu_when_none():
     assert limit.model_dump(by_alias=True, exclude_none=True) == {**request.model_dump()}
 
 
-def test_user_controls_with_defaults():
-    user_controls = implementation.UserControls()
-
-    assert user_controls.model_dump(by_alias=True, exclude_none=True) == {
-        "activeDeadlineSeconds": 7200,
-        "imagePullPolicy": "",
-        "limits": {"cpu": "250m", "memory": "1Gi"},
-        "nodeSelector": {},
-        "requests": {"cpu": "250m", "memory": "1Gi"},
-        "retryStrategy": {"limit": "0", "retryPolicy": "Always"},
{"limit": "0", "retryPolicy": "Always"}, - } - - def test_out_put_parameter_renders_properly(): output_parameter = implementation.OutputParameter(name="test_name", value="test_value") @@ -66,18 +53,6 @@ def test_out_put_parameter_renders_properly(): } -def test_dag_template_renders_properly(): - task = implementation.TaskTemplate(name="test_name", template="test_template") - - dag = implementation.DagTemplate(tasks=[task]) - - assert dag.model_dump(by_alias=True, exclude_none=True) == { - "name": "magnus-dag", - "dag": {"tasks": [task.model_dump(by_alias=True, exclude_none=True)]}, - "failFast": True, - } - - def test_volume_renders_properly(): volume = implementation.Volume(name="test_name", claim="test_claim", mount_path="mount here") @@ -91,7 +66,7 @@ def test_spec_reshapes_arguments(): test_env1 = implementation.EnvVar(name="test_env1", value="test_value1") test_env2 = implementation.EnvVar(name="test_env2", value="test_value2") - spec = implementation.Spec(arguments=[test_env1, test_env2]) + spec = implementation.Spec(arguments=[test_env1, test_env2], active_deadline_seconds=10) assert spec.model_dump(by_alias=True, exclude_none=True)["arguments"] == { "parameters": [{"name": "test_env1", "value": "test_value1"}, {"name": "test_env2", "value": "test_value2"}] @@ -102,7 +77,7 @@ def test_spec_populates_container_volumes_and_persistent_volumes(): volume1 = implementation.UserVolumeMounts(name="test_name1", mount_path="test_mount_path1") volume2 = implementation.UserVolumeMounts(name="test_name2", mount_path="test_mount_path2") - spec = implementation.Spec(persistent_volumes=[volume1, volume2]) + spec = implementation.Spec(persistent_volumes=[volume1, volume2], active_deadline_seconds=10) model_dump = spec.model_dump(by_alias=True, exclude_none=True) @@ -112,74 +87,6 @@ def test_spec_populates_container_volumes_and_persistent_volumes(): ] -def test_user_controls_defaults_limit_and_request(): - test_user_controls = implementation.UserControls() - - default_limit = implementation.Limit() - default_requests = implementation.Request() - - model_dump = test_user_controls.model_dump(by_alias=True, exclude_none=True) - - assert model_dump["limits"] == default_limit.model_dump(by_alias=True, exclude_none=True) - assert model_dump["requests"] == default_requests.model_dump(by_alias=True, exclude_none=True) - - -def test_user_controls_overrides_defaults_if_provided(): - from_config = { - "image": "test", - "limits": {"cpu": "1000m", "memory": "1Gi"}, - "requests": {"cpu": "500m", "memory": "1Gi"}, - } - test_user_controls = implementation.UserControls(**from_config) - - model_dump = test_user_controls.model_dump(by_alias=True, exclude_none=True) - - assert model_dump["limits"] == {"cpu": "1000m", "memory": "1Gi"} - assert model_dump["requests"] == {"cpu": "500m", "memory": "1Gi"} - - -def test_default_container_env_is_none_if_secrets_or_env_vars_are_empty(): - test_default_container = implementation.DefaultContainer(image="test", command="test_command") - - assert test_default_container.model_dump(by_alias=True)["env"] is None - - -def test_default_container_env_is_sum_of_k8s_secrets_and_env_vars(): - test_default_container = implementation.DefaultContainer(image="test", command="test_command") - - env_var = implementation.EnvVar(name="test_env", value="test_value") - secret_var = implementation.SecretEnvVar(environment_variable="env", secret_name="name", secret_key="key") - - test_default_container._env_vars = [env_var] - test_default_container._secrets_from_k8s = [secret_var] - - assert 
-        {**env_var.model_dump(by_alias=True, exclude_none=True)},
-        {**secret_var.model_dump(by_alias=True, exclude_none=True)},
-    ]
-
-
-def test_user_controls_env_is_none_if_secrets_or_env_vars_are_empty():
-    test_user_controls = implementation.UserControls()
-
-    assert test_user_controls.model_dump(by_alias=True)["env"] is None
-
-
-def test_user_controls_env_is_sum_of_k8s_secrets_and_env_vars():
-    test_user_controls = implementation.UserControls()
-
-    env_var = implementation.EnvVar(name="test_env", value="test_value")
-    secret_var = implementation.SecretEnvVar(environment_variable="env", secret_name="name", secret_key="key")
-
-    test_user_controls._env_vars = [env_var]
-    test_user_controls._secrets_from_k8s = [secret_var]
-
-    assert test_user_controls.model_dump(by_alias=True, exclude_none=True)["env"] == [
-        {**env_var.model_dump(by_alias=True, exclude_none=True)},
-        {**secret_var.model_dump(by_alias=True, exclude_none=True)},
-    ]
-
-
 def test_output_parameter_valuefrom_includes_path():
     test_out_put_parameter = implementation.OutputParameter(name="test_name", path="test_path")
diff --git a/tests/magnus/extensions/executor/test_argo_integration.py b/tests/magnus/extensions/executor/test_argo_integration.py
deleted file mode 100644
index 2162b68e..00000000
--- a/tests/magnus/extensions/executor/test_argo_integration.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import logging
-
-from magnus.extensions.executor.argo import integration
-
-
-def test_file_system_run_log_store_is_allowed_with_a_warning(caplog):
-    test_integration = integration.FileSystemRunLogStore(executor="test", integration_service="test")
-
-    with caplog.at_level(logging.WARNING, logger="magnus"):
-        test_integration.validate()
-
-    assert "Argo cannot run work with file-system run log store. Unless you " in caplog.text
-
-
-def test_chunked_file_system_run_log_store_is_allowed_with_a_warning(caplog):
-    test_integration = integration.ChunkedFileSystemRunLogStore(executor="test", integration_service="test")
-
-    with caplog.at_level(logging.WARNING, logger="magnus"):
-        test_integration.validate()
-
-    assert "Argo cannot run work with chunked file-system run log store" in caplog.text
-
-
-def test_file_sytem_catalog_is_allowed_with_a_warning(caplog):
-    test_integration = integration.FileSystemCatalog(executor="test", integration_service="test")
-
-    with caplog.at_level(logging.WARNING, logger="magnus"):
-        test_integration.validate()
-
-    assert "Argo cannot run work with file-system run log store. Unless you " in caplog.text
Unless you " in caplog.text diff --git a/tests/magnus/extensions/executor/test_generic_executor.py b/tests/magnus/extensions/executor/test_generic_executor.py index b2d426de..f138f85b 100644 --- a/tests/magnus/extensions/executor/test_generic_executor.py +++ b/tests/magnus/extensions/executor/test_generic_executor.py @@ -527,44 +527,11 @@ def test_base_executor_resolve_executor_config_gives_global_config_if_node_does_ mock_node = mocker.MagicMock() mock_node._get_executor_config.return_value = {} - test_executor = GenericExecutor() - - assert test_executor._resolve_executor_config(mock_node) == {**test_executor.model_dump()} - - -def test_base_executor__resolve_node_config_updates_global_config_if_node_overrides(mocker, monkeypatch): - mock_node = mocker.MagicMock() - mock_node._get_executor_config.return_value = {"enable_parallel": True} + mock_run_context.variables = {} test_executor = GenericExecutor() - assert test_executor._resolve_executor_config(mock_node)["enable_parallel"] is True - - -def test_resolve_node_config_updates_config_with_nested_config(mocker): - mock_node = mocker.MagicMock() - mock_node._get_executor_config.return_value = {"first": {"second": {"third": {"a": 1}}}} - - test_executor = GenericExecutor() - - assert test_executor._resolve_executor_config(mock_node)["first"] == {"second": {"third": {"a": 1}}} - - -def test_base_executor__resolve_node_config_updates_global_config_if_node_adds(mocker, monkeypatch): - mock_node = mocker.MagicMock() - mock_node._get_executor_config.return_value = {"b": 2} - - test_executor = GenericExecutor() - assert test_executor._resolve_executor_config(mock_node) == {**test_executor.model_dump(), **{"b": 2}} - - -def test_base_executor_resolve_node_supresess_global_config_from_placeholders_if_its_not_mapping(mocker, monkeypatch): - mock_node = mocker.MagicMock() - mock_node._get_executor_config.return_value = {"b": 2, "replace": None} - - test_executor = executor.GenericExecutor(placeholders={"replace": {"a": 1}}) - - assert test_executor._resolve_executor_config(mock_node) == {**test_executor.model_dump(), **{"b": 2, "a": 1}} + assert test_executor._resolve_executor_config(mock_node) == {**test_executor.model_dump()} def test_get_status_and_next_node_name_returns_empty_for_terminal_node(mocker, monkeypatch, mock_run_context): @@ -688,26 +655,6 @@ def test_execute_node_sets_step_log_status_to_success_if_node_succeeds(mocker, m assert mock_step_log.status == defaults.SUCCESS -def test_execute_node_step_log_gets_tracked_data(mocker, monkeypatch, mock_run_context): - mock_run_context.run_log_store.get_parameters.return_value = {"a": 1} - - mock_step_log = mocker.MagicMock() - mock_run_context.run_log_store.get_step_log.return_value = mock_step_log - - mock_utils = mocker.MagicMock() - mock_utils.get_tracked_data.return_value = {"a": 2} - monkeypatch.setattr(executor, "utils", mock_utils) - - mock_node = mocker.MagicMock() - mock_node.execute.return_value.status = defaults.SUCCESS - - test_executor = GenericExecutor() - test_executor._sync_catalog = mocker.MagicMock() - - test_executor._execute_node(mock_node) - assert mock_step_log.user_defined_metrics == {"a": 2} - - def test_send_return_code_raises_exception_if_pipeline_execution_failed(mocker, mock_run_context): mock_run_context.run_log_store.get_run_log_by_id.return_value.status = defaults.FAIL diff --git a/tests/magnus/extensions/run_log_store/test_file_system_integration.py b/tests/magnus/extensions/run_log_store/test_file_system_integration.py deleted file mode 100644 index 
--- a/tests/magnus/extensions/run_log_store/test_file_system_integration.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import logging
-from pathlib import Path
-
-from magnus.extensions.run_log_store.file_system import integration
-
-
-def test_local_compute_throws_warning(mocker, caplog):
-    mock_executor = mocker.MagicMock()
-    mock_executor._is_parallel_execution.return_value = True
-
-    test_integration = integration.LocalComputeFileSystemRunLogStore(executor=mock_executor, integration_service="test")
-
-    with caplog.at_level(logging.WARNING, logger="magnus"):
-        test_integration.validate()
-
-    assert "Run log generated by file-system run log store are not thread safe." in caplog.text
-
-
-def test_local_container_validate_throws_warning(mocker, caplog):
-    mock_executor = mocker.MagicMock()
-    mock_executor._is_parallel_execution.return_value = True
-
-    test_integration = integration.LocalContainerComputeFileSystemRunLogstore(
-        executor=mock_executor, integration_service="test"
-    )
-
-    with caplog.at_level(logging.WARNING, logger="magnus"):
-        test_integration.validate()
-
-    assert "Run log generated by file-system run log store are not thread safe." in caplog.text
-
-
-def test_container_configure_for_traversal_populates_volumes(mocker, monkeypatch):
-    mock_local_container = mocker.MagicMock()
-    monkeypatch.setattr(integration, "LocalContainerExecutor", mock_local_container)
-
-    mock_executor = mocker.MagicMock()
-    mock_executor._volumes = {}
-    mock_executor._container_log_location = "this_location"
-
-    mock_fs_run_log_store = mocker.MagicMock()
-    mock_fs_run_log_store.log_folder_name = "run_log_location"
-
-    test_integration = integration.LocalContainerComputeFileSystemRunLogstore(mock_executor, mock_fs_run_log_store)
-    test_integration.configure_for_traversal()
-
-    assert mock_executor._volumes == {str(Path("run_log_location").resolve()): {"bind": "this_location", "mode": "rw"}}
-
-
-def test_configure_for_execution_assigns_catalog_location_within_container(mocker, monkeypatch):
-    mock_local_container = mocker.MagicMock()
-    monkeypatch.setattr(integration, "LocalContainerExecutor", mock_local_container)
-
-    mock_executor = mocker.MagicMock()
-    mock_executor._container_log_location = "this_location"
-
-    mock_fs_run_log_store = mocker.MagicMock()
-
-    test_integration = integration.LocalContainerComputeFileSystemRunLogstore(mock_executor, mock_fs_run_log_store)
-    test_integration.configure_for_execution()
-
-    assert mock_fs_run_log_store.log_folder == "this_location"
diff --git a/tests/magnus/extensions/secrets/test_dotenv_integration.py b/tests/magnus/extensions/secrets/test_dotenv_integration.py
deleted file mode 100644
index 630b33ce..00000000
--- a/tests/magnus/extensions/secrets/test_dotenv_integration.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import logging
-from pathlib import Path
-
-from magnus.extensions.secrets.dotenv import integration
-
-
-def test_validate_issues_warning(mocker, caplog):
-    mock_executor = mocker.MagicMock()
-    mock_dot_env_secrets = mocker.MagicMock()
-
-    test_integration = integration.LocalContainerComputeDotEnvSecrets(mock_executor, mock_dot_env_secrets)
-
-    with caplog.at_level(logging.WARNING, logger="magnus"):
-        test_integration.validate()
-
-    assert "Using dot env for non local deployments is not ideal" in caplog.text
-
-
-def test_configure_for_traversal_populates_volumes(mocker, monkeypatch):
-    mock_local_container = mocker.MagicMock()
-    monkeypatch.setattr(integration, "LocalContainerExecutor", mock_local_container)
-
-    mock_executor = mocker.MagicMock()
-    mock_executor._volumes = {}
-    mock_executor._container_secrets_location = "this_location"
-
-    mock_dot_env_secrets = mocker.MagicMock()
-    mock_dot_env_secrets.secrets_location = "secrets_location"
-
-    test_integration = integration.LocalContainerComputeDotEnvSecrets(mock_executor, mock_dot_env_secrets)
-    test_integration.configure_for_traversal()
-
-    assert mock_executor._volumes == {str(Path("secrets_location").resolve()): {"bind": "this_location", "mode": "ro"}}
-
-
-def test_configure_for_execution_assigns_secrets_location_within_container(mocker, monkeypatch):
-    mock_local_container = mocker.MagicMock()
-    monkeypatch.setattr(integration, "LocalContainerExecutor", mock_local_container)
-
-    mock_executor = mocker.MagicMock()
-    mock_executor._container_secrets_location = "this_location"
-
-    mock_dot_env_secrets = mocker.MagicMock()
-
-    test_integration = integration.LocalContainerComputeDotEnvSecrets(mock_executor, mock_dot_env_secrets)
-    test_integration.configure_for_execution()
-
-    assert mock_dot_env_secrets.location == "this_location"
diff --git a/tests/magnus/extensions/secrets/test_env_secrets_integration.py b/tests/magnus/extensions/secrets/test_env_secrets_integration.py
deleted file mode 100644
index 5d4300a3..00000000
--- a/tests/magnus/extensions/secrets/test_env_secrets_integration.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import pytest
-
-from magnus.extensions.secrets.env_secrets import integration
-
-
-def test_local_container_integration_raises_exception(caplog, mocker):
-    mock_executor = mocker.MagicMock()
-
-    test_integration = integration.LocalContainerComputeEnvSecretsManager(mock_executor, "service")
-
-    with pytest.raises(Exception, match="Local container executions cannot be used with environment secrets manager"):
-        test_integration.validate()
diff --git a/tests/magnus/test_interaction.py b/tests/magnus/test_interaction.py
index 4f70edd3..6b54f6d7 100644
--- a/tests/magnus/test_interaction.py
+++ b/tests/magnus/test_interaction.py
@@ -175,30 +175,6 @@ def test_get_from_catalog_raises_warning_if_no_context_step_log(mocker, monkeypa
     mock_catalog_handler_get.assert_called_once_with("this", run_id="RUN_ID")
 
 
-def test_put_in_catalog_delegates_to_catalog_handler(mocker, monkeypatch):
-    mock_context = mocker.MagicMock()
-    mock_catalog_handler = mocker.MagicMock()
-
-    mock_context.run_context.catalog_handler = mock_catalog_handler
-
-    mock_catalog_handler_put = mocker.MagicMock()
-    mock_catalog_handler.put = mock_catalog_handler_put
-    mock_context.run_context.run_id = "RUN_ID"
-
-    mock_catalog_handler.compute_data_folder = "compute_folder"
-    monkeypatch.setattr(interaction, "context", mock_context)
-
-    mock_file_path = mocker.MagicMock()
-    mock_path = mocker.MagicMock(return_value=mock_file_path)
-    mock_file_path.name = "file_name"
-    mock_file_path.parent = "in_this_folder"
-    monkeypatch.setattr(interaction, "Path", mock_path)
-
-    interaction.put_in_catalog("this_file")
-
-    mock_catalog_handler_put.assert_called_once_with("this_file", run_id="RUN_ID")
-
-
 @pytest.mark.noautofixt
 def test_get_run_id_returns_from_context(monkeypatch, mocker):
     mock_context = mocker.MagicMock()
diff --git a/tests/magnus/test_nodes.py b/tests/magnus/test_nodes.py
index c303c306..2399ba93 100644
--- a/tests/magnus/test_nodes.py
+++ b/tests/magnus/test_nodes.py
@@ -148,7 +148,7 @@ def test_traversal_node_get_executor_config_defaults_to_empty_dict(instantiable_
         name="test", internal_name="test", node_type="test", next_node="next", on_failure="on_failure"
     )
 
-    assert traversal_class._get_executor_config("I do not exist") == {}
+    assert traversal_class._get_executor_config("I do not exist") == ""
 
 
 def test_traversal_node_get_executor_returns_configured_config(instantiable_traversal_node):
@@ -158,10 +158,10 @@ def test_traversal_node_get_executor_returns_configured_config(instantiable_trav
         node_type="test",
         next_node="next",
         on_failure="on_failure",
-        overrides={"test": {"key": "value"}},
+        overrides={"test": "key"},
     )
 
-    assert traversal_class._get_executor_config("test") == {"key": "value"}
+    assert traversal_class._get_executor_config("test") == "key"
 
 
 def test_executable_node_get_catalog_detaults_to_empty(instantiable_executable_node):
diff --git a/tests/magnus/test_tasks.py b/tests/magnus/test_tasks.py
index cdb163d7..fb33c9fc 100644
--- a/tests/magnus/test_tasks.py
+++ b/tests/magnus/test_tasks.py
@@ -175,13 +175,6 @@ def __init__(self):
     del os.environ[defaults.PARAMETER_PREFIX + "a"]
 
 
-def test_python_lambda_task_type_execute_command_raises_for_under_and_dunder():
-    lambda_exec = tasks.PythonLambdaTaskType(command="_ and __", node_name="dummy")
-
-    with pytest.raises(Exception):
-        lambda_exec.execute_command()
-
-
 def test_notebook_raises_exception_if_command_is_not_a_notebook():
     with pytest.raises(Exception):
         tasks.NotebookTaskType(command="path to notebook")
diff --git a/tests/magnus/test_utils.py b/tests/magnus/test_utils.py
index 377c278a..2ffb0a98 100644
--- a/tests/magnus/test_utils.py
+++ b/tests/magnus/test_utils.py
@@ -65,8 +65,8 @@ def test_apply_variables_applies_variables():
 def test_apply_variables_applies_known_variables():
     apply_to = "${var}_${var1}"
 
-    transformed = utils.apply_variables(apply_to, variables={"var": "hello"})
-    assert transformed == "hello_${var1}"
+    with pytest.raises(KeyError):
+        utils.apply_variables(apply_to, variables={"var": "hello"})
 
 
 def test_get_module_and_func_names_raises_exception_for_incorrect_command():
diff --git a/tests/test_examples.py b/tests/test_examples.py
index f37174cb..ff16f576 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -1,7 +1,9 @@
+from contextlib import nullcontext, contextmanager
 import pytest
 from pathlib import Path
 import os
 import importlib
+import subprocess
 
 from magnus.entrypoints import execute
 from magnus import exceptions
@@ -9,8 +11,11 @@
 # (file, is_fail?, kwargs)
 examples = [
     ("concepts/catalog.yaml", False, {"configuration_file": "examples/configs/fs-catalog.yaml"}),
-    ("concepts/experiment_tracking_env_step.yaml", False, {}),
     ("concepts/experiment_tracking_env.yaml", False, {}),
+    ("concepts/experiment_tracking_env_step.yaml", False, {}),
+    ("concepts/map.yaml", False, {}),
+    ("concepts/map_shell.yaml", False, {}),
+    ("concepts/nesting.yaml", False, {}),
     ("concepts/notebook_api_parameters.yaml", False, {"parameters_file": "examples/concepts/parameters.yaml"}),
     ("concepts/notebook_env_parameters.yaml", False, {"parameters_file": "examples/concepts/parameters.yaml"}),
     ("concepts/notebook_native_parameters.yaml", False, {"parameters_file": "examples/concepts/parameters.yaml"}),
@@ -43,6 +48,7 @@ def list_examples():
 
 @pytest.mark.parametrize("example", list_examples())
 @pytest.mark.no_cover
+@pytest.mark.e2e
 def test_yaml_examples(example):
     print(f"Testing {example}...")
     examples_path = Path("examples")
@@ -58,6 +64,25 @@ def test_yaml_examples(example):
 
 @pytest.mark.parametrize("example", list_examples())
 @pytest.mark.no_cover
+@pytest.mark.e2e
+def test_yaml_examples_argo(example):
+    print(f"Testing {example}...")
+    examples_path = Path("examples")
Path("examples") + file_path, status, kwargs = example + try: + full_file_path = examples_path / file_path + kwargs.pop("configuration_file", "") + configuration_file = "examples/configs/argo-config.yaml" + execute(configuration_file=configuration_file, pipeline_file=str(full_file_path.resolve()), **kwargs) + subprocess.run(["argo", "lint", "--offline", "argo-pipeline.yaml"], check=True) + except exceptions.ExecutionFailedError: + if not status: + raise + + +@pytest.mark.parametrize("example", list_examples()) +@pytest.mark.no_cover +@pytest.mark.e2e_container def test_yaml_examples_container(example): print(f"Testing {example}...") examples_path = Path("examples") @@ -66,30 +91,49 @@ def test_yaml_examples_container(example): full_file_path = examples_path / file_path kwargs.pop("configuration_file", "") configuration_file = "examples/configs/local-container.yaml" - os.environ["MAGNUS_VAR_default_docker_image"] = "magnus:demo" + os.environ["MAGNUS_VAR_default_docker_image"] = "magnus:3.8" execute(configuration_file=configuration_file, pipeline_file=str(full_file_path), **kwargs) except exceptions.ExecutionFailedError: if not status: raise +@contextmanager +def secrets_env_context(): + os.environ["secret"] = "secret_value" + os.environ["MAGNUS_CONFIGURATION_FILE"] = "examples/configs/secrets-env-default.yaml" + yield + del os.environ["secret"] + del os.environ["MAGNUS_CONFIGURATION_FILE"] + + +# function, success, context python_examples = [ - ("catalog_api", False), - ("catalog", False), - ("contrived", False), - ("mocking", False), - ("on_failure", False), - ("parameters_api", False), - ("parameters", False), - ("python-tasks", False), - ("secrets", False), - ("concepts.catalog", False), - ("concepts.parallel", False), - ("concepts.simple", False), - ("concepts.task_api_parameters", False), - ("concepts.task_env_parameters", False), - ("concepts.task_native_parameters", False), - ("concepts.traversal", False), + ("catalog", False, None), + ("catalog_api", False, None), + ("catalog_simple", False, None), + ("contrived", False, None), + ("mocking", False, None), + ("on_failure", False, None), + ("parameters_api", False, None), + ("parameters", False, None), + ("python-tasks", False, None), + ("secrets", False, None), + ("secrets_env", False, secrets_env_context), + ("concepts.catalog", False, None), + ("concepts.catalog_api", False, None), + ("concepts.catalog_object", False, None), + ("concepts.experiment_tracking_api", False, None), + ("concepts.experiment_tracking_env", False, None), + ("concepts.experiment_tracking_step", False, None), + ("concepts.map", False, None), + ("concepts.nesting", False, None), + ("concepts.parallel", False, None), + ("concepts.simple", False, None), + ("concepts.task_api_parameters", False, None), + ("concepts.task_env_parameters", False, None), + ("concepts.task_native_parameters", False, None), + ("concepts.traversal", False, None), ] @@ -99,16 +143,23 @@ def list_python_examples(): @pytest.mark.parametrize("example", list_python_examples()) -@pytest.mark.no_cover +# @pytest.mark.no_cover +@pytest.mark.e2e def test_python_examples(example): print(f"Testing {example}...") - mod, status = example + mod, status, context = example + + if not context: + context = nullcontext() + else: + context = context() imported_module = importlib.import_module(f"examples.{mod}") f = getattr(imported_module, "main") try: - f() + with context: + f() except exceptions.ExecutionFailedError: if not status: raise diff --git a/tox.ini b/tox.ini index 6cb2a8a2..1104c2bd 100644 --- 
--- a/tox.ini
+++ b/tox.ini
@@ -8,7 +8,7 @@ envlist = python3.8, mypy
 whitelist_externals = poetry
 commands =
     poetry install -E docker -E notebook
-    poetry run python -m pytest --cov=magnus/ tests/
+    poetry run python -m pytest -m "not e2e_container" --cov=magnus/ tests/
 
 [testenv:mypy]
 whitelist_externals = poetry