From bd99a83476a34b1ea6c6a7d9ca4ceff6191eab3d Mon Sep 17 00:00:00 2001 From: Orfeas Kourkakis Date: Tue, 11 Jun 2024 15:47:52 +0300 Subject: [PATCH 1/2] fix: training-integration uat by updating image --- .../notebooks/training/training-integration.ipynb | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/notebooks/training/training-integration.ipynb b/tests/notebooks/training/training-integration.ipynb index ed81a3b..fa29caa 100644 --- a/tests/notebooks/training/training-integration.ipynb +++ b/tests/notebooks/training/training-integration.ipynb @@ -6,9 +6,15 @@ "source": [ "# Test Training Operator Integration\n", "\n", - "This example notebook is loosely based on [this](https://github.com/kubeflow/training-operator/blob/master/sdk/python/examples/kubeflow-tfjob-sdk.ipynb) upstream example.\n", + "This example notebook is loosely based on the following upstream examples:\n", + "* [TFJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/tensorflow/image-classification/create-tfjob.ipynb)\n", + "* [PyTorchJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/pytorch/image-classification/create-pytorchjob.ipynb)\n", + "* [PaddleJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/paddlepaddle/simple-cpu.yaml)\n", "\n", - "- create training job of type: TFJob, PyTorchJob, and PaddleJob\n", + "Note that the above can get out of sync with the testing upstream actually does, so make sure to also check the [upstream E2E tests](https://github.com/kubeflow/training-operator/tree/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/sdk/python/test/e2e) when updating the notebook.\n", + "\n", + "The workflow for each job (TFJob, PyTorchJob, and PaddleJob) is:\n", + "- create training job\n", "- monitor its execution\n", "- get training logs\n", "- delete job" @@ -411,7 +417,7 @@ "source": [ 
"PYTORCHJOB_NAME = \"pytorch-dist-mnist-gloo\"\n", "PYTORCHJOB_CONTAINER = \"pytorch\"\n", - "PYTORCHJOB_IMAGE = \"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\"" + "PYTORCHJOB_IMAGE = \"kubeflow/pytorch-dist-mnist:v1-3a360ba\"" ] }, { @@ -644,7 +650,7 @@ "source": [ "PADDLEJOB_NAME = \"paddle-simple-cpu\"\n", "PADDLEJOB_CONTAINER = \"paddle\"\n", - "PADDLEJOB_IMAGE = \"registry.baidubce.com/paddlepaddle/paddle:2.4.0rc0-cpu\"" + "PADDLEJOB_IMAGE = \"docker.io/paddlepaddle/paddle:2.4.0rc0-cpu\"" ] }, { From de5e91cdc79963fc1d90153b8318919184a31615 Mon Sep 17 00:00:00 2001 From: Orfeas Kourkakis Date: Thu, 13 Jun 2024 10:32:30 +0300 Subject: [PATCH 2/2] review: update --- tests/notebooks/training/training-integration.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/notebooks/training/training-integration.ipynb b/tests/notebooks/training/training-integration.ipynb index fa29caa..207ede3 100644 --- a/tests/notebooks/training/training-integration.ipynb +++ b/tests/notebooks/training/training-integration.ipynb @@ -148,7 +148,7 @@ "source": [ "### Define a TFJob\n", "\n", - "Define a TFJob object before deploying it. This TFJob is similar to [this](https://github.com/kubeflow/training-operator/blob/master/sdk/python/examples/kubeflow-tfjob-sdk.ipynb) example." + "Define a TFJob object before deploying it." ] }, { @@ -417,7 +417,8 @@ "source": [ "PYTORCHJOB_NAME = \"pytorch-dist-mnist-gloo\"\n", "PYTORCHJOB_CONTAINER = \"pytorch\"\n", - "PYTORCHJOB_IMAGE = \"kubeflow/pytorch-dist-mnist:v1-3a360ba\"" + "PYTORCHJOB_IMAGE = \"kubeflow/pytorch-dist-mnist:v1-3a360ba\"\n", + "# The image above should be updated with each release with the latest available in the registry." ] }, { @@ -639,7 +640,7 @@ "source": [ "### Define a PaddleJob\n", "\n", - "Define a PaddleJob object before deploying it. 
This PaddleJob is loosely based on [this](https://github.com/kubeflow/training-operator/blob/11b7a115e6538caeab405344af98f0d5b42a4c96/examples/paddlepaddle/simple-cpu.yaml) example." + "Define a PaddleJob object before deploying it." ] }, {