diff --git a/config/config-defaults.yaml b/config/config-defaults.yaml index d2da8230343..3984c178463 100644 --- a/config/config-defaults.yaml +++ b/config/config-defaults.yaml @@ -87,6 +87,10 @@ data: # no default-resolver-type is specified by default default-resolver-type: + # default-imagepullbackoff-timeout contains the default number of minutes to wait + # before requeuing the TaskRun to retry + # default-imagepullbackoff-timeout: "5" + # default-container-resource-requirements allow users to update default resource requirements # to a init-containers and containers of a pods create by the controller # Onet: All the resource requirements are applied to init-containers and containers diff --git a/docs/additional-configs.md b/docs/additional-configs.md index c4b1c1b49c2..2523a9ff182 100644 --- a/docs/additional-configs.md +++ b/docs/additional-configs.md @@ -31,6 +31,7 @@ installation. - [Verify the transparency logs using `rekor-cli`](#verify-the-transparency-logs-using-rekor-cli) - [Verify Tekton Resources](#verify-tekton-resources) - [Pipelinerun with Affinity Assistant](#pipelineruns-with-affinity-assistant) + - [TaskRuns with `imagePullBackOff` Timeout](#taskruns-with-imagepullbackoff-timeout) - [Next steps](#next-steps) @@ -672,6 +673,26 @@ please take a look at [Trusted Resources](./trusted-resources.md). The cluster operators can review the [guidelines](developers/affinity-assistant.md) to `cordon` a node in the cluster with the tekton controller and the affinity assistant is enabled. +## TaskRuns with `imagePullBackOff` Timeout + +Tekton pipelines has adopted a fail fast strategy with a taskRun failing with `TaskRunImagePullFailed` in case of an +`imagePullBackOff`. This can be limited in some cases, and it generally depends on the infrastructure. To allow the +cluster operators to decide whether to wait in case of an `imagePullBackOff`, a setting is available to configure +the wait time in minutes such that the controller will wait for the specified duration before declaring a failure. +For example, with the following `config-defaults`, the controller does not mark the taskRun as failure for 5 minutes since +the pod is scheduled in case the image pull fails with `imagePullBackOff`. +See issue https://github.com/tektoncd/pipeline/issues/5987 for more details. + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-defaults + namespace: tekton-pipelines +data: + default-imagepullbackoff-timeout: "5" +``` + ## Next steps To get started with Tekton check the [Introductory tutorials][quickstarts], diff --git a/pkg/apis/config/default.go b/pkg/apis/config/default.go index 30ed52ae931..c52da0ba27f 100644 --- a/pkg/apis/config/default.go +++ b/pkg/apis/config/default.go @@ -49,6 +49,8 @@ const ( // default resource requirements, will be applied to all the containers, which has empty resource requirements ResourceRequirementDefaultContainerKey = "default" + DefaultImagePullBackOffTimeout = 0 * time.Minute + defaultTimeoutMinutesKey = "default-timeout-minutes" defaultServiceAccountKey = "default-service-account" defaultManagedByLabelValueKey = "default-managed-by-label-value" @@ -60,6 +62,7 @@ const ( defaultForbiddenEnv = "default-forbidden-env" defaultResolverTypeKey = "default-resolver-type" defaultContainerResourceRequirementsKey = "default-container-resource-requirements" + defaultImagePullBackOffTimeout = "default-imagepullbackoff-timeout" ) // DefaultConfig holds all the default configurations for the config. @@ -79,6 +82,7 @@ type Defaults struct { DefaultForbiddenEnv []string DefaultResolverType string DefaultContainerResourceRequirements map[string]corev1.ResourceRequirements + DefaultImagePullBackOffTimeout time.Duration } // GetDefaultsConfigName returns the name of the configmap containing all @@ -109,6 +113,7 @@ func (cfg *Defaults) Equals(other *Defaults) bool { other.DefaultTaskRunWorkspaceBinding == cfg.DefaultTaskRunWorkspaceBinding && other.DefaultMaxMatrixCombinationsCount == cfg.DefaultMaxMatrixCombinationsCount && other.DefaultResolverType == cfg.DefaultResolverType && + other.DefaultImagePullBackOffTimeout == cfg.DefaultImagePullBackOffTimeout && reflect.DeepEqual(other.DefaultForbiddenEnv, cfg.DefaultForbiddenEnv) } @@ -121,6 +126,7 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) { DefaultCloudEventsSink: DefaultCloudEventSinkValue, DefaultMaxMatrixCombinationsCount: DefaultMaxMatrixCombinationsCount, DefaultResolverType: DefaultResolverTypeValue, + DefaultImagePullBackOffTimeout: DefaultImagePullBackOffTimeout, } if defaultTimeoutMin, ok := cfgMap[defaultTimeoutMinutesKey]; ok { @@ -191,6 +197,14 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) { tc.DefaultContainerResourceRequirements = resourceRequirementsValue } + if defaultImagePullBackOff, ok := cfgMap[defaultImagePullBackOffTimeout]; ok { + timeout, err := time.ParseDuration(defaultImagePullBackOff) + if err != nil { + return nil, fmt.Errorf("failed parsing tracing config %q", defaultImagePullBackOffTimeout) + } + tc.DefaultImagePullBackOffTimeout = timeout + } + return &tc, nil } diff --git a/pkg/apis/config/default_test.go b/pkg/apis/config/default_test.go index c88691d72ca..de873f2f103 100644 --- a/pkg/apis/config/default_test.go +++ b/pkg/apis/config/default_test.go @@ -18,6 +18,7 @@ package config_test import ( "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/tektoncd/pipeline/pkg/apis/config" @@ -43,6 +44,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultManagedByLabelValue: "something-else", DefaultMaxMatrixCombinationsCount: 256, DefaultResolverType: "git", + DefaultImagePullBackOffTimeout: time.Duration(5) * time.Second, }, fileName: config.GetDefaultsConfigName(), }, @@ -62,12 +64,16 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { }, }, DefaultMaxMatrixCombinationsCount: 256, + DefaultImagePullBackOffTimeout: 0, }, fileName: "config-defaults-with-pod-template", }, { expectedError: true, fileName: "config-defaults-timeout-err", + }, { + expectedError: true, + fileName: "config-defaults-imagepullbackoff-timeout-err", }, // Previously the yaml package did not support UnmarshalStrict, though // it's supported now however it may introduce incompatibility, so we decide @@ -81,6 +87,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultManagedByLabelValue: config.DefaultManagedByLabelValue, DefaultPodTemplate: &pod.Template{}, DefaultMaxMatrixCombinationsCount: 256, + DefaultImagePullBackOffTimeout: 0, }, }, { @@ -92,6 +99,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultManagedByLabelValue: config.DefaultManagedByLabelValue, DefaultAAPodTemplate: &pod.AffinityAssistantTemplate{}, DefaultMaxMatrixCombinationsCount: 256, + DefaultImagePullBackOffTimeout: 0, }, }, { @@ -106,6 +114,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultTimeoutMinutes: 60, DefaultServiceAccount: "default", DefaultManagedByLabelValue: config.DefaultManagedByLabelValue, + DefaultImagePullBackOffTimeout: 0, }, }, { @@ -117,6 +126,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultMaxMatrixCombinationsCount: 256, DefaultManagedByLabelValue: "tekton-pipelines", DefaultForbiddenEnv: []string{"TEKTON_POWER_MODE", "TEST_ENV", "TEST_TEKTON"}, + DefaultImagePullBackOffTimeout: time.Duration(15) * time.Second, }, }, { @@ -128,6 +138,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultManagedByLabelValue: "tekton-pipelines", DefaultMaxMatrixCombinationsCount: 256, DefaultContainerResourceRequirements: map[string]corev1.ResourceRequirements{}, + DefaultImagePullBackOffTimeout: 0, }, }, { @@ -142,6 +153,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) { DefaultServiceAccount: "default", DefaultManagedByLabelValue: "tekton-pipelines", DefaultMaxMatrixCombinationsCount: 256, + DefaultImagePullBackOffTimeout: 0, DefaultContainerResourceRequirements: map[string]corev1.ResourceRequirements{ config.ResourceRequirementDefaultContainerKey: { Requests: corev1.ResourceList{ @@ -197,6 +209,7 @@ func TestNewDefaultsFromEmptyConfigMap(t *testing.T) { DefaultManagedByLabelValue: "tekton-pipelines", DefaultServiceAccount: "default", DefaultMaxMatrixCombinationsCount: 256, + DefaultImagePullBackOffTimeout: 0, } verifyConfigFileWithExpectedConfig(t, DefaultsConfigEmptyName, expectedConfig) } @@ -345,6 +358,24 @@ func TestEquals(t *testing.T) { DefaultForbiddenEnv: []string{"TEST_ENV", "TEKTON_POWER_MODE"}, }, expected: true, + }, { + name: "different default ImagePullBackOff timeout", + left: &config.Defaults{ + DefaultImagePullBackOffTimeout: 10, + }, + right: &config.Defaults{ + DefaultImagePullBackOffTimeout: 20, + }, + expected: false, + }, { + name: "same default ImagePullBackOff timeout", + left: &config.Defaults{ + DefaultImagePullBackOffTimeout: 20, + }, + right: &config.Defaults{ + DefaultImagePullBackOffTimeout: 20, + }, + expected: true, }, } diff --git a/pkg/apis/config/testdata/config-defaults-forbidden-env.yaml b/pkg/apis/config/testdata/config-defaults-forbidden-env.yaml index 9f61c21f6b5..c1fffc5e5f2 100644 --- a/pkg/apis/config/testdata/config-defaults-forbidden-env.yaml +++ b/pkg/apis/config/testdata/config-defaults-forbidden-env.yaml @@ -21,3 +21,4 @@ data: default-timeout-minutes: "50" default-service-account: "tekton" default-forbidden-env: "TEST_TEKTON, TEKTON_POWER_MODE,TEST_ENV,TEST_TEKTON" + default-imagepullbackoff-timeout: "15s" diff --git a/pkg/apis/config/testdata/config-defaults-imagepullbackoff-timeout-err.yaml b/pkg/apis/config/testdata/config-defaults-imagepullbackoff-timeout-err.yaml new file mode 100644 index 00000000000..4beea0fe4ba --- /dev/null +++ b/pkg/apis/config/testdata/config-defaults-imagepullbackoff-timeout-err.yaml @@ -0,0 +1,21 @@ +# Copyright 2019 The Tekton Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-defaults + namespace: tekton-pipelines +data: + default-imagepullbackoff-timeout: "not-a-timeout" \ No newline at end of file diff --git a/pkg/apis/config/testdata/config-defaults.yaml b/pkg/apis/config/testdata/config-defaults.yaml index d05c8e3cfc3..7f0b0a72f98 100644 --- a/pkg/apis/config/testdata/config-defaults.yaml +++ b/pkg/apis/config/testdata/config-defaults.yaml @@ -22,3 +22,4 @@ data: default-service-account: "tekton" default-managed-by-label-value: "something-else" default-resolver-type: "git" + default-imagepullbackoff-timeout: "5s" \ No newline at end of file diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 788052f1f2b..f2c5d1a8c6a 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -94,13 +94,15 @@ type Reconciler struct { tracerProvider trace.TracerProvider } +const ImagePullBackOff = "ImagePullBackOff" + var ( // Check that our Reconciler implements taskrunreconciler.Interface _ taskrunreconciler.Interface = (*Reconciler)(nil) // Pod failure reasons that trigger failure of the TaskRun podFailureReasons = map[string]struct{}{ - "ImagePullBackOff": {}, + ImagePullBackOff: {}, "InvalidImageName": {}, } ) @@ -171,7 +173,7 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon } // Check for Pod Failures - if failed, reason, message := c.checkPodFailed(tr); failed { + if failed, reason, message := c.checkPodFailed(ctx, tr); failed { err := c.failTaskRun(ctx, tr, reason, message) return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err) } @@ -222,10 +224,30 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon return nil } -func (c *Reconciler) checkPodFailed(tr *v1.TaskRun) (bool, v1.TaskRunReason, string) { +func (c *Reconciler) checkPodFailed(ctx context.Context, tr *v1.TaskRun) (bool, v1.TaskRunReason, string) { for _, step := range tr.Status.Steps { if step.Waiting != nil { if _, found := podFailureReasons[step.Waiting.Reason]; found { + if step.Waiting.Reason == ImagePullBackOff { + imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout + // only attempt to recover from the imagePullBackOff if specified + if imagePullBackOffTimeOut.Seconds() != 0 { + p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{}) + if err != nil { + message := fmt.Sprintf(`The step %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, step.Name, tr.Name, step.ImageID, err) + return true, v1.TaskRunReasonImagePullFailed, message + } + for _, condition := range p.Status.Conditions { + // check the pod condition to get the time when the pod was scheduled + // keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration + if condition.Type == corev1.PodScheduled { + if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut { + return false, "", "" + } + } + } + } + } image := step.ImageID message := fmt.Sprintf(`The step %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, step.Name, tr.Name, image, step.Waiting.Message) return true, v1.TaskRunReasonImagePullFailed, message @@ -235,6 +257,26 @@ func (c *Reconciler) checkPodFailed(tr *v1.TaskRun) (bool, v1.TaskRunReason, str for _, sidecar := range tr.Status.Sidecars { if sidecar.Waiting != nil { if _, found := podFailureReasons[sidecar.Waiting.Reason]; found { + if sidecar.Waiting.Reason == ImagePullBackOff { + imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout + // only attempt to recover from the imagePullBackOff if specified + if imagePullBackOffTimeOut.Seconds() != 0 { + p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{}) + if err != nil { + message := fmt.Sprintf(`The sidecar %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, sidecar.Name, tr.Name, sidecar.ImageID, err) + return true, v1.TaskRunReasonImagePullFailed, message + } + for _, condition := range p.Status.Conditions { + // check the pod condition to get the time when the pod was scheduled + // keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration + if condition.Type == corev1.PodScheduled { + if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut { + return false, "", "" + } + } + } + } + } image := sidecar.ImageID message := fmt.Sprintf(`The sidecar %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, sidecar.Name, tr.Name, image, sidecar.Waiting.Message) return true, v1.TaskRunReasonImagePullFailed, message diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index d6263abe9f4..0f265e5cf9f 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -2659,13 +2659,15 @@ status: } } -func TestReconcilePodFailuresImageFailures(t *testing.T) { +func TestReconcilePodFailures(t *testing.T) { var stepNumber int8 for _, tc := range []struct { - desc string - reason string - message string - failure string // "step" or "sidecar" + desc string + reason string + message string + failure string // "step" or "sidecar" + imagePullBackOffTimeout string + podNotFound bool }{{ desc: "image pull failed sidecar", reason: "ImagePullBackOff", @@ -2686,6 +2688,44 @@ func TestReconcilePodFailuresImageFailures(t *testing.T) { reason: "InvalidImageName", message: "Invalid image \"whatever\"", failure: "step", + }, { + desc: "image pull failure for the sidecar with non-zero imagePullBackOff timeout", + reason: "ImagePullBackOff", + message: "Back-off pulling image \"whatever\"", + failure: "sidecar", + imagePullBackOffTimeout: "5s", + }, { + desc: "image pull failure for the sidecar with non-zero imagePullBackOff timeout and no pod", + reason: "ImagePullBackOff", + message: "pods \"pod-1\" not found", + failure: "sidecar", + imagePullBackOffTimeout: "5m", + podNotFound: true, + }, { + desc: "invalid image sidecar with non-zero imagePullBackOff timeout", + reason: "InvalidImageName", + message: "Invalid image \"whatever\"", + failure: "sidecar", + imagePullBackOffTimeout: "5h", + }, { + desc: "image pull failure for the step with non-zero imagePullBackOff timeout", + reason: "ImagePullBackOff", + message: "Back-off pulling image \"whatever\"", + failure: "step", + imagePullBackOffTimeout: "5s", + }, { + desc: "image pull failure for the step with non-zero imagePullBackOff timeout and no pod", + reason: "ImagePullBackOff", + message: "pods \"pod-1\" not found", + failure: "step", + imagePullBackOffTimeout: "5m", + podNotFound: true, + }, { + desc: "invalid image step with non-zero imagePullBackOff timeout", + reason: "InvalidImageName", + message: "Invalid image \"whatever\"", + failure: "step", + imagePullBackOffTimeout: "5h", }} { t.Run(tc.desc, func(t *testing.T) { taskRun := parse.MustParseV1TaskRun(t, ` @@ -2700,6 +2740,7 @@ spec: steps: - image: alpine status: + podName: "pod-1" sidecars: - container: step-unnamed-0 name: unnamed-0 @@ -2751,28 +2792,76 @@ status: "Normal Started ", fmt.Sprintf(`Warning Failed The %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever". The pod errored with the message: "%s.`, tc.failure, stepNumber, tc.message), } + d := test.Data{ TaskRuns: []*v1.TaskRun{taskRun}, } + var timeout time.Duration + if tc.imagePullBackOffTimeout != "" { + timeout, _ = time.ParseDuration(tc.imagePullBackOffTimeout) + if timeout.Seconds() != 0 { + d.ConfigMaps = []*corev1.ConfigMap{ + { + ObjectMeta: metav1.ObjectMeta{Name: config.GetDefaultsConfigName(), Namespace: system.Namespace()}, + Data: map[string]string{ + "default-imagepullbackoff-timeout": tc.imagePullBackOffTimeout, + }, + }, + } + } + } + if !tc.podNotFound { + d.Pods = []*corev1.Pod{{ + ObjectMeta: metav1.ObjectMeta{Name: "pod-1", Namespace: "foo"}, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{{ + Type: corev1.PodScheduled, + LastTransitionTime: metav1.Time{Time: time.Now()}, + }}, + }, + }} + } testAssets, cancel := getTaskRunController(t, d) defer cancel() c := testAssets.Controller clients := testAssets.Clients - if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil { - t.Fatalf("Unexpected error when reconciling completed TaskRun : %v", err) - } - newTr, err := clients.Pipeline.TektonV1().TaskRuns(taskRun.Namespace).Get(testAssets.Ctx, taskRun.Name, metav1.GetOptions{}) - if err != nil { - t.Fatalf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", taskRun.Name, err) - } - condition := newTr.Status.GetCondition(apis.ConditionSucceeded) - if d := cmp.Diff(expectedStatus, condition, ignoreLastTransitionTime); d != "" { - t.Fatalf("Did not get expected condition %s", diff.PrintWantGot(d)) - } - err = k8sevent.CheckEventsOrdered(t, testAssets.Recorder.Events, taskRun.Name, wantEvents) - if err != nil { - t.Errorf(err.Error()) + // for a step or a sidecar, controller must continue and retry podCreation with non-zero imagePullBackOff timeout + if tc.reason == "ImagePullBackOff" && timeout.Seconds() != 0 && !tc.podNotFound { + err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)) + if err == nil { + t.Errorf("expected error when reconciling completed TaskRun : %v", err) + } + if isRequeueError, requeueDuration := controller.IsRequeueKey(err); !isRequeueError { + t.Errorf("Expected requeue error, but got: %s", err.Error()) + } else if requeueDuration < 0 { + t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String()) + } + _, err = clients.Pipeline.TektonV1().TaskRuns(taskRun.Namespace).Get(testAssets.Ctx, taskRun.Name, metav1.GetOptions{}) + if err != nil { + t.Errorf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", taskRun.Name, err) + } + } else { + if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil { + t.Fatalf("Unexpected error when reconciling completed TaskRun : %v", err) + } + newTr, err := clients.Pipeline.TektonV1().TaskRuns(taskRun.Namespace).Get(testAssets.Ctx, taskRun.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", taskRun.Name, err) + } + // the error message includes the error if the pod is not found + if tc.podNotFound { + expectedStatus.Message = fmt.Sprintf(`The %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever" and the pod with error: "%s."`, tc.failure, stepNumber, tc.message) + wantEvents[1] = fmt.Sprintf(`Warning Failed The %s "unnamed-%d" in TaskRun "test-imagepull-fail" failed to pull the image "whatever" and the pod with error: "%s.`, tc.failure, stepNumber, tc.message) + } + condition := newTr.Status.GetCondition(apis.ConditionSucceeded) + if d := cmp.Diff(expectedStatus, condition, ignoreLastTransitionTime); d != "" { + t.Fatalf("Did not get expected condition %s", diff.PrintWantGot(d)) + } + err = k8sevent.CheckEventsOrdered(t, testAssets.Recorder.Events, taskRun.Name, wantEvents) + if err != nil { + t.Errorf(err.Error()) + } } }) }