Skip to content

Commit

Permalink
[FEATURE] - Job Retries (#1193)
Browse files Browse the repository at this point in the history
[FEATURE] - Job Retries

Currently the controller will retry a failed terraform job multiple times; this PR allows the administrator to choose the number of times they are willing to retry a job before deeming it a failure.
  • Loading branch information
gambol99 authored Jan 18, 2024
1 parent 6bbeb08 commit 7f883e5
Show file tree
Hide file tree
Showing 9 changed files with 16 additions and 2 deletions.
1 change: 1 addition & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ func main() {

flags := cmd.Flags()
flags.Bool("verbose", false, "Enable verbose logging")
flags.IntVar(&config.BackoffLimit, "backoff-limit", 1, "The number of times we are willing to allow a terraform job to error before marking as a failure")
flags.BoolVar(&config.EnableContextInjection, "enable-context-injection", false, "Indicates the controller should inject Configuration context into the terraform variables")
flags.BoolVar(&config.EnableNamespaceProtection, "enable-namespace-protection", false, "Indicates the controller should protect the controller namespace from being deleted")
flags.BoolVar(&config.EnableRevisionUpdateProtection, "enable-revision-update-protection", false, "Indicates we should protect the revisions in use from being updated")
Expand Down
2 changes: 1 addition & 1 deletion pkg/assets/job.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{ $key }}: "{{ $value }}"
{{- end }}
spec:
backoffLimit: 2
backoffLimit: {{ default 1 .BackoffLimit }}
completions: 1
parallelism: 1
# retain the jobs for 6 hours
Expand Down
3 changes: 3 additions & 0 deletions pkg/controller/configuration/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ type Controller struct {
// BackendTemplate is the name of the secret in the controller namespace which holds a
// template used to generate the state backend
BackendTemplate string
// BackoffLimit is the number of times we allow a job to fail before deeming
// it a failure
BackoffLimit int
// EnableContextInjection enables the injection of the context into the terraform configuration
// variables. This means we shall inject an number of default variables into the configuration
// such as namespace, name and labels
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/configuration/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func (c *Controller) ensureTerraformDestroy(configuration *terraformv1alpha1.Con
map[string]string{
terraformv1alpha1.RetryAnnotation: configuration.GetAnnotations()[terraformv1alpha1.RetryAnnotation],
}),
BackoffLimit: c.BackoffLimit,
EnableInfraCosts: c.EnableInfracosts,
ExecutorImage: c.ExecutorImage,
ExecutorSecrets: c.ExecutorSecrets,
Expand Down
2 changes: 2 additions & 0 deletions pkg/controller/configuration/ensure.go
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,7 @@ func (c *Controller) ensureTerraformPlan(configuration *terraformv1alpha1.Config
terraformv1alpha1.DriftAnnotation: configuration.GetAnnotations()[terraformv1alpha1.DriftAnnotation],
terraformv1alpha1.RetryAnnotation: configuration.GetAnnotations()[terraformv1alpha1.RetryAnnotation],
}),
BackoffLimit: c.BackoffLimit,
EnableInfraCosts: c.EnableInfracosts,
ExecutorImage: c.ExecutorImage,
ExecutorSecrets: c.ExecutorSecrets,
Expand Down Expand Up @@ -1149,6 +1150,7 @@ func (c *Controller) ensureTerraformApply(configuration *terraformv1alpha1.Confi
state.provider.JobLabels(),
configuration.GetLabels(),
),
BackoffLimit: c.BackoffLimit,
EnableInfraCosts: c.EnableInfracosts,
ExecutorImage: c.ExecutorImage,
ExecutorSecrets: c.ExecutorSecrets,
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/configuration/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func makeFakeController(cc client.Client) *Controller {
kc: kfake.NewSimpleClientset(),
cache: cache.New(5*time.Minute, 10*time.Minute),
recorder: recorder,
BackoffLimit: 2,
EnableInfracosts: false,
EnableWatchers: true,
ExecutorImage: "ghcr.io/appvia/terranetes-executor",
Expand Down
3 changes: 2 additions & 1 deletion pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ func New(cfg *rest.Config, config Config) (*Server, error) {
// @step: ensure the configuration controller is enabled
if err := (&configuration.Controller{
BackendTemplate: config.BackendTemplate,
BackoffLimit: config.BackoffLimit,
ControllerJobLabels: jobLabels,
ControllerNamespace: config.Namespace,
EnableInfracosts: (config.InfracostsSecretName != ""),
EnableTerraformVersions: config.EnableTerraformVersions,
Expand All @@ -181,7 +183,6 @@ func New(cfg *rest.Config, config Config) (*Server, error) {
InfracostsImage: config.InfracostsImage,
InfracostsSecretName: config.InfracostsSecretName,
JobTemplate: config.JobTemplate,
ControllerJobLabels: jobLabels,
PolicyImage: config.PolicyImage,
TerraformImage: config.TerraformImage,
}).Add(mgr); err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/server/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ type Config struct {
// contains an optional template to use for the backend state - unless this
// is set we use the default backend state i.e. kubernetes state
BackendTemplate string
// BackoffLimit is the number of times we are willing to allow a job to fail
BackoffLimit int
// DriftControllerInterval is the interval for the controller to check for drift
DriftControllerInterval time.Duration
// DriftInterval is the minimum interval between drift checks
Expand Down
3 changes: 3 additions & 0 deletions pkg/utils/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ type Options struct {
AdditionalJobSecrets []string
// AdditionalJobLabels are additional labels added to the job
AdditionalJobLabels map[string]string
// BackoffLimit is the number of times we are willing to allow a job to fail
// before we give up
BackoffLimit int
// EnableInfraCosts is the flag to enable cost analysis
EnableInfraCosts bool
// ExecutorImage is the image to use for the terraform jobs
Expand Down

0 comments on commit 7f883e5

Please sign in to comment.