diff --git a/client/sm/client.go b/client/sm/client.go index f813da53..8b595a6b 100644 --- a/client/sm/client.go +++ b/client/sm/client.go @@ -70,10 +70,11 @@ type Client interface { } type ServiceManagerError struct { - ErrorType string `json:"error,omitempty"` - Description string `json:"description,omitempty"` - StatusCode int `json:"-"` - BrokerError *common.HTTPStatusCodeError `json:"broker_error,omitempty"` + ErrorType string `json:"error,omitempty"` + Description string `json:"description,omitempty"` + StatusCode int `json:"-"` + BrokerError *common.HTTPStatusCodeError `json:"broker_error,omitempty"` + ResponseHeaders http.Header `json:"-"` } func (e *ServiceManagerError) Error() string { @@ -590,7 +591,8 @@ func handleResponseError(response *http.Response) error { } smError := &ServiceManagerError{ - StatusCode: response.StatusCode, + StatusCode: response.StatusCode, + ResponseHeaders: response.Header, } _ = json.Unmarshal(body, &smError) diff --git a/controllers/servicebinding_controller.go b/controllers/servicebinding_controller.go index d2b5f7a3..fef2f309 100644 --- a/controllers/servicebinding_controller.go +++ b/controllers/servicebinding_controller.go @@ -262,7 +262,7 @@ func (r *ServiceBindingReconciler) createBinding(ctx context.Context, smClient s _, bindingParameters, err := utils.BuildSMRequestParameters(serviceBinding.Namespace, serviceBinding.Spec.ParametersFrom, serviceBinding.Spec.Parameters) if err != nil { log.Error(err, "failed to parse smBinding parameters") - return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.CREATE, err.Error(), serviceBinding) + return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.CREATE, err, serviceBinding) } smBinding, operationURL, bindErr := smClient.Bind(&smClientTypes.ServiceBinding{ @@ -278,13 +278,13 @@ func (r *ServiceBindingReconciler) createBinding(ctx context.Context, smClient s if bindErr != nil { log.Error(err, "failed to create service binding", "serviceInstanceID", serviceInstance.Status.InstanceID) - return utils.HandleError(ctx, r.Client, smClientTypes.CREATE, bindErr, serviceBinding, true) + return utils.HandleError(ctx, r.Client, smClientTypes.CREATE, bindErr, serviceBinding) } if operationURL != "" { var bindingID string if bindingID = sm.ExtractBindingID(operationURL); len(bindingID) == 0 { - return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.CREATE, fmt.Sprintf("failed to extract smBinding ID from operation URL %s", operationURL), serviceBinding) + return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.CREATE, fmt.Errorf("failed to extract smBinding ID from operation URL %s", operationURL), serviceBinding) } serviceBinding.Status.BindingID = bindingID @@ -360,8 +360,7 @@ func (r *ServiceBindingReconciler) delete(ctx context.Context, serviceBinding *v log.Info(fmt.Sprintf("Deleting binding with id %v from SM", serviceBinding.Status.BindingID)) operationURL, unbindErr := smClient.Unbind(serviceBinding.Status.BindingID, nil, utils.BuildUserInfo(ctx, serviceBinding.Spec.UserInfo)) if unbindErr != nil { - // delete will proceed anyway - return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.DELETE, unbindErr.Error(), serviceBinding) + return utils.HandleDeleteError(ctx, r.Client, unbindErr, serviceBinding) } if operationURL != "" { @@ -847,7 +846,7 @@ func (r *ServiceBindingReconciler) handleSecretError(ctx context.Context, op smC log := utils.GetLogger(ctx) log.Error(err, fmt.Sprintf("failed to store secret %s for binding %s", binding.Spec.SecretName, binding.Name)) if apierrors.ReasonForError(err) == metav1.StatusReasonUnknown { - return utils.MarkAsNonTransientError(ctx, r.Client, op, err.Error(), binding) + return utils.MarkAsNonTransientError(ctx, r.Client, op, err, binding) } return utils.MarkAsTransientError(ctx, r.Client, op, err, binding) } diff --git a/controllers/servicebinding_controller_test.go b/controllers/servicebinding_controller_test.go index 0d8d2a3c..1d1033e9 100644 --- a/controllers/servicebinding_controller_test.go +++ b/controllers/servicebinding_controller_test.go @@ -4,11 +4,10 @@ import ( "context" "encoding/json" "errors" - "net/http" - "strings" - "github.com/lithammer/dedent" authv1 "k8s.io/api/authentication/v1" + "net/http" + "strings" "github.com/SAP/sap-btp-service-operator/api/common" "github.com/SAP/sap-btp-service-operator/internal/utils" @@ -400,7 +399,7 @@ var _ = Describe("ServiceBinding controller", func() { }) }) - When("SM returned transient error(429)", func() { + When("SM returned transient error(429) without retry-after header", func() { BeforeEach(func() { errorMessage = "too many requests" fakeClient.BindReturnsOnCall(0, nil, "", &sm.ServiceManagerError{ diff --git a/controllers/serviceinstance_controller.go b/controllers/serviceinstance_controller.go index c5cea1b6..2cb1d798 100644 --- a/controllers/serviceinstance_controller.go +++ b/controllers/serviceinstance_controller.go @@ -176,7 +176,7 @@ func (r *ServiceInstanceReconciler) createInstance(ctx context.Context, smClient if err != nil { // if parameters are invalid there is nothing we can do, the user should fix it according to the error message in the condition log.Error(err, "failed to parse instance parameters") - return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.CREATE, err.Error(), serviceInstance) + return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.CREATE, err, serviceInstance) } provision, provisionErr := smClient.Provision(&smClientTypes.ServiceInstance{ @@ -193,7 +193,7 @@ func (r *ServiceInstanceReconciler) createInstance(ctx context.Context, smClient if provisionErr != nil { log.Error(provisionErr, "failed to create service instance", "serviceOfferingName", serviceInstance.Spec.ServiceOfferingName, "servicePlanName", serviceInstance.Spec.ServicePlanName) - return utils.HandleError(ctx, r.Client, smClientTypes.CREATE, provisionErr, serviceInstance, true) + return utils.HandleError(ctx, r.Client, smClientTypes.CREATE, provisionErr, serviceInstance) } serviceInstance.Status.InstanceID = provision.InstanceID @@ -231,7 +231,7 @@ func (r *ServiceInstanceReconciler) updateInstance(ctx context.Context, smClient _, instanceParameters, err := utils.BuildSMRequestParameters(serviceInstance.Namespace, serviceInstance.Spec.ParametersFrom, serviceInstance.Spec.Parameters) if err != nil { log.Error(err, "failed to parse instance parameters") - return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.UPDATE, fmt.Sprintf("failed to parse parameters: %v", err.Error()), serviceInstance) + return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.UPDATE, err, serviceInstance) } _, operationURL, err := smClient.UpdateInstance(serviceInstance.Status.InstanceID, &smClientTypes.ServiceInstance{ @@ -242,7 +242,7 @@ func (r *ServiceInstanceReconciler) updateInstance(ctx context.Context, smClient if err != nil { log.Error(err, fmt.Sprintf("failed to update service instance with ID %s", serviceInstance.Status.InstanceID)) - return utils.HandleError(ctx, r.Client, smClientTypes.UPDATE, err, serviceInstance, true) + return utils.HandleError(ctx, r.Client, smClientTypes.UPDATE, err, serviceInstance) } if operationURL != "" { @@ -296,7 +296,7 @@ func (r *ServiceInstanceReconciler) deleteInstance(ctx context.Context, serviceI operationURL, deprovisionErr := smClient.Deprovision(serviceInstance.Status.InstanceID, nil, utils.BuildUserInfo(ctx, serviceInstance.Spec.UserInfo)) if deprovisionErr != nil { // delete will proceed anyway - return utils.MarkAsNonTransientError(ctx, r.Client, smClientTypes.DELETE, deprovisionErr.Error(), serviceInstance) + return utils.HandleDeleteError(ctx, r.Client, deprovisionErr, serviceInstance) } if operationURL != "" { diff --git a/internal/utils/condition_utils.go b/internal/utils/condition_utils.go index ae43ec31..6d82cc40 100644 --- a/internal/utils/condition_utils.go +++ b/internal/utils/condition_utils.go @@ -3,10 +3,8 @@ package utils import ( "context" "fmt" - "net/http" "github.com/SAP/sap-btp-service-operator/api/common" - "github.com/SAP/sap-btp-service-operator/client/sm" smClientTypes "github.com/SAP/sap-btp-service-operator/client/sm/types" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -178,32 +176,21 @@ func SetFailureConditions(operationType smClientTypes.OperationCategory, errorMe object.SetConditions(conditions) } -func MarkAsNonTransientError(ctx context.Context, k8sClient client.Client, operationType smClientTypes.OperationCategory, errMsg string, object common.SAPBTPResource) (ctrl.Result, error) { +func MarkAsNonTransientError(ctx context.Context, k8sClient client.Client, operationType smClientTypes.OperationCategory, err error, object common.SAPBTPResource) (ctrl.Result, error) { log := GetLogger(ctx) + errMsg := err.Error() + log.Info(fmt.Sprintf("operation %s of %s encountered a non transient error %s, setting failure conditions", operationType, object.GetControllerName(), errMsg)) SetFailureConditions(operationType, errMsg, object) - if operationType != smClientTypes.DELETE { - log.Info(fmt.Sprintf("operation %s of %s encountered a non transient error %s, giving up operation :(", operationType, object.GetControllerName(), errMsg)) - } object.SetObservedGeneration(object.GetGeneration()) - err := UpdateStatus(ctx, k8sClient, object) - if err != nil { - return ctrl.Result{}, err - } - if operationType == smClientTypes.DELETE { - return ctrl.Result{}, fmt.Errorf(errMsg) - } - return ctrl.Result{}, nil + return ctrl.Result{}, UpdateStatus(ctx, k8sClient, object) } func MarkAsTransientError(ctx context.Context, k8sClient client.Client, operationType smClientTypes.OperationCategory, err error, object common.SAPBTPResource) (ctrl.Result, error) { log := GetLogger(ctx) - //DO NOT REMOVE - 429 error is not reflected to the status - if smError, ok := err.(*sm.ServiceManagerError); !ok || smError.StatusCode != http.StatusTooManyRequests { - SetInProgressConditions(ctx, operationType, err.Error(), object) - log.Info(fmt.Sprintf("operation %s of %s encountered a transient error %s, retrying operation :)", operationType, object.GetControllerName(), err.Error())) - if updateErr := UpdateStatus(ctx, k8sClient, object); updateErr != nil { - return ctrl.Result{}, updateErr - } + log.Info(fmt.Sprintf("operation %s of %s encountered a transient error %s, retrying operation :)", operationType, object.GetControllerName(), err.Error())) + SetInProgressConditions(ctx, operationType, err.Error(), object) + if updateErr := UpdateStatus(ctx, k8sClient, object); updateErr != nil { + return ctrl.Result{}, updateErr } return ctrl.Result{}, err diff --git a/internal/utils/condition_utils_test.go b/internal/utils/condition_utils_test.go index b11c0de6..a044eaf7 100644 --- a/internal/utils/condition_utils_test.go +++ b/internal/utils/condition_utils_test.go @@ -1,6 +1,7 @@ package utils import ( + "fmt" "net/http" "github.com/SAP/sap-btp-service-operator/api/common" @@ -168,9 +169,8 @@ var _ = Describe("Condition Utils", func() { Context("MarkAsNonTransientError", func() { It("should mark as non-transient error and update status", func() { operationType := smClientTypes.CREATE - errorMessage := "Non-transient error" - result, err := MarkAsNonTransientError(ctx, k8sClient, operationType, errorMessage, resource) + result, err := MarkAsNonTransientError(ctx, k8sClient, operationType, fmt.Errorf("Non-transient error"), resource) Expect(err).ToNot(HaveOccurred()) Expect(result).To(Equal(ctrl.Result{})) }) diff --git a/internal/utils/controller_util.go b/internal/utils/controller_util.go index 83e5381d..d368ae0e 100644 --- a/internal/utils/controller_util.go +++ b/internal/utils/controller_util.go @@ -7,6 +7,7 @@ import ( "fmt" "net/http" "strings" + "time" "github.com/SAP/sap-btp-service-operator/api/common" "github.com/SAP/sap-btp-service-operator/client/sm" @@ -128,19 +129,53 @@ func GetLogger(ctx context.Context) logr.Logger { return ctx.Value(LogKey{}).(logr.Logger) } -func HandleError(ctx context.Context, k8sClient client.Client, operationType smClientTypes.OperationCategory, err error, resource common.SAPBTPResource, ignoreNonTransient bool) (ctrl.Result, error) { +func HandleError(ctx context.Context, k8sClient client.Client, operationType smClientTypes.OperationCategory, err error, resource common.SAPBTPResource) (ctrl.Result, error) { log := GetLogger(ctx) var smError *sm.ServiceManagerError - ok := errors.As(err, &smError) - if !ok { - log.Info("unable to cast error to SM error, will be treated as non transient") - return MarkAsNonTransientError(ctx, k8sClient, operationType, err.Error(), resource) - } - if ignoreNonTransient || IsTransientError(smError, log) { + if ok := errors.As(err, &smError); ok { + if smError.StatusCode == http.StatusTooManyRequests { + log.Info(fmt.Sprintf("SM returned 429 (%s), requeueing...", smError.Error())) + return handleRateLimitError(smError, log) + } + + log.Info(fmt.Sprintf("SM returned error: %s", smError.Error())) return MarkAsTransientError(ctx, k8sClient, operationType, smError, resource) } - return MarkAsNonTransientError(ctx, k8sClient, operationType, smError.Error(), resource) + log.Info(fmt.Sprintf("unable to cast error to SM error, will be treated as non transient. (error: %v)", err)) + return MarkAsNonTransientError(ctx, k8sClient, operationType, err, resource) +} + +func handleRateLimitError(smError *sm.ServiceManagerError, log logr.Logger) (ctrl.Result, error) { + retryAfterStr := smError.ResponseHeaders.Get("Retry-After") + if len(retryAfterStr) > 0 { + log.Info(fmt.Sprintf("SM returned 429 with Retry-After: %s, requeueing after it...", retryAfterStr)) + retryAfter, err := time.Parse(time.DateTime, retryAfterStr[:len(time.DateTime)]) // format 2024-11-11 14:59:33 +0000 UTC + if err != nil { + log.Error(err, "failed to parse Retry-After header, using default requeue time") + } else { + timeToRequeue := time.Until(retryAfter) + log.Info(fmt.Sprintf("requeueing after %d minutes, %d seconds", int(timeToRequeue.Minutes()), int(timeToRequeue.Seconds())%60)) + return ctrl.Result{RequeueAfter: timeToRequeue}, nil + } + } + + return ctrl.Result{Requeue: true}, nil +} + +func HandleDeleteError(ctx context.Context, k8sClient client.Client, err error, object common.SAPBTPResource) (ctrl.Result, error) { + log := GetLogger(ctx) + log.Info(fmt.Sprintf("handling delete error: %v", err)) + var smError *sm.ServiceManagerError + if errors.As(err, &smError) && smError.StatusCode == http.StatusTooManyRequests { + return handleRateLimitError(smError, log) + } + + if _, updateErr := MarkAsNonTransientError(ctx, k8sClient, smClientTypes.DELETE, err, object); updateErr != nil { + log.Error(updateErr, "failed to update resource status") + return ctrl.Result{}, updateErr + } + return ctrl.Result{}, err } func IsTransientError(smError *sm.ServiceManagerError, log logr.Logger) bool {