From a391da248aed62f883aeaa2462b7033c071e3642 Mon Sep 17 00:00:00 2001 From: Eran Ifrach Date: Sun, 29 Sep 2024 12:43:37 +0300 Subject: [PATCH] MGMT-18628: fix better waiting for deployment --- deploy/operator/capi/deploy_capi_cluster.sh | 6 +- .../hypershift/deploy_hypershift_cluster.sh | 12 +-- deploy/operator/setup_assisted_operator.sh | 2 +- deploy/operator/setup_hive.sh | 2 +- deploy/operator/utils.sh | 83 ++++++++++++------- deploy/operator/ztp/deploy_spoke_cluster.sh | 6 +- 6 files changed, 65 insertions(+), 46 deletions(-) diff --git a/deploy/operator/capi/deploy_capi_cluster.sh b/deploy/operator/capi/deploy_capi_cluster.sh index a18ec385e74..b6599b671bd 100755 --- a/deploy/operator/capi/deploy_capi_cluster.sh +++ b/deploy/operator/capi/deploy_capi_cluster.sh @@ -166,7 +166,7 @@ for manifest in $(find ${__dir}/generated -type f); do tee < "${manifest}" >(oc apply -f -) done -wait_for_condition "infraenv/${ASSISTED_INFRAENV_NAME}" "ImageCreated" "5m" "${SPOKE_NAMESPACE}" +wait_for_condition "infraenv/${ASSISTED_INFRAENV_NAME}" "condition=ImageCreated" "5m" "${SPOKE_NAMESPACE}" echo "Waiting until at least ${SPOKE_CONTROLPLANE_AGENTS} agents are available..." @@ -233,8 +233,8 @@ hypershift_cli hypershift create cluster agent --name $ASSISTED_CLUSTER_NAME --b # Wait for a hypershift hostedcontrolplane to report ready status wait_for_resource "hostedcontrolplane/${ASSISTED_CLUSTER_NAME}" "${SPOKE_NAMESPACE}-${ASSISTED_CLUSTER_NAME}" wait_for_boolean_field "hostedcontrolplane/${ASSISTED_CLUSTER_NAME}" status.ready "${SPOKE_NAMESPACE}-${ASSISTED_CLUSTER_NAME}" -wait_for_condition "nodepool/$ASSISTED_CLUSTER_NAME" "Ready" "10m" "$SPOKE_NAMESPACE" -wait_for_condition "hostedcluster/$ASSISTED_CLUSTER_NAME" "Available" "10m" "$SPOKE_NAMESPACE" +wait_for_condition "nodepool/$ASSISTED_CLUSTER_NAME" "condition=Ready" "10m" "$SPOKE_NAMESPACE" +wait_for_condition "hostedcluster/$ASSISTED_CLUSTER_NAME" "condition=Available" "10m" "$SPOKE_NAMESPACE" # Scale up echo "Scaling the hosted cluster up to contain ${SPOKE_CONTROLPLANE_AGENTS} worker nodes" diff --git a/deploy/operator/hypershift/deploy_hypershift_cluster.sh b/deploy/operator/hypershift/deploy_hypershift_cluster.sh index 2db00f7af57..8e4d425d102 100755 --- a/deploy/operator/hypershift/deploy_hypershift_cluster.sh +++ b/deploy/operator/hypershift/deploy_hypershift_cluster.sh @@ -62,8 +62,8 @@ oc get hostedcluster ${ASSISTED_CLUSTER_NAME} -n ${HYPERSHIFT_AGENT_NS} || \ echo "Wait for a running hypershift cluster with no worker nodes" wait_for_pods "$SPOKE_NAMESPACE" -wait_for_condition "nodepool/$ASSISTED_CLUSTER_NAME" "Ready" "10m" "$HYPERSHIFT_AGENT_NS" -wait_for_condition "hostedcluster/$ASSISTED_CLUSTER_NAME" "Available" "10m" "$HYPERSHIFT_AGENT_NS" +wait_for_condition "nodepool/$ASSISTED_CLUSTER_NAME" "condition=Ready" "10m" "$HYPERSHIFT_AGENT_NS" +wait_for_condition "hostedcluster/$ASSISTED_CLUSTER_NAME" "condition=Available" "10m" "$HYPERSHIFT_AGENT_NS" echo "Extract spoke kubeconfig" oc extract -n $HYPERSHIFT_AGENT_NS secret/$ASSISTED_CLUSTER_NAME-admin-kubeconfig --to=- > /tmp/$ASSISTED_CLUSTER_NAME-kubeconfig @@ -79,7 +79,7 @@ oc --kubeconfig $SPOKE_KUBECONFIG apply -f ${__root}/hack/crds echo "Apply HypershiftAgentServiceConfig on hub" ansible-playbook "${playbooks_dir}/hasc-playbook.yaml" oc apply -f ${playbooks_dir}/generated/hasc.yaml -n $SPOKE_NAMESPACE -wait_for_condition "hypershiftagentserviceconfigs/hypershift-agent" "DeploymentsHealthy" "20m" "$SPOKE_NAMESPACE" +wait_for_condition "hypershiftagentserviceconfigs/hypershift-agent" "condition=DeploymentsHealthy" "20m" "$SPOKE_NAMESPACE" echo "Create assisted secrets" oc --kubeconfig $SPOKE_KUBECONFIG get secret "${ASSISTED_PULLSECRET_NAME}" -n "${SPOKE_NAMESPACE}" || \ @@ -95,7 +95,7 @@ oc --kubeconfig $SPOKE_KUBECONFIG apply -f ${playbooks_dir}/generated/agentClust oc --kubeconfig $SPOKE_KUBECONFIG apply -f ${playbooks_dir}/generated/infraEnv.yaml -n $SPOKE_NAMESPACE echo "Wait for InfraEnv ImageCreated" -KUBECONFIG=$SPOKE_KUBECONFIG wait_for_condition "infraenv/${ASSISTED_INFRAENV_NAME}" "ImageCreated" "5m" "${SPOKE_NAMESPACE}" +KUBECONFIG=$SPOKE_KUBECONFIG wait_for_condition "infraenv/${ASSISTED_INFRAENV_NAME}" "condition=ImageCreated" "5m" "${SPOKE_NAMESPACE}" export ISO_DOWNLOAD_URL=$(oc get --kubeconfig $SPOKE_KUBECONFIG -n $SPOKE_NAMESPACE infraenv $ASSISTED_INFRAENV_NAME -o jsonpath='{.status.isoDownloadURL}') echo "Apply BareMetalHost on hub" @@ -113,10 +113,10 @@ oc --kubeconfig $SPOKE_KUBECONFIG -n $SPOKE_NAMESPACE patch agent $agent_name -p echo "Waiting until cluster is installed" -KUBECONFIG=$SPOKE_KUBECONFIG wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "Stopped" "90m" "${SPOKE_NAMESPACE}" +KUBECONFIG=$SPOKE_KUBECONFIG wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "condition=Stopped" "90m" "${SPOKE_NAMESPACE}" echo "Cluster installation has been stopped (either for good or bad reasons)" -KUBECONFIG=$SPOKE_KUBECONFIG wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "Completed" "90m" "${SPOKE_NAMESPACE}" +KUBECONFIG=$SPOKE_KUBECONFIG wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "condition=Completed" "90m" "${SPOKE_NAMESPACE}" echo "Cluster has been installed successfully!" diff --git a/deploy/operator/setup_assisted_operator.sh b/deploy/operator/setup_assisted_operator.sh index 37018a9a159..323e249fc14 100755 --- a/deploy/operator/setup_assisted_operator.sh +++ b/deploy/operator/setup_assisted_operator.sh @@ -215,7 +215,7 @@ EOCR oc patch -n ${ASSISTED_NAMESPACE} agentserviceconfig agent --type merge -p '{"spec":{"osImages":'"${OS_IMAGES_CAMELCASE}"'}}' wait_for_operator "assisted-service-operator" "${ASSISTED_NAMESPACE}" - wait_for_condition "agentserviceconfigs/agent" "ReconcileCompleted" "5m" + wait_for_condition "agentserviceconfigs/agent" "condition=ReconcileCompleted" "5m" wait_for_deployment "assisted-service" "${ASSISTED_NAMESPACE}" "5m" wait_for_pod "assisted-image-service" "${ASSISTED_NAMESPACE}" "app=assisted-image-service" diff --git a/deploy/operator/setup_hive.sh b/deploy/operator/setup_hive.sh index 9240f0b4c7a..358efb6fffa 100755 --- a/deploy/operator/setup_hive.sh +++ b/deploy/operator/setup_hive.sh @@ -112,7 +112,7 @@ spec: featureSet: Custom EOF - wait_for_condition "hiveconfig.hive.openshift.io/hive" "Ready" "10m" + wait_for_condition "hiveconfig.hive.openshift.io/hive" "condition=Ready" "10m" } if [ -z "$@" ] || ! declare -F "$@"; then diff --git a/deploy/operator/utils.sh b/deploy/operator/utils.sh index 2a783525d58..8333b26f228 100644 --- a/deploy/operator/utils.sh +++ b/deploy/operator/utils.sh @@ -6,7 +6,7 @@ function wait_for_crd() { crd="$1" namespace="${2:-}" - wait_for_condition "crd/${crd}" "Established" "60s" "${namespace}" + wait_for_condition "crd/${crd}" "condition=Established" "60s" "${namespace}" } function remote_agents() { @@ -26,41 +26,40 @@ function installed_remote_agents() { export -f installed_remote_agents function wait_for_operator() { - subscription="$1" - namespace="${2:-}" - echo "Waiting for operator ${subscription} to get installed on namespace ${namespace}..." - - for _ in $(seq 1 60); do - csv=$(oc -n "${namespace}" get subscription "${subscription}" -o jsonpath='{.status.installedCSV}' || true) - if [[ -n "${csv}" ]]; then - if [[ "$(oc -n "${namespace}" get csv "${csv}" -o jsonpath='{.status.phase}')" == "Succeeded" ]]; then - echo "ClusterServiceVersion (${csv}) is ready" - return 0 - fi - fi - - sleep 10 - done - - echo "Timed out waiting for csv to become ready!" - return 1 -} + subscription="$1" + namespace="${2:-}" + + wait_for_condition "subscriptions.operators.coreos.com/${subscription}" jsonpath='{..status.state}'=AtLatestKnown "30s" "${namespace}" + + csv=$(oc get subscriptions.operators.coreos.com/${subscription} --namespace=${namespace} -o jsonpath='{..status.installedCSV}') + echo "Waiting for CSV ${csv} installation" + if ! [[ $(oc wait "clusterserviceversions.operators.coreos.com/${csv}" --namespace=${namespace} --for=jsonpath='{.status.phase}'="Succeeded" --timeout=30s) ]] + then + echo "ERROR: CSV installation has failed" + oc get "clusterserviceversions.operators.coreos.com/${csv}" --namespace=${namespace} -o json + exit 1 + fi + } function wait_for_pod() { pod="$1" namespace="${2:-}" selector="${3:-}" - wait_for_condition "pod" "Ready" "30m" "${namespace}" "${selector}" + wait_for_condition "pod" "condition=Ready" "30m" "${namespace}" "${selector}" } function wait_for_pods(){ - while [[ $(oc get pods -n $1 -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}'| tr ' ' '\n' | sort -u) != "True" ]]; do - echo "Waiting for pods in namespace $1 to be ready" - oc get pods -n $1 -o 'jsonpath={..status.containerStatuses}' | jq "." - sleep 5; - done - echo "Pods in namespace $1 are ready" + namespace=$1 + + if [[ $(oc wait --namespace "${namespace}" --all --for=condition=Ready pod --timeout 1m) ]]; then + echo "All Pods in namespace ${namespace} are ready"} + else + echo "ERROR: Failed waiting for pods" + # debug output + oc get pods --namespace "${namespace}" + exit 1 + fi } function wait_for_deployment() { @@ -70,11 +69,22 @@ function wait_for_deployment() { echo "Waiting for (deployment) on namespace (${namespace}) with name (${deployment}) to be created..." for i in {1..40}; do - oc get deployment "${deployment}" --namespace="${namespace}" |& grep -ivE "(no resources found|not found)" && break || sleep 10 + oc get deployments.apps "${deployment}" --namespace="${namespace}" |& grep -ivE "(no resources found|not found)" && break || sleep 10 done + if [ $i -eq 40 ]; then + echo "ERROR: failed Waiting for (deployment) on namespace (${namespace}) with name (${deployment}) to be created..." + exit 1 + fi echo "Waiting for (deployment) on namespace (${namespace}) with name (${deployment}) to rollout..." - oc rollout status "deploy/${deployment}" -n "${namespace}" --timeout="${timeout}" + REPLICAS=$(oc get deployments.apps --namespace="${namespace}" "${deployment}" -o jsonpath='{..status.replicas}') + if ! [[ $(oc --namespace="${namespace}" wait --for=jsonpath='{..status.availableReplicas}'="${REPLICAS}" --timeout=5m "deployments.apps/${deployment}") ]]; + then + echo "ERROR: Deployment failed" + oc get --namespace="${namespace}" "deployments.apps/${deployment}" -o json + exit 1 + fi + } function hash() { @@ -97,13 +107,22 @@ function wait_for_condition() { namespace="${4:-}" selector="${5:-}" + counter=1 echo "Waiting for (${object}) on namespace (${namespace}) with labels (${selector}) to be created..." - for i in {1..40}; do - oc get ${object} --selector="${selector}" --namespace=${namespace} |& grep -ivE "(no resources found|not found)" && break || sleep 10 + until [[ $(oc get ${object} --selector="${selector}" --namespace="${namespace}" 2> /dev/null ) ]] + do + if [[ "${counter}" -eq 30 ]]; + then + echo "ERROR: failed Waiting for (${object}) on namespace (${namespace}) with labels (${selector}) to become (${condition})..." + oc get ${object} --selector="${selector}" --namespace="${namespace}" -o json + exit 1 + break + fi + ((counter++)) && sleep 2 done echo "Waiting for (${object}) on namespace (${namespace}) with labels (${selector}) to become (${condition})..." - oc wait -n "${namespace}" --for=condition=${condition} --selector "${selector}" ${object} --timeout=${timeout} + oc wait -n "${namespace}" --all --for=${condition} ${object} --timeout=${timeout} --selector "${selector}" } function wait_for_object_amount() { diff --git a/deploy/operator/ztp/deploy_spoke_cluster.sh b/deploy/operator/ztp/deploy_spoke_cluster.sh index 9394bc723c8..3221db695bd 100755 --- a/deploy/operator/ztp/deploy_spoke_cluster.sh +++ b/deploy/operator/ztp/deploy_spoke_cluster.sh @@ -99,7 +99,7 @@ for manifest in $(find ${__dir}/generated -type f); do oc apply -f "${manifest}" done -wait_for_condition "infraenv/${ASSISTED_INFRAENV_NAME}" "ImageCreated" "5m" "${SPOKE_NAMESPACE}" +wait_for_condition "infraenv/${ASSISTED_INFRAENV_NAME}" "condition=ImageCreated" "5m" "${SPOKE_NAMESPACE}" echo "Waiting until at least ${SPOKE_CONTROLPLANE_AGENTS} agents are available..." @@ -130,10 +130,10 @@ if [ ${SPOKE_CONTROLPLANE_AGENTS} -ne 1 ] && [ "${USER_MANAGED_NETWORKING}" == " fi fi -wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "Stopped" "90m" "${SPOKE_NAMESPACE}" +wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "condition=Stopped" "90m" "${SPOKE_NAMESPACE}" echo "Cluster installation has been stopped (either for good or bad reasons)" -wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "Completed" "1m" "${SPOKE_NAMESPACE}" +wait_for_condition "agentclusterinstall/${ASSISTED_AGENT_CLUSTER_INSTALL_NAME}" "condition=Completed" "1m" "${SPOKE_NAMESPACE}" echo "Cluster has been installed successfully!" wait_for_boolean_field "clusterdeployment/${ASSISTED_CLUSTER_DEPLOYMENT_NAME}" spec.installed "${SPOKE_NAMESPACE}"