From 561a27fb939b486c8ea4739950b036d5906a8158 Mon Sep 17 00:00:00 2001 From: Brian Katyl Date: Tue, 8 Oct 2024 18:35:08 -0600 Subject: [PATCH] Scripts to do offline removal of gvnic driver and installation of known good version (#2375) --- gce_recovery_scripts/README.md | 60 ++++++++ .../recover_windows_instance.sh | 131 ++++++++++++++++++ gce_recovery_scripts/reinstall-gvnic-gq.ps1 | 23 +++ 3 files changed, 214 insertions(+) create mode 100644 gce_recovery_scripts/README.md create mode 100755 gce_recovery_scripts/recover_windows_instance.sh create mode 100644 gce_recovery_scripts/reinstall-gvnic-gq.ps1 diff --git a/gce_recovery_scripts/README.md b/gce_recovery_scripts/README.md new file mode 100644 index 000000000..d52f5bd34 --- /dev/null +++ b/gce_recovery_scripts/README.md @@ -0,0 +1,60 @@ +# GCE Recovery Scripts + +The following are scripts that utilize gcloud to automate the recovery of an instance that is not able to successfully boot. + +## How to use the recovery scripts using Cloud Shell + +1. Open a new Cloud Shell terminal. https://cloud.google.com/shell/docs/using-cloud-shell#start_a_new_session +2. git clone https://github.com/GoogleCloudPlatform/compute-image-tools.git +3. cd ~/compute-image-tools/gce_recovery_scripts +4. Run ./recover_windows_instance.sh with the needed parameters. + +## Windows instance recovery + +`recover_windows_instance.sh` automates the creation of a rescue Windows instance that is used to modify the contents of the specified instance's boot disk using a startup script. + +### Usage + + The `recover_windows_instance.sh` script supports the following parameters: + + 1. Required `instance`: The name of the Windows instance that has the boot disk that needs to be modified. + 2. Required `zone`: The zone to use for compute resources. + 3. Required `project`: The name of the project the Compute resources exist in. + 4. 
Required `script`: The PowerShell script that will be run in the rescue instance to modify the contents of the specified instance's boot disk. + 5. Optional `label`: If the instance has this label the script will abort. On completion of the modification this label will be added to the instance. + + example: ./recover_windows_instance.sh instance1 us-central1-c my-gcp-project remediationscript.ps1 + example: ./recover_windows_instance.sh instance1 us-central1-c my-gcp-project remediationscript.ps1 remediationprojectlabel + +### recover_windows_instance.sh does the following: + +1. Validates the required input parameters of the script are present. +2. Verifies the instance exists. + - Exits if the instance is not present. +3. Checks that the instance does not have the optionally specified label. + - Exits if the instance has the specified label. +4. Obtains the boot disk for the instance. +5. Stops the instance if it is not already TERMINATED. +6. Detaches the boot disk from the instance. +7. Creates a rescue Windows instance, mounting the original boot disk as a data disk (D:), and specifying the provided script as the startup script (windows-startup-script-ps1). +8. Waits 300 seconds for the rescue instance to boot, run the rescue script, and shut down. +9. Verifies the rescue Windows instance is stopped. +10. Detaches the data disk from the rescue Windows instance. +11. Re-attaches the boot disk to the original instance. +12. Deletes the rescue Windows instance. +13. If a label has been specified, sets that label on the instance. + + +## Windows GVNIC Driver Recovery + +`reinstall-gvnic-gq.ps1` does the following: + +1. Downloads the known good version of the gVNIC Windows driver from gs://gce-windows-drivers-public/release/gvnic-gq +2. Removes all versions of the gVNIC Windows driver from the Windows installed on the D:\ drive. +3. Installs the known good gVNIC 1.0.x version of the driver to the Windows installed on the D:\ drive. +4. Shuts down the rescue instance. 
+
+`reinstall-gvnic-gq.ps1` can be used as the `recover_windows_instance.sh` remediation script.
+
+example: ./recover_windows_instance.sh instance1 us-central1-c my-gcp-project reinstall-gvnic-gq.ps1
+example: ./recover_windows_instance.sh instance1 us-central1-c my-gcp-project reinstall-gvnic-gq.ps1 gvnic-remediation
\ No newline at end of file
diff --git a/gce_recovery_scripts/recover_windows_instance.sh b/gce_recovery_scripts/recover_windows_instance.sh
new file mode 100755
index 000000000..214e94ebd
--- /dev/null
+++ b/gce_recovery_scripts/recover_windows_instance.sh
@@ -0,0 +1,211 @@
+#!/usr/bin/env bash
+#
+# Modifies the contents of a Windows boot disk offline: the disk is detached
+# from its instance, attached as a data disk to a temporary rescue Windows
+# instance that runs the given PowerShell startup script, then re-attached.
+#
+# Usage: recover_windows_instance.sh instance zone project script.ps1 [label]
+#
+# Exit status: 0 on success, or when the instance already carries the label;
+# 1 on invalid arguments or when a recovery step fails.
+
+set -u
+
+instance=${1:-}
+zone=${2:-}
+project=${3:-}
+psscript=${4:-}
+labelkey=${5:-}
+
+readonly scriptwaitduration=300 # 5 minutes
+readonly maxretries=5
+readonly sleepduration=30
+readonly machineType=e2-highcpu-4
+readonly rescueinstance="rescue-${instance}"
+
+# Prints a message to stderr and aborts with a non-zero exit status.
+die() {
+  printf '%s\n' "$*" >&2
+  exit 1
+}
+
+# Prints the status reported by gcloud (e.g. TERMINATED) for instance $1.
+vm_status() {
+  gcloud compute instances describe "$1" --zone="${zone}" \
+    --project="${project}" --format="value(status)"
+}
+
+# Attaches the boot disk back to the original instance as its boot disk.
+attach_bootdisk() {
+  gcloud compute instances attach-disk "${instance}" --disk="${bootdisk}" \
+    --boot --zone="${zone}" --project="${project}"
+}
+
+# Prints the usage text shown when a required parameter is missing.
+usage() {
+  cat <<'EOF'
+
+Usage of recover_windows_instance.sh instance zone project script.ps1 label
+
+1. Required instance: The name of the Windows instance that has the boot disk that needs to be modified.
+2. Required zone: The zone to use for compute resources
+3. Required project: The name of the project the Compute resources exist in.
+4. Required script: The PowerShell script that will be run in the rescue instance to modify the contents the specified instances boot disk.
+5. Optional label: If the instance has this label the script will abort. On completion of the modification this label will be added to the instance.
+
+example: ./recover_windows_instance.sh instance1 us-central1-c my-gcp-project remediationscript.ps1
+example: ./recover_windows_instance.sh instance1 us-central1-c my-gcp-project remediationscript.ps1 remediationprojectlabel
+
+EOF
+}
+
+echo ""
+
+# 1. Verify input
+invalid=false
+if [[ -z "${instance}" ]]; then
+  echo "instance parameter not set."
+  invalid=true
+fi
+if [[ -z "${zone}" ]]; then
+  echo "zone parameter not set."
+  invalid=true
+fi
+if [[ -z "${project}" ]]; then
+  echo "project parameter not set."
+  invalid=true
+fi
+if [[ -z "${psscript}" ]]; then
+  echo "script parameter not set."
+  invalid=true
+fi
+
+if [[ "${invalid}" == true ]]; then
+  usage
+  exit 1
+fi
+
+# Fail before touching the instance: a missing script would otherwise only be
+# noticed after the instance is stopped and its boot disk detached.
+if [[ ! -r "${psscript}" ]]; then
+  die "Unable to read script ${psscript}, aborting."
+fi
+
+# 2. Verify instance exists
+verifyInstanceOutput=$(gcloud compute instances list --zones="${zone}" \
+  --project="${project}" --filter="name=${instance}")
+if [[ "${verifyInstanceOutput}" == *"${instance}"* ]]; then
+  echo "Found instance ${instance} in zone ${zone} of project ${project}"
+else
+  die "Unable to find ${instance} in zone ${zone} of project ${project}, aborting."
+fi
+
+# 3. Identify if the instance has the label, if present script exits.
+if [[ -n "${labelkey}" ]]; then
+  labelcheck=$(gcloud compute instances describe "${instance}" \
+    --zone="${zone}" --project="${project}" \
+    --format="get(labels[${labelkey}])")
+  # Only a non-empty value means the label is already set on the instance.
+  if [[ -n "${labelcheck}" ]]; then
+    echo "${labelkey} found on ${instance} in zone ${zone} of project ${project}. Exiting."
+    exit 0
+  fi
+fi
+
+# 4. Obtain the boot disk for the instance
+# NOTE(review): assumes the first listed disk is the boot disk - confirm.
+bootdisk=$(gcloud compute instances describe "${instance}" --zone="${zone}" \
+  --project="${project}" --format='get(disks[0].source)')
+if [[ -z "${bootdisk}" ]]; then
+  die "Unable to determine the boot disk of ${instance}, aborting."
+fi
+
+# 5. Stop the instance if it not already TERMINATED.
+vmstatecheck=$(vm_status "${instance}")
+for (( attempt = 1; attempt <= maxretries; attempt++ )); do
+  if [[ "${vmstatecheck}" == "TERMINATED" ]]; then
+    break
+  fi
+  gcloud compute instances stop "${instance}" --zone="${zone}" \
+    --project="${project}"
+  vmstatecheck=$(vm_status "${instance}")
+  if [[ "${vmstatecheck}" != "TERMINATED" ]]; then
+    if (( attempt == maxretries )); then
+      die "VM not TERMINATED after ${maxretries} attempts. Giving up."
+    fi
+    echo "VM not TERMINATED. Waiting ${sleepduration} sec."
+    sleep "${sleepduration}"
+  fi
+done
+
+# 6. Detaches the boot disk from the instance.
+echo "Detaching bootdisk ${bootdisk} from ${instance} in zone ${zone} of project ${project}."
+gcloud compute instances detach-disk "${instance}" --disk="${bootdisk}" \
+  --zone="${zone}" --project="${project}" \
+  || die "Unable to detach ${bootdisk} from ${instance}, aborting."
+
+# 7. Creates a rescue Windows instance and mounts the original boot disk as a
+# data drive (D:). On failure the boot disk is handed back to the instance.
+if ! gcloud compute instances create "${rescueinstance}" \
+  --image-project=windows-cloud --image-family=windows-2022-core \
+  --machine-type="${machineType}" --zone="${zone}" --project="${project}" \
+  --disk=auto-delete=false,name="${bootdisk}" \
+  --metadata-from-file=windows-startup-script-ps1="${psscript}"; then
+  echo "Unable to create ${rescueinstance}, re-attaching the boot disk." >&2
+  attach_bootdisk
+  exit 1
+fi
+
+# 8. Wait for x seconds to boot, run the rescue script, and shutdown.
+sleep "${scriptwaitduration}"
+
+# 9. Verify the rescue instance is stopped.
+rescuecompleted=false
+for (( attempt = 1; attempt <= maxretries; attempt++ )); do
+  echo "Querying VM status to check for TERMINATED state."
+  vmstatecheck=$(vm_status "${rescueinstance}")
+  if [[ "${vmstatecheck}" == "TERMINATED" ]]; then
+    echo "VM in TERMINATED state."
+    rescuecompleted=true
+    break
+  fi
+  if (( attempt == maxretries )); then
+    echo "rescue-${instance} not TERMINATED after ${maxretries} attempts. Giving up." >&2
+    break
+  fi
+  echo "State not TERMINATED. Waiting ${sleepduration} sec."
+  sleep "${sleepduration}"
+done
+
+# Never detach the disk from a running rescue instance: the startup script
+# may still be modifying it, so stop the instance explicitly first.
+if [[ "${rescuecompleted}" != true ]]; then
+  gcloud compute instances stop "${rescueinstance}" --zone="${zone}" \
+    --project="${project}"
+fi
+
+# 10. Detach the boot disk from "${rescueinstance}".
+gcloud compute instances detach-disk "${rescueinstance}" \
+  --disk="${bootdisk}" --zone="${zone}" --project="${project}" \
+  || die "Unable to detach ${bootdisk} from ${rescueinstance}, aborting."
+
+# 11. Re-attach the boot disk to original instance.
+attach_bootdisk \
+  || die "Unable to re-attach ${bootdisk} to ${instance}, aborting."
+
+# 12. Deleting rescue instance.
+gcloud compute instances delete "${rescueinstance}" --zone="${zone}" \
+  --project="${project}"
+
+if [[ "${rescuecompleted}" != true ]]; then
+  die "${psscript} did not finish on ${rescueinstance} in the allotted time."
+fi
+
+# 13. If a label has been specified, set that label on the instance.
+# The timestamp is taken in UTC without a numeric offset because label values
+# may not contain "+", which %z emits for zones at or ahead of UTC.
+if [[ -n "${labelkey}" ]]; then
+  gcloud compute instances add-labels "${instance}" \
+    --labels="${labelkey}=$(date -u +'%F_%H%M%S')utc" \
+    --zone="${zone}" --project="${project}"
+fi
diff --git a/gce_recovery_scripts/reinstall-gvnic-gq.ps1 b/gce_recovery_scripts/reinstall-gvnic-gq.ps1
new file mode 100644
index 000000000..0be42b60f
--- /dev/null
+++ b/gce_recovery_scripts/reinstall-gvnic-gq.ps1
@@ -0,0 +1,27 @@
+# Rescue-instance startup script: replaces every gVNIC driver in the offline
+# Windows image mounted at D:\ with the drivers downloaded from $gs_path, then
+# powers the rescue instance off.
+
+$gs_path = "gs://gce-windows-drivers-public/release/gvnic-gq"
+$destination = "$env:TEMP\gvnic-gq"
+
+Write-Output "Downloading drivers from $gs_path to $destination"
+If (test-path -PathType container $destination) {
+    Remove-Item -Path $destination -Recurse -Force
+}
+New-Item -ItemType Directory -Path $destination
+& 'gsutil' cp "${gs_path}/*" $destination
+Write-Output 'Driver download complete.'
+
+Write-Output 'Removing all instances of gvnic driver'
+Get-WindowsDriver -Path D:\ | ForEach-Object {
+    if ($_.OriginalFileName -Match 'gvnic.inf') {
+        Write-Output $_.OriginalFileName
+        Remove-WindowsDriver -Path D:\ -Driver $_.OriginalFileName
+    }
+}
+
+Write-Output 'Installing GVNIC GQ driver using Add-WindowsDriver'
+Add-WindowsDriver -Path D:\ -Driver $destination -Recurse -Verbose
+
+Stop-Computer -ComputerName localhost -Force
\ No newline at end of file