diff --git a/bin/propolis-server/src/lib/vm/objects.rs b/bin/propolis-server/src/lib/vm/objects.rs index fc762e580..0992a9f4a 100644 --- a/bin/propolis-server/src/lib/vm/objects.rs +++ b/bin/propolis-server/src/lib/vm/objects.rs @@ -267,7 +267,7 @@ impl VmObjectsLocked { // that all devices resume before any vCPUs do. self.resume_kernel_vm(); self.resume_devices(); - self.vcpu_tasks.resume_all(); + self.resume_vcpus(); } /// Resumes this VM's vCPU tasks. @@ -276,7 +276,7 @@ impl VmObjectsLocked { /// needs fine-grained control over the order in which devices and vCPUs /// start. When pausing and resuming a VM that's already been started, use /// [`Self::pause`] and [`Self::resume`] instead. - pub(crate) async fn resume_vcpus(&mut self) { + pub(crate) fn resume_vcpus(&mut self) { self.vcpu_tasks.resume_all(); } @@ -310,7 +310,7 @@ impl VmObjectsLocked { // Resume devices so they're ready to do more work, then resume // vCPUs. self.resume_devices(); - self.vcpu_tasks.resume_all(); + self.resume_vcpus(); } /// Pauses all of a VM's devices. diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 7afc5a0e4..0c2e179b0 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -536,11 +536,12 @@ impl StateDriver { // The start sequence is arranged so that calls to block backends can be // interleaved with processing of requests from the external request - // queue. This allows Nexus to reconfigure Crucible volumes while they - // are being activated, which is necessary to unwedge activations that - // are targeting a downstairs that became invalid between the time Nexus - // passed its address to a VM and the time the VM actually tried to - // connect to it. + // queue. This allows Nexus to reconfigure Crucible backends while they + // are being activated, which can be necessary if the VM's original + // specification specifies a Crucible downstairs server that is offline + // or unavailable. (Downstairs instances can disappear at any time, + // e.g. due to sled failure, so these configurations aren't necessarily + // client errors.) // // Before getting into any of that, handle the synchronous portions of // VM startup. First, ensure that the kernel VM and all its associated @@ -584,7 +585,11 @@ impl StateDriver { info!(log, "starting block backend {}", name); let res = backend.start().await; if let Err(e) = &res { - error!(log, "startup failed for {}: {:?}", name, e); + error!( + log, + "startup failed for block backend {}: {:?}", name, e + ); + return res; } } @@ -614,7 +619,7 @@ impl StateDriver { res = &mut block_backend_fut => { if res.is_ok() { let objects = &self.objects; - objects.lock_exclusive().await.resume_vcpus().await; + objects.lock_exclusive().await.resume_vcpus(); self.publish_steady_state(InstanceState::Running); info!(&self.log, "VM successfully started"); } diff --git a/phd-tests/tests/src/crucible/smoke.rs b/phd-tests/tests/src/crucible/smoke.rs index 984723742..eab416ece 100644 --- a/phd-tests/tests/src/crucible/smoke.rs +++ b/phd-tests/tests/src/crucible/smoke.rs @@ -108,12 +108,16 @@ async fn vcr_replace_during_start_test(ctx: &Framework) { 5, ); + // Configure the disk so that when the VM starts, it will have an invalid + // downstairs address. let spec = config.vm_spec(ctx).await?; let disk_hdl = spec.get_disk_by_device_name(DATA_DISK_NAME).cloned().unwrap(); let disk = disk_hdl.as_crucible().unwrap(); disk.enable_vcr_black_hole(); + // Try to start the VM, but don't wait for it to boot; it should get stuck + // while activating using an invalid downstairs address. let mut vm = ctx.spawn_vm_with_spec(spec, None).await?; vm.launch().await?; @@ -125,9 +129,17 @@ async fn vcr_replace_during_start_test(ctx: &Framework) { .await .unwrap_err(); + // Fix the disk's downstairs address and send a replacement request. This + // should be processed and should allow the VM to boot. disk.disable_vcr_black_hole(); disk.set_generation(2); vm.replace_crucible_vcr(disk).await?; - vm.wait_to_boot().await?; + + assert_eq!(vm.get().await?.instance.state, InstanceState::Running); + + // VCR replacements should continue to be accepted now that the instance is + // running. + disk.set_generation(3); + vm.replace_crucible_vcr(disk).await?; }