Restore race-checking tests (#1376)

Fixing the flakiness that led to #1375. The races in these tests were due to `t.Log` calls occurring after the test finishes, because the workflow (and test suite and tests and...) does not wait for goroutines to shut down. It's an annoying enough issue that I tackled it with gusto in cadence-workflow/cadence#6067 and it's probably worth porting over here too. Though the underlying "shut down and do not wait" behavior is still extremely dangerous and needs to be fixed some day.
cadence-workflow · Nov 1, 2024 · f21e350 · f21e350
1 parent 163f1a9
commit f21e350
Showing 1 changed file with 15 additions and 4 deletions.
diff --git a/internal/context_test.go b/internal/context_test.go
@@ -70,11 +70,11 @@ func TestContextChildParentCancelRace(t *testing.T) {
 	}
 	env.RegisterWorkflow(wf)
 	env.ExecuteWorkflow(wf)
+	assert.True(t, env.IsWorkflowCompleted())
 	assert.NoError(t, env.GetWorkflowError())
 }
 
 func TestContextConcurrentCancelRace(t *testing.T) {
-	t.Skip("This test is racy and not reliable. It is disabled until we can make it reliable.")
 	/*
 		A race condition existed due to concurrently ending goroutines on shutdown (i.e. closing their chan without waiting
 		on them to finish shutdown), which executed... quite a lot of non-concurrency-safe code in a concurrent way.  All
@@ -83,8 +83,13 @@ func TestContextConcurrentCancelRace(t *testing.T) {
 		Context cancellation was one identified by a customer, and it's fairly easy to test.
 		In principle this must be safe to do - contexts are supposed to be concurrency-safe.  Even if ours are not actually
 		safe (for valid reasons), our execution model needs to ensure they *act* like it's safe.
+
+		This intentionally avoids `newTestWorkflowEnv` because the dangling goroutines
+		may shut down after the test finishes -> produce a log, which fails on t.Log races.
+		Ideally the test suite would stop goroutines too, but that isn't reliable currently either.
+		This can be changed if we switch to the server's race-safe zaptest wrapper.
 	*/
-	env := newTestWorkflowEnv(t)
+	env := (&WorkflowTestSuite{}).NewTestWorkflowEnvironment()
 	wf := func(ctx Context) error {
 		ctx, cancel := WithCancel(ctx)
 		racyCancel := func(ctx Context) {
@@ -101,15 +106,20 @@ func TestContextConcurrentCancelRace(t *testing.T) {
 	}
 	env.RegisterWorkflow(wf)
 	env.ExecuteWorkflow(wf)
+	assert.True(t, env.IsWorkflowCompleted())
 	assert.NoError(t, env.GetWorkflowError())
 }
 
 func TestContextAddChildCancelParentRace(t *testing.T) {
-	t.Skip("This test is racy and not reliable. It is disabled until we can make it reliable.")
 	/*
 		It's apparently also possible to race on adding children while propagating the cancel to children.
+
+		This intentionally avoids `newTestWorkflowEnv` because the dangling goroutines
+		may shut down after the test finishes -> produce a log, which fails on t.Log races.
+		Ideally the test suite would stop goroutines too, but that isn't reliable currently either.
+		This can be changed if we switch to the server's race-safe zaptest wrapper.
 	*/
-	env := newTestWorkflowEnv(t)
+	env := (&WorkflowTestSuite{}).NewTestWorkflowEnvironment()
 	wf := func(ctx Context) error {
 		ctx, cancel := WithCancel(ctx)
 		racyCancel := func(ctx Context) {
@@ -131,6 +141,7 @@ func TestContextAddChildCancelParentRace(t *testing.T) {
 	}
 	env.RegisterWorkflow(wf)
 	env.ExecuteWorkflow(wf)
+	assert.True(t, env.IsWorkflowCompleted())
 	assert.NoError(t, env.GetWorkflowError())
 }