From 23cf7bc6a13ae025db0b88bf46d33750ddb63d62 Mon Sep 17 00:00:00 2001 From: Thomas Hallgren Date: Sun, 2 Feb 2025 15:41:28 +0100 Subject: [PATCH 1/5] Using the --proxy-via flag would sometimes cause connection timeouts. Typically, a `telepresence connect --proxy-via =` would fail with a "deadline exceeded" message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet have an agent installed, and other workloads had an agent. This was due to a race condition in the logic for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated. Signed-off-by: Thomas Hallgren --- CHANGELOG.yml | 7 +++ docs/release-notes.md | 6 ++ docs/release-notes.mdx | 4 ++ pkg/client/agentpf/clients.go | 103 +++++++++++++++++++++------------- 4 files changed, 80 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.yml b/CHANGELOG.yml index 0a4006be64..529d5a07d8 100644 --- a/CHANGELOG.yml +++ b/CHANGELOG.yml @@ -36,6 +36,13 @@ items: - version: 2.21.3 date: (TBD) notes: + - type: bugfix + title: Using the --proxy-via flag would sometimes cause connection timeouts. + body: >- + Typically, a `telepresence connect --proxy-via =` would fail with a "deadline exceeded" + message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet + have an agent installed, and other workloads had an agent. This was due to a race condition in the logic + for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated. - type: bugfix title: Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS. body: >- diff --git a/docs/release-notes.md b/docs/release-notes.md index 0d1960bd69..36a7a46cc4 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -2,6 +2,12 @@ [comment]: # (Code generated by relnotesgen. DO NOT EDIT.) # Telepresence Release Notes ## Version 2.21.3 +##
bugfix
Using the --proxy-via flag would sometimes cause connection timeouts.
+
+ +Typically, a `telepresence connect --proxy-via =` would fail with a "deadline exceeded" message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet have an agent installed, and other workloads had an agent. This was due to a race condition in the logic for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated. +
+ ##
bugfix
Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS.
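The pkg/client/agentpf/clients.go changes in this patch (below) remove the race by deferring each agent connection until the first caller actually needs it: a client starts out as an "infant", the first Tunnel or dial-watch call triggers the connect, and every caller then waits on a buffered `ready` channel. A minimal, self-contained sketch of that lazy-connect pattern (illustrative names only, not the real telepresence types) looks like this:

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// lazyClient defers its (expensive) connection until the first caller needs it.
// This mirrors the "infant" flag and buffered ready channel used in
// pkg/client/agentpf/clients.go; all names here are illustrative.
type lazyClient struct {
	mu     sync.Mutex
	infant bool       // true until someone triggers the connect
	ready  chan error // buffered(1); closed on success, holds the error on failure
}

func newLazyClient() *lazyClient {
	return &lazyClient{infant: true, ready: make(chan error, 1)}
}

// ensureConnect starts the connect exactly once, then waits for it to finish.
func (c *lazyClient) ensureConnect(ctx context.Context) error {
	c.mu.Lock()
	start := c.infant
	c.infant = false
	c.mu.Unlock()
	if start {
		go c.connect(ctx)
	}
	select {
	case err, ok := <-c.ready:
		if ok {
			c.ready <- err // put the error back for subsequent callers
			return err
		}
		return nil // channel closed: connected and ready
	case <-ctx.Done():
		return ctx.Err()
	}
}

func (c *lazyClient) connect(ctx context.Context) {
	// Stand-in for dialing the agent's gRPC endpoint.
	select {
	case <-time.After(100 * time.Millisecond):
		close(c.ready) // success
	case <-ctx.Done():
		c.ready <- ctx.Err() // failure: leave the error for the waiters
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	c := newLazyClient()
	fmt.Println(c.ensureConnect(ctx)) // <nil>
}
```

Putting a failed connect's error back on the buffered channel lets later callers observe the same failure until the client is removed, which is also what the real code does.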
diff --git a/docs/release-notes.mdx b/docs/release-notes.mdx index 2690b298c0..66ee7889a1 100644 --- a/docs/release-notes.mdx +++ b/docs/release-notes.mdx @@ -8,6 +8,10 @@ import { Note, Title, Body } from '@site/src/components/ReleaseNotes' # Telepresence Release Notes ## Version 2.21.3 + + Using the --proxy-via flag would sometimes cause connection timeouts. + Typically, a `telepresence connect --proxy-via =` would fail with a "deadline exceeded" message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet have an agent installed, and other workloads had an agent. This was due to a race condition in the logic for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated. + Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS. A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the TUN-device on macOS based clients. diff --git a/pkg/client/agentpf/clients.go b/pkg/client/agentpf/clients.go index 256da6b9fd..19cf59e9b4 100644 --- a/pkg/client/agentpf/clients.go +++ b/pkg/client/agentpf/clients.go @@ -36,9 +36,11 @@ type client struct { session *manager.SessionInfo info *manager.AgentPodInfo ready chan error + remove func() cancelClient context.CancelFunc cancelDialWatch context.CancelFunc tunnelCount int32 + infant bool } func (ac *client) String() string { @@ -49,17 +51,35 @@ func (ac *client) String() string { return fmt.Sprintf("%s.%s:%d", ai.PodName, ai.Namespace, ai.ApiPort) } -func (ac *client) Tunnel(ctx context.Context, opts ...grpc.CallOption) (tunnel.Client, error) { +func (ac *client) ensureConnect(ctx context.Context) error { + ac.Lock() + infant := ac.infant + if infant { + ac.infant = false + } + ac.Unlock() + if infant { + go ac.connect(ctx, func() { + ac.remove() + }) + } select { case err, ok := <-ac.ready: if ok { // Put error back on channel in case this Tunnel is used again before it's deleted. ac.ready <- err - return nil, err + return err } // ready channel is closed. We are ready to go. case <-ctx.Done(): - return nil, ctx.Err() + return ctx.Err() + } + return nil +} + +func (ac *client) Tunnel(ctx context.Context, opts ...grpc.CallOption) (tunnel.Client, error) { + if err := ac.ensureConnect(ctx); err != nil { + return nil, err } tc, err := ac.cli.Tunnel(ctx, opts...) if err != nil { @@ -101,7 +121,14 @@ func (ac *client) connect(ctx context.Context, deleteMe func()) { ac.cli = cli ac.cancelClient = func() { // Need to run this in a separate thread to avoid deadlock. 
- go conn.Close() + go func() { + ac.Lock() + conn.Close() + ac.cancelClient = nil + ac.cli = nil + ac.infant = true + ac.Unlock() + }() } intercepted := ac.info.Intercepted ac.Unlock() @@ -110,11 +137,11 @@ func (ac *client) connect(ctx context.Context, deleteMe func()) { } } -func (ac *client) busy() bool { +func (ac *client) dormant() bool { ac.Lock() - bzy := ac.cli == nil || ac.info.Intercepted || atomic.LoadInt32(&ac.tunnelCount) > 0 + dormant := !(ac.infant || ac.cli == nil || ac.info.Intercepted) && atomic.LoadInt32(&ac.tunnelCount) == 0 ac.Unlock() - return bzy + return dormant } func (ac *client) intercepted() bool { @@ -124,17 +151,21 @@ func (ac *client) intercepted() bool { return ret } -func (ac *client) cancel() { +func (ac *client) cancel() bool { ac.Lock() cc := ac.cancelClient cdw := ac.cancelDialWatch ac.Unlock() + didCancel := false if cc != nil { + didCancel = true cc() } if cdw != nil { + didCancel = true cdw() } + return didCancel } func (ac *client) setIntercepted(ctx context.Context, k string, status bool) { @@ -168,14 +199,8 @@ func (ac *client) setIntercepted(ctx context.Context, k string, status bool) { func (ac *client) startDialWatcher(ctx context.Context) error { // Not called from the startup go routine, so wait for that routine to finish - select { - case err, ok := <-ac.ready: - if ok { - return err - } - // ready channel is closed. We are ready to go. - case <-ctx.Done(): - return ctx.Err() + if err := ac.ensureConnect(ctx); err != nil { + return err } return ac.startDialWatcherReady(ctx) } @@ -314,11 +339,14 @@ func (s *clients) hasWaiterFor(info *manager.AgentPodInfo) bool { func (s *clients) WatchAgentPods(ctx context.Context, rmc manager.ManagerClient) error { dlog.Debug(ctx, "WatchAgentPods starting") defer func() { - dlog.Debugf(ctx, "WatchAgentPods ending with %d clients still active", s.clients.Size()) + activeCount := 0 s.clients.Range(func(_ string, ac *client) bool { - ac.cancel() + if ac.cancel() { + activeCount++ + } return true }) + dlog.Debugf(ctx, "WatchAgentPods ending with %d clients still active", activeCount) s.disabled.Store(true) }() backoff := 100 * time.Millisecond @@ -509,40 +537,35 @@ func (s *clients) updateClients(ctx context.Context, ais []*manager.AgentPodInfo ac := &client{ ready: make(chan error, 1), session: s.session, - info: ai, + remove: func() { + deleteClient(k) + }, + info: ai, + infant: true, } - go ac.connect(ctx, func() { - deleteClient(k) - }) + dlog.Debugf(ctx, "Adding agent pod %s (%s)", k, net.IP(ai.PodIp)) return ac, false }) } - // Add clients for newly arrived intercepts + // Add clients for newly arrived agents. for k, ai := range aim { - if ai.Intercepted || s.isProxyVIA(ai) || s.hasWaiterFor(ai) { - addClient(k, ai) - } + addClient(k, ai) } - // Terminate all non-intercepting idle agents except the last one. + // Terminate all dormant agents except the last one. + dormantCount := 0 s.clients.Range(func(k string, ac *client) bool { - if s.clients.Size() <= 1 { - return false - } - if !ac.busy() && !s.isProxyVIA(ac.info) && !s.hasWaiterFor(ac.info) { - deleteClient(k) + if ac.dormant() && !s.isProxyVIA(ac.info) && !s.hasWaiterFor(ac.info) { + dormantCount++ + if dormantCount > 1 { + ac.cancel() + } } return true }) - - // Ensure that we have at least one client (if at least one agent exists) - if s.clients.Size() == 0 { - for _, ai := range aim { - k := ai.PodName + "." 
+ ai.Namespace - addClient(k, ai) - break - } + if dormantCount > 1 { + dlog.Debugf(ctx, "Cancelled %d dormant clients", dormantCount-1) } return nil } From 5a0825128741576d1f31344650b501f415b04f5c Mon Sep 17 00:00:00 2001 From: Thomas Hallgren Date: Fri, 10 Jan 2025 13:21:51 +0100 Subject: [PATCH 2/5] Ensure that the correct pod-log is captured after a helm rollback. Signed-off-by: Thomas Hallgren --- integration_test/itest/namespace.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/integration_test/itest/namespace.go b/integration_test/itest/namespace.go index 945e7e43fb..73a47a20f4 100644 --- a/integration_test/itest/namespace.go +++ b/integration_test/itest/namespace.go @@ -160,6 +160,9 @@ func (s *nsPair) RollbackTM(ctx context.Context) { t := getT(ctx) require.NoError(t, err) require.NoError(t, RolloutStatusWait(ctx, s.Namespace, "deploy/traffic-manager")) + assert.Eventually(t, func() bool { + return len(RunningPodNames(ctx, "traffic-manager", s.Namespace)) == 1 + }, 30*time.Second, 5*time.Second) s.CapturePodLogs(ctx, "traffic-manager", "", s.Namespace) } From b14bccc4dea421f79bce5fe9382864020cb818c7 Mon Sep 17 00:00:00 2001 From: Thomas Hallgren Date: Mon, 3 Feb 2025 00:06:36 +0100 Subject: [PATCH 3/5] Ensure that make generate regenerates the docs files. Signed-off-by: Thomas Hallgren --- build-aux/main.mk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build-aux/main.mk b/build-aux/main.mk index 699d94d010..aa93e02b0d 100644 --- a/build-aux/main.mk +++ b/build-aux/main.mk @@ -86,7 +86,7 @@ protoc: protoc-clean $(tools/protoc) $(tools/protoc-gen-go) $(tools/protoc-gen-g .PHONY: generate generate: ## (Generate) Update generated files that get checked in to Git generate: generate-clean -generate: protoc $(tools/go-mkopensource) $(BUILDDIR)/$(shell go env GOVERSION).src.tar.gz docs-files +generate: protoc $(tools/go-mkopensource) $(BUILDDIR)/$(shell go env GOVERSION).src.tar.gz cd ./rpc && export GOFLAGS=-mod=mod && go mod tidy && go mod vendor && rm -rf vendor cd ./pkg/vif/testdata/router && export GOFLAGS=-mod=mod && go mod tidy && go mod vendor && rm -rf vendor cd ./tools/src/test-report && export GOFLAGS=-mod=mod && go mod tidy && go mod vendor && rm -rf vendor @@ -108,12 +108,15 @@ generate: protoc $(tools/go-mkopensource) $(BUILDDIR)/$(shell go env GOVERSION). 
rm -rf vendor +generate: docs-files + .PHONY: generate-clean generate-clean: ## (Generate) Delete generated files rm -rf ./rpc/vendor rm -rf ./vendor rm -f DEPENDENCIES.md rm -f DEPENDENCY_LICENSES.md + rm -f docs/release-notes.md* CHANGELOG.yml: FORCE @# Check if the version is in the x.x.x format (GA release) From 1f733e7933c8fcd53700eb372883c6fb34335ed9 Mon Sep 17 00:00:00 2001 From: Thomas Hallgren Date: Mon, 3 Feb 2025 00:15:02 +0100 Subject: [PATCH 4/5] Prepare v2.21.3-test.4 Signed-off-by: Thomas Hallgren --- go.mod | 2 +- pkg/vif/testdata/router/go.mod | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index ca138870d2..aa7d70e304 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ require ( github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.10.0 github.com/telepresenceio/go-fuseftp/rpc v0.5.0 - github.com/telepresenceio/telepresence/rpc/v2 v2.21.2 + github.com/telepresenceio/telepresence/rpc/v2 v2.21.3-test.4 github.com/vishvananda/netlink v1.3.0 golang.org/x/exp v0.0.0-20241210194714-1829a127f884 golang.org/x/net v0.32.0 diff --git a/pkg/vif/testdata/router/go.mod b/pkg/vif/testdata/router/go.mod index d691070a0f..4ab32ff087 100644 --- a/pkg/vif/testdata/router/go.mod +++ b/pkg/vif/testdata/router/go.mod @@ -50,7 +50,7 @@ require ( github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/telepresenceio/telepresence/rpc/v2 v2.21.2 // indirect + github.com/telepresenceio/telepresence/rpc/v2 v2.21.3-test.4 // indirect github.com/vishvananda/netlink v1.3.0 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/x448/float16 v0.8.4 // indirect From 0566da495a232cb23a71db327dc4aba497eaf0b2 Mon Sep 17 00:00:00 2001 From: Thomas Hallgren Date: Mon, 3 Feb 2025 07:34:57 +0100 Subject: [PATCH 5/5] Ensure that annotation enabled traffic-agents are uninstalled. Signed-off-by: Thomas Hallgren --- CHANGELOG.yml | 5 ++ cmd/traffic/cmd/manager/mutator/watcher.go | 63 ++++++++++++---------- docs/release-notes.md | 6 +++ docs/release-notes.mdx | 4 ++ 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.yml b/CHANGELOG.yml index 529d5a07d8..068b43fada 100644 --- a/CHANGELOG.yml +++ b/CHANGELOG.yml @@ -48,6 +48,11 @@ items: body: >- A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the TUN-device on macOS based clients. + - type: bugfix + title: Ensure that annotation enabled traffic-agents are uninstall when uninstalling the traffic-manager. + body: >- + A traffic-agent injected because the workload had the inject annotation enabled would sometimes not get + uninstalled when the traffic-manager was uninstalled. - version: 2.21.2 date: 2025-01-26 notes: diff --git a/cmd/traffic/cmd/manager/mutator/watcher.go b/cmd/traffic/cmd/manager/mutator/watcher.go index de1db3f4c1..103a542d58 100644 --- a/cmd/traffic/cmd/manager/mutator/watcher.go +++ b/cmd/traffic/cmd/manager/mutator/watcher.go @@ -6,6 +6,7 @@ import ( "slices" "strings" "sync" + "sync/atomic" "time" "github.com/google/go-cmp/cmp" @@ -101,7 +102,7 @@ func (c *configWatcher) isRolloutNeeded(ctx context.Context, wl k8sapi.Workload, if ia, ok := podMeta.GetAnnotations()[agentconfig.InjectAnnotation]; ok { // Annotation controls injection, so no explicit rollout is needed unless the deployment was added before the traffic-manager. // If the annotation changes, there will be an implicit rollout anyway. 
- if wl.GetCreationTimestamp().After(c.startedAt) { + if wl.GetCreationTimestamp().After(c.startedAt) && c.running.Load() { dlog.Debugf(ctx, "Rollout of %s.%s is not necessary. Pod template has inject annotation %s", wl.GetName(), wl.GetNamespace(), ia) return false @@ -424,6 +425,7 @@ type configWatcher struct { nsLocks *xsync.MapOf[string, *sync.RWMutex] blacklistedPods *xsync.MapOf[string, time.Time] startedAt time.Time + running atomic.Bool rolloutDisabled bool cms []cache.SharedIndexInformer @@ -538,6 +540,7 @@ func (c *configWatcher) SetSelf(self Map) { } func (c *configWatcher) StartWatchers(ctx context.Context) error { + defer c.running.Store(true) c.startedAt = time.Now() ctx, c.cancel = context.WithCancel(ctx) for _, si := range c.svs { @@ -857,36 +860,38 @@ func (c *configWatcher) Start(ctx context.Context) { } func (c *configWatcher) DeleteMapsAndRolloutAll(ctx context.Context) { - c.cancel() // No more updates from watcher - now := meta.NewDeleteOptions(0) - api := k8sapi.GetK8sInterface(ctx).CoreV1() - c.nsLocks.Range(func(ns string, lock *sync.RWMutex) bool { - lock.Lock() - defer lock.Unlock() - wlm, err := data(ctx, ns) - if err != nil { - dlog.Errorf(ctx, "unable to get configmap %s.%s: %v", agentconfig.ConfigMap, ns, err) - return true - } - for k, v := range wlm { - e := &entry{name: k, namespace: ns, value: v} - scx, wl, err := e.workload(ctx) + if c.running.CompareAndSwap(true, false) { + c.cancel() // No more updates from watcher + now := meta.NewDeleteOptions(0) + api := k8sapi.GetK8sInterface(ctx).CoreV1() + c.nsLocks.Range(func(ns string, lock *sync.RWMutex) bool { + lock.Lock() + defer lock.Unlock() + wlm, err := data(ctx, ns) if err != nil { - if !errors.IsNotFound(err) { - dlog.Errorf(ctx, "unable to get workload for %s.%s %s: %v", k, ns, v, err) + dlog.Errorf(ctx, "unable to get configmap %s.%s: %v", agentconfig.ConfigMap, ns, err) + return true + } + for k, v := range wlm { + e := &entry{name: k, namespace: ns, value: v} + scx, wl, err := e.workload(ctx) + if err != nil { + if !errors.IsNotFound(err) { + dlog.Errorf(ctx, "unable to get workload for %s.%s %s: %v", k, ns, v, err) + } + continue } - continue + ac := scx.AgentConfig() + if ac.Create || ac.Manual { + // Deleted before it was generated or manually added, just ignore + continue + } + c.triggerRollout(ctx, wl, nil) } - ac := scx.AgentConfig() - if ac.Create || ac.Manual { - // Deleted before it was generated or manually added, just ignore - continue + if err := api.ConfigMaps(ns).Delete(ctx, agentconfig.ConfigMap, *now); err != nil { + dlog.Errorf(ctx, "unable to delete ConfigMap %s-%s: %v", agentconfig.ConfigMap, ns, err) } - c.triggerRollout(ctx, wl, nil) - } - if err := api.ConfigMaps(ns).Delete(ctx, agentconfig.ConfigMap, *now); err != nil { - dlog.Errorf(ctx, "unable to delete ConfigMap %s-%s: %v", agentconfig.ConfigMap, ns, err) - } - return true - }) + return true + }) + } } diff --git a/docs/release-notes.md b/docs/release-notes.md index 36a7a46cc4..d8fdab81b7 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -14,6 +14,12 @@ Typically, a `telepresence connect --proxy-via =` would fail w A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the TUN-device on macOS based clients.
+##
bugfix
Ensure that annotation enabled traffic-agents are uninstalled when uninstalling the traffic-manager.
+
+ +A traffic-agent injected because the workload had the inject annotation enabled would sometimes not get uninstalled when the traffic-manager was uninstalled. +
+ ## Version 2.21.2 (January 26) ##
bugfix
Fix panic when agentpf.client creates a Tunnel
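The `running` atomic added to `configWatcher` makes the uninstall path idempotent: `DeleteMapsAndRolloutAll` only performs the rollouts and configmap deletions if the watchers were actually started and have not already been shut down. A rough sketch of that guard, with illustrative names rather than the real mutator types, is:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// watcher runs its teardown at most once, and only if it actually started.
// This mirrors the running atomic.Bool added to the mutator's configWatcher;
// names are illustrative.
type watcher struct {
	running atomic.Bool
	cancel  func()
}

func (w *watcher) start() {
	w.cancel = func() { fmt.Println("watch cancelled") }
	w.running.Store(true) // only now is teardown allowed
}

func (w *watcher) shutdown() {
	// CompareAndSwap flips running from true to false exactly once, so
	// repeated or concurrent shutdown calls (and calls before start) are no-ops.
	if w.running.CompareAndSwap(true, false) {
		w.cancel()
		fmt.Println("rolling out workloads and deleting configmaps")
	}
}

func main() {
	w := &watcher{}
	w.shutdown() // no-op: never started
	w.start()
	w.shutdown() // performs the teardown
	w.shutdown() // no-op: already done
}
```

Using CompareAndSwap rather than a plain load-then-store means two concurrent shutdown calls cannot both run the teardown.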
diff --git a/docs/release-notes.mdx b/docs/release-notes.mdx index 66ee7889a1..13ad2bc0d6 100644 --- a/docs/release-notes.mdx +++ b/docs/release-notes.mdx @@ -16,6 +16,10 @@ import { Note, Title, Body } from '@site/src/components/ReleaseNotes' Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS. A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the TUN-device on macOS based clients. + + Ensure that annotation enabled traffic-agents are uninstall when uninstalling the traffic-manager. + A traffic-agent injected because the workload had the inject annotation enabled would sometimes not get uninstalled when the traffic-manager was uninstalled. + ## Version 2.21.2 (January 26) Fix panic when agentpf.client creates a Tunnel
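For context on the first patch: `updateClients` now creates a lazy client for every reported agent pod and afterwards cancels all but one of the dormant ones (skipping proxy-via targets and clients that a waiter is interested in), instead of trying to predict up front which agents will be needed. A simplified sketch of that reaping pass, with hypothetical types standing in for the agentpf client, is:

```go
package main

import "fmt"

// agentClient is a stand-in for the per-agent client in pkg/client/agentpf.
type agentClient struct {
	name        string
	connected   bool
	intercepted bool
	tunnels     int
	cancelled   bool
}

// dormant reports whether the client is connected but currently unused,
// a simplification of the dormant() predicate introduced by the patch
// (the real one also treats infant, not-yet-connected clients as non-dormant).
func (c *agentClient) dormant() bool {
	return c.connected && !c.intercepted && c.tunnels == 0
}

// reapDormant cancels every dormant client except the first one found, so at
// least one connected agent is always kept around for future tunnels. The real
// code additionally keeps proxy-via targets and clients with pending waiters.
func reapDormant(clients []*agentClient) {
	kept := 0
	for _, c := range clients {
		if c.dormant() {
			kept++
			if kept > 1 {
				c.cancelled = true
			}
		}
	}
}

func main() {
	cs := []*agentClient{
		{name: "echo-a", connected: true},
		{name: "echo-b", connected: true},
		{name: "echo-c", connected: true, intercepted: true},
	}
	reapDormant(cs)
	for _, c := range cs {
		fmt.Printf("%s cancelled=%v\n", c.name, c.cancelled)
	}
}
```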