Skip to content

Commit

Permalink
Merge pull request #3786 from telepresenceio/thallgren/proxy-via-mult…
Browse files Browse the repository at this point in the history
…i-wf

Using the --proxy-via flag would sometimes cause connection timeouts.
  • Loading branch information
thallgren authored Feb 4, 2025
2 parents 3d63e55 + 0566da4 commit 7986105
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 72 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,23 @@ items:
- version: 2.21.3
date: (TBD)
notes:
- type: bugfix
title: Using the --proxy-via flag would sometimes cause connection timeouts.
body: >-
Typically, a `telepresence connect --proxy-via <subnet>=<workload>` would fail with a "deadline exceeded"
message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet
have an agent installed, and other workloads had an agent. This was due to a race condition in the logic
for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated.
- type: bugfix
title: Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS.
body: >-
A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the
TUN-device on macOS based clients.
- type: bugfix
title: Ensure that annotation-enabled traffic-agents are uninstalled when uninstalling the traffic-manager.
body: >-
A traffic-agent injected because the workload had the inject annotation enabled would sometimes not get
uninstalled when the traffic-manager was uninstalled.
- version: 2.21.2
date: 2025-01-26
notes:
Expand Down
5 changes: 4 additions & 1 deletion build-aux/main.mk
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ protoc: protoc-clean $(tools/protoc) $(tools/protoc-gen-go) $(tools/protoc-gen-g
.PHONY: generate
generate: ## (Generate) Update generated files that get checked in to Git
generate: generate-clean
generate: protoc $(tools/go-mkopensource) $(BUILDDIR)/$(shell go env GOVERSION).src.tar.gz docs-files
generate: protoc $(tools/go-mkopensource) $(BUILDDIR)/$(shell go env GOVERSION).src.tar.gz
cd ./rpc && export GOFLAGS=-mod=mod && go mod tidy && go mod vendor && rm -rf vendor
cd ./pkg/vif/testdata/router && export GOFLAGS=-mod=mod && go mod tidy && go mod vendor && rm -rf vendor
cd ./tools/src/test-report && export GOFLAGS=-mod=mod && go mod tidy && go mod vendor && rm -rf vendor
Expand All @@ -108,12 +108,15 @@ generate: protoc $(tools/go-mkopensource) $(BUILDDIR)/$(shell go env GOVERSION).

rm -rf vendor

generate: docs-files

.PHONY: generate-clean
generate-clean: ## (Generate) Delete generated files
rm -rf ./rpc/vendor
rm -rf ./vendor
rm -f DEPENDENCIES.md
rm -f DEPENDENCY_LICENSES.md
rm -f docs/release-notes.md*

CHANGELOG.yml: FORCE
@# Check if the version is in the x.x.x format (GA release)
Expand Down
63 changes: 34 additions & 29 deletions cmd/traffic/cmd/manager/mutator/watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"slices"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/go-cmp/cmp"
Expand Down Expand Up @@ -101,7 +102,7 @@ func (c *configWatcher) isRolloutNeeded(ctx context.Context, wl k8sapi.Workload,
if ia, ok := podMeta.GetAnnotations()[agentconfig.InjectAnnotation]; ok {
// Annotation controls injection, so no explicit rollout is needed unless the deployment was added before the traffic-manager.
// If the annotation changes, there will be an implicit rollout anyway.
if wl.GetCreationTimestamp().After(c.startedAt) {
if wl.GetCreationTimestamp().After(c.startedAt) && c.running.Load() {
dlog.Debugf(ctx, "Rollout of %s.%s is not necessary. Pod template has inject annotation %s",
wl.GetName(), wl.GetNamespace(), ia)
return false
Expand Down Expand Up @@ -424,6 +425,7 @@ type configWatcher struct {
nsLocks *xsync.MapOf[string, *sync.RWMutex]
blacklistedPods *xsync.MapOf[string, time.Time]
startedAt time.Time
running atomic.Bool
rolloutDisabled bool

cms []cache.SharedIndexInformer
Expand Down Expand Up @@ -538,6 +540,7 @@ func (c *configWatcher) SetSelf(self Map) {
}

func (c *configWatcher) StartWatchers(ctx context.Context) error {
defer c.running.Store(true)
c.startedAt = time.Now()
ctx, c.cancel = context.WithCancel(ctx)
for _, si := range c.svs {
Expand Down Expand Up @@ -857,36 +860,38 @@ func (c *configWatcher) Start(ctx context.Context) {
}

func (c *configWatcher) DeleteMapsAndRolloutAll(ctx context.Context) {
c.cancel() // No more updates from watcher
now := meta.NewDeleteOptions(0)
api := k8sapi.GetK8sInterface(ctx).CoreV1()
c.nsLocks.Range(func(ns string, lock *sync.RWMutex) bool {
lock.Lock()
defer lock.Unlock()
wlm, err := data(ctx, ns)
if err != nil {
dlog.Errorf(ctx, "unable to get configmap %s.%s: %v", agentconfig.ConfigMap, ns, err)
return true
}
for k, v := range wlm {
e := &entry{name: k, namespace: ns, value: v}
scx, wl, err := e.workload(ctx)
if c.running.CompareAndSwap(true, false) {
c.cancel() // No more updates from watcher
now := meta.NewDeleteOptions(0)
api := k8sapi.GetK8sInterface(ctx).CoreV1()
c.nsLocks.Range(func(ns string, lock *sync.RWMutex) bool {
lock.Lock()
defer lock.Unlock()
wlm, err := data(ctx, ns)
if err != nil {
if !errors.IsNotFound(err) {
dlog.Errorf(ctx, "unable to get workload for %s.%s %s: %v", k, ns, v, err)
dlog.Errorf(ctx, "unable to get configmap %s.%s: %v", agentconfig.ConfigMap, ns, err)
return true
}
for k, v := range wlm {
e := &entry{name: k, namespace: ns, value: v}
scx, wl, err := e.workload(ctx)
if err != nil {
if !errors.IsNotFound(err) {
dlog.Errorf(ctx, "unable to get workload for %s.%s %s: %v", k, ns, v, err)
}
continue
}
continue
ac := scx.AgentConfig()
if ac.Create || ac.Manual {
// Deleted before it was generated or manually added, just ignore
continue
}
c.triggerRollout(ctx, wl, nil)
}
ac := scx.AgentConfig()
if ac.Create || ac.Manual {
// Deleted before it was generated or manually added, just ignore
continue
if err := api.ConfigMaps(ns).Delete(ctx, agentconfig.ConfigMap, *now); err != nil {
dlog.Errorf(ctx, "unable to delete ConfigMap %s-%s: %v", agentconfig.ConfigMap, ns, err)
}
c.triggerRollout(ctx, wl, nil)
}
if err := api.ConfigMaps(ns).Delete(ctx, agentconfig.ConfigMap, *now); err != nil {
dlog.Errorf(ctx, "unable to delete ConfigMap %s-%s: %v", agentconfig.ConfigMap, ns, err)
}
return true
})
return true
})
}
}
12 changes: 12 additions & 0 deletions docs/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,24 @@
[comment]: # (Code generated by relnotesgen. DO NOT EDIT.)
# <img src="images/logo.png" height="64px"/> Telepresence Release Notes
## Version 2.21.3
## <div style="display:flex;"><img src="images/bugfix.png" alt="bugfix" style="width:30px;height:fit-content;"/><div style="display:flex;margin-left:7px;">Using the --proxy-via flag would sometimes cause connection timeouts.</div></div>
<div style="margin-left: 15px">

Typically, a `telepresence connect --proxy-via <subnet>=<workload>` would fail with a "deadline exceeded" message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet have an agent installed, and other workloads had an agent. This was due to a race condition in the logic for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated.
</div>

## <div style="display:flex;"><img src="images/bugfix.png" alt="bugfix" style="width:30px;height:fit-content;"/><div style="display:flex;margin-left:7px;">Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS.</div></div>
<div style="margin-left: 15px">

A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the TUN-device on macOS based clients.
</div>

## <div style="display:flex;"><img src="images/bugfix.png" alt="bugfix" style="width:30px;height:fit-content;"/><div style="display:flex;margin-left:7px;">Ensure that annotation-enabled traffic-agents are uninstalled when uninstalling the traffic-manager.</div></div>
<div style="margin-left: 15px">

A traffic-agent injected because the workload had the inject annotation enabled would sometimes not get uninstalled when the traffic-manager was uninstalled.
</div>

## Version 2.21.2 <span style="font-size: 16px;">(January 26)</span>
## <div style="display:flex;"><img src="images/bugfix.png" alt="bugfix" style="width:30px;height:fit-content;"/><div style="display:flex;margin-left:7px;">Fix panic when agentpf.client creates a Tunnel</div></div>
<div style="margin-left: 15px">
Expand Down
8 changes: 8 additions & 0 deletions docs/release-notes.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,18 @@ import { Note, Title, Body } from '@site/src/components/ReleaseNotes'

# Telepresence Release Notes
## Version 2.21.3
<Note>
<Title type="bugfix">Using the --proxy-via flag would sometimes cause connection timeouts.</Title>
<Body>Typically, a `telepresence connect --proxy-via <subnet>=<workload>` would fail with a "deadline exceeded" message when several workloads were present in the namespace, the one targeted by the proxy-via didn't yet have an agent installed, and other workloads had an agent. This was due to a race condition in the logic for the agent-based port-forwards in the root daemon. The conditions causing this race are now eliminated.</Body>
</Note>
<Note>
<Title type="bugfix">Fix panic in root daemon when using the "allow conflicting subnets" feature on macOS.</Title>
<Body>A regression was introduced in version 2.21.0, causing a panic due to an unimplemented method in the TUN-device on macOS based clients.</Body>
</Note>
<Note>
<Title type="bugfix">Ensure that annotation-enabled traffic-agents are uninstalled when uninstalling the traffic-manager.</Title>
<Body>A traffic-agent injected because the workload had the inject annotation enabled would sometimes not get uninstalled when the traffic-manager was uninstalled.</Body>
</Note>
## Version 2.21.2 <span style={{fontSize:'16px'}}>(January 26)</span>
<Note>
<Title type="bugfix">Fix panic when agentpf.client creates a Tunnel</Title>
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ require (
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.10.0
github.com/telepresenceio/go-fuseftp/rpc v0.5.0
github.com/telepresenceio/telepresence/rpc/v2 v2.21.2
github.com/telepresenceio/telepresence/rpc/v2 v2.21.3-test.4
github.com/vishvananda/netlink v1.3.0
golang.org/x/exp v0.0.0-20241210194714-1829a127f884
golang.org/x/net v0.32.0
Expand Down
3 changes: 3 additions & 0 deletions integration_test/itest/namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ func (s *nsPair) RollbackTM(ctx context.Context) {
t := getT(ctx)
require.NoError(t, err)
require.NoError(t, RolloutStatusWait(ctx, s.Namespace, "deploy/traffic-manager"))
assert.Eventually(t, func() bool {
return len(RunningPodNames(ctx, "traffic-manager", s.Namespace)) == 1
}, 30*time.Second, 5*time.Second)
s.CapturePodLogs(ctx, "traffic-manager", "", s.Namespace)
}

Expand Down
Loading

0 comments on commit 7986105

Please sign in to comment.