Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

connectivity: Check for unexpected packet drops #2151

Merged
merged 3 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions connectivity/check/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,11 @@ type Parameters struct {
ConnDisruptTestRestartsPath string
ConnDisruptTestXfrmErrorsPath string
ConnDisruptDispatchInterval time.Duration
FlushCT bool
SecondaryNetworkIface string

ExpectedDropReasons []string

FlushCT bool
SecondaryNetworkIface string

K8sVersion string
HelmChartDirectory string
Expand Down
4 changes: 2 additions & 2 deletions connectivity/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,10 @@ func Run(ctx context.Context, ct *check.ConnectivityTest, addExtraTests func(*ch
// include --include-conn-disrupt-test"
return ct.Run(ctx)
}

ct.NewTest("no-missed-tail-calls").WithScenarios(tests.NoMissedTailCalls())
}

ct.NewTest("no-unexpected-packet-drops").WithScenarios(tests.NoUnexpectedPacketDrops(ct.Params().ExpectedDropReasons))

// Run all tests without any policies in place.
noPoliciesScenarios := []check.Scenario{
tests.PodToPod(),
Expand Down
75 changes: 56 additions & 19 deletions connectivity/tests/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ package tests

import (
"context"
"strconv"
"fmt"
"strings"
"time"

Expand Down Expand Up @@ -75,46 +75,83 @@ func (n *noErrorsInLogs) Run(ctx context.Context, t *check.Test) {

}

// NoMissedTailCalls checks whether there were no drops due to missed (BPF)
// tail calls.
func NoMissedTailCalls() check.Scenario {
return &noMissedTailCalls{}
// NoUnexpectedPacketDrops checks whether there were no drops due to expected
// packet drops.
func NoUnexpectedPacketDrops(expectedDrops []string) check.Scenario {
return &noUnexpectedPacketDrops{expectedDrops}
}

type noMissedTailCalls struct{}
type noUnexpectedPacketDrops struct {
expectedDrops []string
}

func (n *noMissedTailCalls) Name() string {
return "no-missed-tail-calls"
func (n *noUnexpectedPacketDrops) Name() string {
return "no-unexpected-packet-drops"
}

func (n *noMissedTailCalls) Run(ctx context.Context, t *check.Test) {
func (n *noUnexpectedPacketDrops) Run(ctx context.Context, t *check.Test) {
ct := t.Context()

filter := computeExpectedDropReasons(defaults.ExpectedDropReasons, n.expectedDrops)
cmd := []string{
"/bin/sh", "-c",
"cilium metrics list -o json | jq '.[] | select( .name == \"cilium_drop_count_total\" and .labels.reason == \"Missed tail call\" ).value'",
fmt.Sprintf("cilium metrics list -o json | jq '.[] | select((.name == \"cilium_drop_count_total\") and (.labels.reason | IN(%s) | not))'", filter),
}

for _, pod := range ct.CiliumPods() {
pod := pod
stdout, err := pod.K8sClient.ExecInPod(ctx, pod.Pod.Namespace, pod.Pod.Name, defaults.AgentContainerName, cmd)
if err != nil {
t.Fatalf("Error fetching missed tail call drop counts: %s", err)
t.Fatalf("Error fetching packet drop counts: %s", err)
}
countStr := strings.TrimSpace(stdout.String())
if countStr == "" {
return
if countStr != "" {
t.Fatalf("Found unexpected packet drops:\n%s", countStr)
}
}
}

count, err := strconv.Atoi(countStr)
if err != nil {
t.Fatalf("Failed to convert missed tail call drops %q to int: %s", countStr, err)
func computeExpectedDropReasons(defaultReasons, inputReasons []string) string {
filter := ""
if len(inputReasons) > 0 {
// Build final list of drop reasons based on default reasons, added
// reasons (+ prefix), and removed reasons (- prefix).
addedAllDefaults := false
dropReasons := map[string]bool{}
for _, reason := range inputReasons {
if reason[0] == '+' || reason[0] == '-' {
if !addedAllDefaults {
for _, r := range defaultReasons {
dropReasons[r] = true
}
addedAllDefaults = true
}
}
switch reason[0] {
case '+':
dropReasons[reason[1:]] = true
case '-':
dropReasons[reason[1:]] = false
default:
dropReasons[reason] = true
}
}

if count != 0 {
t.Fatalf("Detected drops due to missed tail calls: %d", count)
// Build output string in form of '"reason1", "reason2"'.
i := 0
for reason, isIn := range dropReasons {
if !isIn {
continue
}
if i == 0 {
filter = fmt.Sprintf("%q", reason)
} else {
filter = fmt.Sprintf("%s, %q", filter, reason)
}
i++
}
}

return filter
}

func (n *noErrorsInLogs) checkErrorsInLogs(logs string, t *check.Test) {
Expand Down
69 changes: 69 additions & 0 deletions connectivity/tests/errors_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package tests

import (
"fmt"
"strings"
"testing"
)

func TestComputeExpectedDropReasons(t *testing.T) {
defaultReasons := []string{"reason0", "reason1"}
tests := []struct {
inputReasons []string
expectedReasons []string
}{
// Empty list of reasons.
{
inputReasons: []string{},
expectedReasons: []string{},
},
// Add a reason to default list.
{
inputReasons: []string{"+reason2"},
expectedReasons: []string{"reason0", "reason1", "reason2"},
},
// Remove a reason from default list.
{
inputReasons: []string{"-reason1"},
expectedReasons: []string{"reason0"},
},
// Add a reason then remove it.
{
inputReasons: []string{"+reason2", "-reason2"},
expectedReasons: []string{"reason0", "reason1"},
},
// Remove a reason then add it back.
{
inputReasons: []string{"-reason1", "+reason1"},
expectedReasons: []string{"reason0", "reason1"},
},
}

for _, test := range tests {
t.Run(fmt.Sprintf("InputReasons: %v", test.inputReasons), func(t *testing.T) {
result := computeExpectedDropReasons(defaultReasons, test.inputReasons)

// computeExpectedDropReasons doesn't guarantee the order of the
// emitted string so instead we check the length and that all
// expected elements are listed.

for _, expReason := range test.expectedReasons {
if !strings.Contains(result, fmt.Sprintf("%q", expReason)) {
t.Errorf("Expected filter to contain %q, but got: %v", expReason, result)
}
}

// We know the expected length because all our test reasons are the same length.
expectedLength := len(test.expectedReasons) * 9
if len(test.expectedReasons) > 1 {
expectedLength += (len(test.expectedReasons) - 1) * 2
}
if len(result) != expectedLength {
t.Errorf("Expected filter of length %d, but got: %v", expectedLength, result)
}
})
}
}
13 changes: 13 additions & 0 deletions defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,17 @@ var (
"authentication.mutual.spire.install.agent.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=NotIn",
"authentication.mutual.spire.install.agent.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]=true",
}

ExpectedDropReasons = []string{
"Policy denied",
"Policy denied by denylist",
"Unsupported L3 protocol",
"Stale or unroutable IP",
"Authentication required",
"Service backend not found",
"Unsupported protocol for NAT masquerade",
"Invalid source ip",
"Unknown L3 target address",
"No tunnel/encapsulation endpoint (datapath BUG!)",
}
)
4 changes: 4 additions & 0 deletions internal/cli/cmd/connectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ func newCmdConnectivityTest(hooks Hooks) *cobra.Command {
cmd.Flags().StringVar(&params.ConnDisruptTestRestartsPath, "conn-disrupt-test-restarts-path", "/tmp/cilium-conn-disrupt-restarts", "Conn disrupt test temporary result file (used internally)")
cmd.Flags().StringVar(&params.ConnDisruptTestXfrmErrorsPath, "conn-disrupt-test-xfrm-errors-path", "/tmp/cilium-conn-disrupt-xfrm-errors", "Conn disrupt test temporary result file (used internally)")
cmd.Flags().DurationVar(&params.ConnDisruptDispatchInterval, "conn-disrupt-dispatch-interval", 0, "TCP packet dispatch interval")

cmd.Flags().StringSliceVar(&params.ExpectedDropReasons, "expected-drop-reasons", defaults.ExpectedDropReasons, "List of expected drop reasons")
pchaigno marked this conversation as resolved.
Show resolved Hide resolved
cmd.Flags().MarkHidden("expected-drop-reasons")

cmd.Flags().BoolVar(&params.FlushCT, "flush-ct", false, "Flush conntrack of Cilium on each node")
cmd.Flags().MarkHidden("flush-ct")
cmd.Flags().StringVar(&params.SecondaryNetworkIface, "secondary-network-iface", "", "Secondary network iface name (e.g., to test NodePort BPF on multiple networks)")
Expand Down
Loading