From 82af069fb5d9609eae66fdbd07e2602412f1adbc Mon Sep 17 00:00:00 2001 From: Marcelo Guerrero Date: Mon, 30 Oct 2023 16:58:16 +0100 Subject: [PATCH] Verify status changes on managed interfaces Users could modify the settings of VFs which have been configured by the sriov operator. This PR starts the reconciliation loop when these changes are detected in the generic plugin. Signed-off-by: Marcelo Guerrero --- pkg/daemon/daemon.go | 164 ++++++++++++--------- pkg/plugins/fake/fake_plugin.go | 4 + pkg/plugins/generic/generic_plugin.go | 25 ++++ pkg/plugins/generic/generic_plugin_test.go | 100 +++++++++++++ pkg/plugins/intel/intel_plugin.go | 9 +- pkg/plugins/k8s/k8s_plugin.go | 6 + pkg/plugins/mellanox/mellanox_plugin.go | 6 + pkg/plugins/mock/mock_plugin.go | 22 ++- pkg/plugins/plugin.go | 10 +- pkg/plugins/virtual/virtual_plugin.go | 5 + 10 files changed, 272 insertions(+), 79 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 07eefb4f8..65964d7bc 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -313,77 +313,60 @@ func (dn *Daemon) nodeStateSyncHandler() error { latest := dn.desiredNodeState.GetGeneration() log.Log.V(0).Info("nodeStateSyncHandler(): new generation", "generation", latest) - if dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() { - if vars.UsingSystemdMode { - serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath) - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host") - return err - } - postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath) - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host") - return err - } - - // if the service doesn't exist we should continue to let the k8s plugin to create the service files - // this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply - // the system service and reboot the node the config-daemon doesn't need to do anything. - if !(serviceEnabled && postNetworkServiceEnabled) { - sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed, - LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+ - "sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)} - } else { - sriovResult, err = systemd.ReadSriovResult() - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host") - return err - } - } - if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed { - log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError) - - // add the error but don't requeue - dn.refreshCh <- Message{ - syncStatus: consts.SyncStatusFailed, - lastSyncError: sriovResult.LastSyncError, - } - <-dn.syncCh - return nil - } - } - log.Log.V(0).Info("nodeStateSyncHandler(): Interface not changed") - if dn.desiredNodeState.Status.LastSyncError != "" || - dn.desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded { - dn.refreshCh <- Message{ - syncStatus: consts.SyncStatusSucceeded, - lastSyncError: "", - } - // wait for writer to refresh the status - <-dn.syncCh + // load plugins if it has not loaded + if len(dn.loadedPlugins) == 0 { + dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins) + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins") + return err } - - return nil } - if dn.desiredNodeState.GetGeneration() == 1 && len(dn.desiredNodeState.Spec.Interfaces) == 0 { - err = dn.HostHelpers.ClearPCIAddressFolder() + if vars.UsingSystemdMode && dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() { + serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath) if err != nil { - log.Log.Error(err, "failed to clear the PCI address configuration") + log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host") + return err + } + postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath) + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host") return err } - log.Log.V(0).Info( - "nodeStateSyncHandler(): interface policy spec not yet set by controller for sriovNetworkNodeState", - "name", dn.desiredNodeState.Name) - if dn.desiredNodeState.Status.SyncStatus != "Succeeded" { + // if the service doesn't exist we should continue to let the k8s plugin to create the service files + // this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply + // the system service and reboot the node the config-daemon doesn't need to do anything. + if !(serviceEnabled && postNetworkServiceEnabled) { + sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed, + LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+ + "sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)} + } else { + sriovResult, err = systemd.ReadSriovResult() + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host") + return err + } + } + if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed { + log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError) + + // add the error but don't requeue dn.refreshCh <- Message{ - syncStatus: "Succeeded", - lastSyncError: "", + syncStatus: consts.SyncStatusFailed, + lastSyncError: sriovResult.LastSyncError, } - // wait for writer to refresh status <-dn.syncCh + return nil } + } + + skip, err := dn.shouldSkipReconciliation(dn.desiredNodeState) + if err != nil { + return err + } + // Reconcile only when there are changes in the spec or status of managed VFs. + if skip { return nil } @@ -404,15 +387,6 @@ func (dn *Daemon) nodeStateSyncHandler() error { } dn.desiredNodeState.Status = updatedState.Status - // load plugins if it has not loaded - if len(dn.loadedPlugins) == 0 { - dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins) - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins") - return err - } - } - reqReboot := false reqDrain := false @@ -564,6 +538,58 @@ func (dn *Daemon) nodeStateSyncHandler() error { return nil } +func (dn *Daemon) shouldSkipReconciliation(latestState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + var err error + + // Skip when SriovNetworkNodeState object has just been created. + if latestState.GetGeneration() == 1 && len(latestState.Spec.Interfaces) == 0 { + err = dn.HostHelpers.ClearPCIAddressFolder() + if err != nil { + log.Log.Error(err, "failed to clear the PCI address configuration") + return false, err + } + + log.Log.V(0).Info( + "shouldSkipReconciliation(): interface policy spec not yet set by controller for sriovNetworkNodeState", + "name", latestState.Name) + if latestState.Status.SyncStatus != consts.SyncStatusSucceeded { + dn.refreshCh <- Message{ + syncStatus: consts.SyncStatusSucceeded, + lastSyncError: "", + } + // wait for writer to refresh status + <-dn.syncCh + } + return true, nil + } + + // Verify changes in the spec of the SriovNetworkNodeState CR. + if dn.currentNodeState.GetGeneration() == latestState.GetGeneration() && !dn.isDrainCompleted() { + genericPlugin, ok := dn.loadedPlugins[GenericPluginName] + if ok { + // Verify changes in the status of the SriovNetworkNodeState CR. + if genericPlugin.CheckStatusChanges(latestState) { + return false, nil + } + } + + log.Log.V(0).Info("shouldSkipReconciliation(): Interface not changed") + if latestState.Status.LastSyncError != "" || + latestState.Status.SyncStatus != consts.SyncStatusSucceeded { + dn.refreshCh <- Message{ + syncStatus: consts.SyncStatusSucceeded, + lastSyncError: "", + } + // wait for writer to refresh the status + <-dn.syncCh + } + + return true, nil + } + + return false, nil +} + // handleDrain: adds the right annotation to the node and nodeState object // returns true if we need to finish the reconcile loop and wait for a new object func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) { diff --git a/pkg/plugins/fake/fake_plugin.go b/pkg/plugins/fake/fake_plugin.go index f7fe6e56f..11c30d6ea 100644 --- a/pkg/plugins/fake/fake_plugin.go +++ b/pkg/plugins/fake/fake_plugin.go @@ -21,6 +21,10 @@ func (f *FakePlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState return false, false, nil } +func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + func (f *FakePlugin) Apply() error { return nil } diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go index 274f0dc01..20ef4a49c 100644 --- a/pkg/plugins/generic/generic_plugin.go +++ b/pkg/plugins/generic/generic_plugin.go @@ -139,6 +139,31 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt return } +// CheckStatusChanges verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *GenericPlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool { + log.Log.Info("generic-plugin CheckStatusChanges()") + + changed := false + for _, iface := range new.Spec.Interfaces { + found := false + for _, ifaceStatus := range new.Status.Interfaces { + if iface.PciAddress == ifaceStatus.PciAddress && !iface.ExternallyManaged { + found = true + if sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) { + changed = true + log.Log.Info("CheckStatusChanges(): status changed for interface", "address", iface.PciAddress) + } + break + } + } + + if !found { + log.Log.Info("CheckStatusChanges(): no status found for interface", "address", iface.PciAddress) + } + } + return changed +} + func (p *GenericPlugin) syncDriverState() error { for _, driverState := range p.DriverStateMap { if !driverState.DriverLoaded && driverState.NeedDriverFunc(p.DesireState, driverState) { diff --git a/pkg/plugins/generic/generic_plugin_test.go b/pkg/plugins/generic/generic_plugin_test.go index 9f9c1be83..6b75535bb 100644 --- a/pkg/plugins/generic/generic_plugin_test.go +++ b/pkg/plugins/generic/generic_plugin_test.go @@ -137,6 +137,106 @@ var _ = Describe("Generic plugin", func() { Expect(needDrain).To(BeTrue()) }) + It("should not detect changes on status", func() { + networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ + Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ + Interfaces: sriovnetworkv1.Interfaces{{ + PciAddress: "0000:00:00.0", + NumVfs: 1, + VfGroups: []sriovnetworkv1.VfGroup{{ + DeviceType: "netdevice", + PolicyName: "policy-1", + ResourceName: "resource-1", + VfRange: "0-0", + }}}}, + }, + Status: sriovnetworkv1.SriovNetworkNodeStateStatus{ + Interfaces: sriovnetworkv1.InterfaceExts{{ + PciAddress: "0000:00:00.0", + NumVfs: 1, + TotalVfs: 1, + DeviceID: "1015", + Vendor: "15b3", + Name: "sriovif1", + Mtu: 1500, + Mac: "0c:42:a1:55:ee:46", + Driver: "mlx5_core", + EswitchMode: "legacy", + LinkSpeed: "25000 Mb/s", + LinkType: "ETH", + VFs: []sriovnetworkv1.VirtualFunction{{ + PciAddress: "0000:00:00.1", + DeviceID: "1016", + Vendor: "15b3", + VfID: 0, + Name: "sriovif1v0", + Mtu: 1500, + Mac: "8e:d6:2c:62:87:1b", + Driver: "mlx5_core", + }}, + }}, + }, + } + + changed := genericPlugin.CheckStatusChanges(networkNodeState) + Expect(err).ToNot(HaveOccurred()) + Expect(changed).To(BeFalse()) + }) + + It("should detect changes on status", func() { + networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ + Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ + Interfaces: sriovnetworkv1.Interfaces{{ + PciAddress: "0000:00:00.0", + NumVfs: 2, + Mtu: 1500, + VfGroups: []sriovnetworkv1.VfGroup{{ + DeviceType: "netdevice", + PolicyName: "policy-1", + ResourceName: "resource-1", + VfRange: "0-1", + Mtu: 1500, + }}}}, + }, + Status: sriovnetworkv1.SriovNetworkNodeStateStatus{ + Interfaces: sriovnetworkv1.InterfaceExts{{ + PciAddress: "0000:00:00.0", + NumVfs: 2, + TotalVfs: 2, + DeviceID: "1015", + Vendor: "15b3", + Name: "sriovif1", + Mtu: 1500, + Mac: "0c:42:a1:55:ee:46", + Driver: "mlx5_core", + EswitchMode: "legacy", + LinkSpeed: "25000 Mb/s", + LinkType: "ETH", + VFs: []sriovnetworkv1.VirtualFunction{{ + PciAddress: "0000:00:00.1", + DeviceID: "1016", + Vendor: "15b3", + VfID: 0, + Driver: "mlx5_core", + Name: "sriovif1v0", + Mtu: 999, // Bad MTU value, changed by the user application + Mac: "8e:d6:2c:62:87:1b", + }, { + PciAddress: "0000:00:00.2", + DeviceID: "1016", + Vendor: "15b3", + VfID: 0, + Driver: "mlx5_core", + }}, + }}, + }, + } + + changed := genericPlugin.CheckStatusChanges(networkNodeState) + Expect(err).ToNot(HaveOccurred()) + Expect(changed).To(BeTrue()) + }) + It("should load vfio_pci driver", func() { networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ diff --git a/pkg/plugins/intel/intel_plugin.go b/pkg/plugins/intel/intel_plugin.go index 467116ddb..a9174bdba 100644 --- a/pkg/plugins/intel/intel_plugin.go +++ b/pkg/plugins/intel/intel_plugin.go @@ -35,11 +35,16 @@ func (p *IntelPlugin) Spec() string { } // OnNodeStateChange Invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node -func (p *IntelPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) (needDrain bool, needReboot bool, err error) { - log.Log.Info("intel plugin OnNodeStateChange()") +func (p *IntelPlugin) OnNodeStateChange(*sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) { + log.Log.Info("intel-plugin OnNodeStateChange()") return false, false, nil } +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *IntelPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *IntelPlugin) Apply() error { log.Log.Info("intel plugin Apply()") diff --git a/pkg/plugins/k8s/k8s_plugin.go b/pkg/plugins/k8s/k8s_plugin.go index 118073ee5..aa28c0aa6 100644 --- a/pkg/plugins/k8s/k8s_plugin.go +++ b/pkg/plugins/k8s/k8s_plugin.go @@ -154,6 +154,12 @@ func (p *K8sPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) return } +// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/630 +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *K8sPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *K8sPlugin) Apply() error { log.Log.Info("k8s plugin Apply()") diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 6b1b943de..1aa1f6434 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -171,6 +171,12 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS return } +// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/631 +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *MellanoxPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *MellanoxPlugin) Apply() error { if p.helpers.IsKernelLockdownMode() { diff --git a/pkg/plugins/mock/mock_plugin.go b/pkg/plugins/mock/mock_plugin.go index b821139c5..044d30f0e 100644 --- a/pkg/plugins/mock/mock_plugin.go +++ b/pkg/plugins/mock/mock_plugin.go @@ -48,6 +48,20 @@ func (mr *MockVendorPluginMockRecorder) Apply() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Apply", reflect.TypeOf((*MockVendorPlugin)(nil).Apply)) } +// CheckStatusChanges mocks base method. +func (m *MockVendorPlugin) CheckStatusChanges(arg0 *v1.SriovNetworkNodeState) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CheckStatusChanges", arg0) + ret0, _ := ret[0].(bool) + return ret0 +} + +// CheckStatusChanges indicates an expected call of CheckStatusChanges. +func (mr *MockVendorPluginMockRecorder) CheckStatusChanges(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckStatusChanges", reflect.TypeOf((*MockVendorPlugin)(nil).CheckStatusChanges), arg0) +} + // Name mocks base method. func (m *MockVendorPlugin) Name() string { m.ctrl.T.Helper() @@ -63,9 +77,9 @@ func (mr *MockVendorPluginMockRecorder) Name() *gomock.Call { } // OnNodeStateChange mocks base method. -func (m *MockVendorPlugin) OnNodeStateChange(new *v1.SriovNetworkNodeState) (bool, bool, error) { +func (m *MockVendorPlugin) OnNodeStateChange(arg0 *v1.SriovNetworkNodeState) (bool, bool, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "OnNodeStateChange", new) + ret := m.ctrl.Call(m, "OnNodeStateChange", arg0) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(bool) ret2, _ := ret[2].(error) @@ -73,9 +87,9 @@ func (m *MockVendorPlugin) OnNodeStateChange(new *v1.SriovNetworkNodeState) (boo } // OnNodeStateChange indicates an expected call of OnNodeStateChange. -func (mr *MockVendorPluginMockRecorder) OnNodeStateChange(new interface{}) *gomock.Call { +func (mr *MockVendorPluginMockRecorder) OnNodeStateChange(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnNodeStateChange", reflect.TypeOf((*MockVendorPlugin)(nil).OnNodeStateChange), new) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnNodeStateChange", reflect.TypeOf((*MockVendorPlugin)(nil).OnNodeStateChange), arg0) } // Spec mocks base method. diff --git a/pkg/plugins/plugin.go b/pkg/plugins/plugin.go index 276e37cf2..3136dfd46 100644 --- a/pkg/plugins/plugin.go +++ b/pkg/plugins/plugin.go @@ -6,12 +6,14 @@ import ( //go:generate ../../bin/mockgen -destination mock/mock_plugin.go -source plugin.go type VendorPlugin interface { - // Return the name of plugin + // Name returns the name of plugin Name() string - // Return the SpecVersion followed by plugin + // Spec returns the SpecVersion followed by plugin Spec() string - // Invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node - OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) + // OnNodeStateChange is invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node + OnNodeStateChange(*sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) // Apply config change Apply() error + // CheckStatusChanges checks status changes on the SriovNetworkNodeState CR for configured VFs. + CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool } diff --git a/pkg/plugins/virtual/virtual_plugin.go b/pkg/plugins/virtual/virtual_plugin.go index a942aaa86..cd43b120b 100644 --- a/pkg/plugins/virtual/virtual_plugin.go +++ b/pkg/plugins/virtual/virtual_plugin.go @@ -67,6 +67,11 @@ func (p *VirtualPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt return } +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *VirtualPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *VirtualPlugin) Apply() error { log.Log.Info("virtual plugin Apply()", "desired-state", p.DesireState.Spec)