Skip to content

Commit

Permalink
Verify status changes on managed interfaces
Browse files Browse the repository at this point in the history
Users could modify the settings of VFs which have been
configured by the sriov operator. This PR starts the
reconciliation loop when these changes are detected in the
generic plugin.

Signed-off-by: Marcelo Guerrero <[email protected]>
  • Loading branch information
mlguerrero12 committed Apr 24, 2024
1 parent 707a757 commit 82af069
Show file tree
Hide file tree
Showing 10 changed files with 272 additions and 79 deletions.
164 changes: 95 additions & 69 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,77 +313,60 @@ func (dn *Daemon) nodeStateSyncHandler() error {
latest := dn.desiredNodeState.GetGeneration()
log.Log.V(0).Info("nodeStateSyncHandler(): new generation", "generation", latest)

if dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() {
if vars.UsingSystemdMode {
serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host")
return err
}
postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host")
return err
}

// if the service doesn't exist we should continue to let the k8s plugin to create the service files
// this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
// the system service and reboot the node the config-daemon doesn't need to do anything.
if !(serviceEnabled && postNetworkServiceEnabled) {
sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed,
LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
} else {
sriovResult, err = systemd.ReadSriovResult()
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host")
return err
}
}
if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed {
log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError)

// add the error but don't requeue
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusFailed,
lastSyncError: sriovResult.LastSyncError,
}
<-dn.syncCh
return nil
}
}
log.Log.V(0).Info("nodeStateSyncHandler(): Interface not changed")
if dn.desiredNodeState.Status.LastSyncError != "" ||
dn.desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh the status
<-dn.syncCh
// load plugins if it has not loaded
if len(dn.loadedPlugins) == 0 {
dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
}

return nil
}

if dn.desiredNodeState.GetGeneration() == 1 && len(dn.desiredNodeState.Spec.Interfaces) == 0 {
err = dn.HostHelpers.ClearPCIAddressFolder()
if vars.UsingSystemdMode && dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() {
serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath)
if err != nil {
log.Log.Error(err, "failed to clear the PCI address configuration")
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host")
return err
}
postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host")
return err
}

log.Log.V(0).Info(
"nodeStateSyncHandler(): interface policy spec not yet set by controller for sriovNetworkNodeState",
"name", dn.desiredNodeState.Name)
if dn.desiredNodeState.Status.SyncStatus != "Succeeded" {
// if the service doesn't exist we should continue to let the k8s plugin to create the service files
// this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
// the system service and reboot the node the config-daemon doesn't need to do anything.
if !(serviceEnabled && postNetworkServiceEnabled) {
sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed,
LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
} else {
sriovResult, err = systemd.ReadSriovResult()
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host")
return err
}
}
if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed {
log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError)

// add the error but don't requeue
dn.refreshCh <- Message{
syncStatus: "Succeeded",
lastSyncError: "",
syncStatus: consts.SyncStatusFailed,
lastSyncError: sriovResult.LastSyncError,
}
// wait for writer to refresh status
<-dn.syncCh
return nil
}
}

skip, err := dn.shouldSkipReconciliation(dn.desiredNodeState)
if err != nil {
return err
}
// Reconcile only when there are changes in the spec or status of managed VFs.
if skip {
return nil
}

Expand All @@ -404,15 +387,6 @@ func (dn *Daemon) nodeStateSyncHandler() error {
}
dn.desiredNodeState.Status = updatedState.Status

// load plugins if it has not loaded
if len(dn.loadedPlugins) == 0 {
dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
}
}

reqReboot := false
reqDrain := false

Expand Down Expand Up @@ -564,6 +538,58 @@ func (dn *Daemon) nodeStateSyncHandler() error {
return nil
}

func (dn *Daemon) shouldSkipReconciliation(latestState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
var err error

// Skip when SriovNetworkNodeState object has just been created.
if latestState.GetGeneration() == 1 && len(latestState.Spec.Interfaces) == 0 {
err = dn.HostHelpers.ClearPCIAddressFolder()
if err != nil {
log.Log.Error(err, "failed to clear the PCI address configuration")
return false, err
}

log.Log.V(0).Info(
"shouldSkipReconciliation(): interface policy spec not yet set by controller for sriovNetworkNodeState",
"name", latestState.Name)
if latestState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh status
<-dn.syncCh
}
return true, nil
}

// Verify changes in the spec of the SriovNetworkNodeState CR.
if dn.currentNodeState.GetGeneration() == latestState.GetGeneration() && !dn.isDrainCompleted() {
genericPlugin, ok := dn.loadedPlugins[GenericPluginName]
if ok {
// Verify changes in the status of the SriovNetworkNodeState CR.
if genericPlugin.CheckStatusChanges(latestState) {
return false, nil
}
}

log.Log.V(0).Info("shouldSkipReconciliation(): Interface not changed")
if latestState.Status.LastSyncError != "" ||
latestState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh the status
<-dn.syncCh
}

return true, nil
}

return false, nil
}

// handleDrain: adds the right annotation to the node and nodeState object
// returns true if we need to finish the reconcile loop and wait for a new object
func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/plugins/fake/fake_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ func (f *FakePlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState
return false, false, nil
}

func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool {
return false
}

func (f *FakePlugin) Apply() error {
return nil
}
25 changes: 25 additions & 0 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,31 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt
return
}

// CheckStatusChanges verify whether SriovNetworkNodeState CR status present changes on configured VFs.
func (p *GenericPlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool {
log.Log.Info("generic-plugin CheckStatusChanges()")

changed := false
for _, iface := range new.Spec.Interfaces {
found := false
for _, ifaceStatus := range new.Status.Interfaces {
if iface.PciAddress == ifaceStatus.PciAddress && !iface.ExternallyManaged {
found = true
if sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) {
changed = true
log.Log.Info("CheckStatusChanges(): status changed for interface", "address", iface.PciAddress)
}
break
}
}

if !found {
log.Log.Info("CheckStatusChanges(): no status found for interface", "address", iface.PciAddress)
}
}
return changed
}

func (p *GenericPlugin) syncDriverState() error {
for _, driverState := range p.DriverStateMap {
if !driverState.DriverLoaded && driverState.NeedDriverFunc(p.DesireState, driverState) {
Expand Down
100 changes: 100 additions & 0 deletions pkg/plugins/generic/generic_plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,106 @@ var _ = Describe("Generic plugin", func() {
Expect(needDrain).To(BeTrue())
})

It("should not detect changes on status", func() {
networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{
Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Interfaces: sriovnetworkv1.Interfaces{{
PciAddress: "0000:00:00.0",
NumVfs: 1,
VfGroups: []sriovnetworkv1.VfGroup{{
DeviceType: "netdevice",
PolicyName: "policy-1",
ResourceName: "resource-1",
VfRange: "0-0",
}}}},
},
Status: sriovnetworkv1.SriovNetworkNodeStateStatus{
Interfaces: sriovnetworkv1.InterfaceExts{{
PciAddress: "0000:00:00.0",
NumVfs: 1,
TotalVfs: 1,
DeviceID: "1015",
Vendor: "15b3",
Name: "sriovif1",
Mtu: 1500,
Mac: "0c:42:a1:55:ee:46",
Driver: "mlx5_core",
EswitchMode: "legacy",
LinkSpeed: "25000 Mb/s",
LinkType: "ETH",
VFs: []sriovnetworkv1.VirtualFunction{{
PciAddress: "0000:00:00.1",
DeviceID: "1016",
Vendor: "15b3",
VfID: 0,
Name: "sriovif1v0",
Mtu: 1500,
Mac: "8e:d6:2c:62:87:1b",
Driver: "mlx5_core",
}},
}},
},
}

changed := genericPlugin.CheckStatusChanges(networkNodeState)
Expect(err).ToNot(HaveOccurred())
Expect(changed).To(BeFalse())
})

It("should detect changes on status", func() {
networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{
Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Interfaces: sriovnetworkv1.Interfaces{{
PciAddress: "0000:00:00.0",
NumVfs: 2,
Mtu: 1500,
VfGroups: []sriovnetworkv1.VfGroup{{
DeviceType: "netdevice",
PolicyName: "policy-1",
ResourceName: "resource-1",
VfRange: "0-1",
Mtu: 1500,
}}}},
},
Status: sriovnetworkv1.SriovNetworkNodeStateStatus{
Interfaces: sriovnetworkv1.InterfaceExts{{
PciAddress: "0000:00:00.0",
NumVfs: 2,
TotalVfs: 2,
DeviceID: "1015",
Vendor: "15b3",
Name: "sriovif1",
Mtu: 1500,
Mac: "0c:42:a1:55:ee:46",
Driver: "mlx5_core",
EswitchMode: "legacy",
LinkSpeed: "25000 Mb/s",
LinkType: "ETH",
VFs: []sriovnetworkv1.VirtualFunction{{
PciAddress: "0000:00:00.1",
DeviceID: "1016",
Vendor: "15b3",
VfID: 0,
Driver: "mlx5_core",
Name: "sriovif1v0",
Mtu: 999, // Bad MTU value, changed by the user application
Mac: "8e:d6:2c:62:87:1b",
}, {
PciAddress: "0000:00:00.2",
DeviceID: "1016",
Vendor: "15b3",
VfID: 0,
Driver: "mlx5_core",
}},
}},
},
}

changed := genericPlugin.CheckStatusChanges(networkNodeState)
Expect(err).ToNot(HaveOccurred())
Expect(changed).To(BeTrue())
})

It("should load vfio_pci driver", func() {
networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{
Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Expand Down
9 changes: 7 additions & 2 deletions pkg/plugins/intel/intel_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,16 @@ func (p *IntelPlugin) Spec() string {
}

// OnNodeStateChange Invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node
func (p *IntelPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) (needDrain bool, needReboot bool, err error) {
log.Log.Info("intel plugin OnNodeStateChange()")
func (p *IntelPlugin) OnNodeStateChange(*sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) {
log.Log.Info("intel-plugin OnNodeStateChange()")
return false, false, nil
}

// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs.
func (p *IntelPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool {
return false
}

// Apply config change
func (p *IntelPlugin) Apply() error {
log.Log.Info("intel plugin Apply()")
Expand Down
6 changes: 6 additions & 0 deletions pkg/plugins/k8s/k8s_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ func (p *K8sPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState)
return
}

// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/630
// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs.
func (p *K8sPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool {
return false
}

// Apply config change
func (p *K8sPlugin) Apply() error {
log.Log.Info("k8s plugin Apply()")
Expand Down
6 changes: 6 additions & 0 deletions pkg/plugins/mellanox/mellanox_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,12 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
return
}

// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/631
// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs.
func (p *MellanoxPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool {
return false
}

// Apply config change
func (p *MellanoxPlugin) Apply() error {
if p.helpers.IsKernelLockdownMode() {
Expand Down
Loading

0 comments on commit 82af069

Please sign in to comment.