Skip to content

Commit

Permalink
Merge pull request #57 from stackhpc/bugfix/yoga/ironic-race
Browse files Browse the repository at this point in the history
Ironic nodes with instance reserved in placement
  • Loading branch information
jovial authored Nov 3, 2023
2 parents 3cedfd1 + c77acb4 commit e40b516
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 11 deletions.
15 changes: 15 additions & 0 deletions nova/conf/workarounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,21 @@
help="""
When this is enabled, it will skip version-checking of hypervisors
during live migration.
"""),
cfg.BoolOpt(
'skip_reserve_in_use_ironic_nodes',
default=False,
help="""
This may be useful if you use the Ironic driver, but don't have
automatic cleaning enabled in Ironic. Nova, by default, will mark
Ironic nodes as reserved as soon as they are in use. When you free
the Ironic node (by deleting the nova instance) it takes a while
for Nova to un-reserve that Ironic node in placement. Usually this
is a good idea, because it avoids placement providing an Ironic
as a valid candidate when it is still being cleaned.
Howerver, if you don't use automatic cleaning, it can cause an
extra delay before and Ironic node is available for building a
new Nova instance.
"""),
]

Expand Down
48 changes: 45 additions & 3 deletions nova/tests/unit/virt/ironic/test_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,48 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,

self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)

expected = {
'CUSTOM_IRON_NFV': {
'total': 1,
'reserved': 1,
'min_unit': 1,
'max_unit': 1,
'step_size': 1,
'allocation_ratio': 1.0,
},
}
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).inventory
self.assertEqual(expected, result)

@mock.patch.object(ironic_driver.IronicDriver,
'_node_resources_used', return_value=True)
@mock.patch.object(ironic_driver.IronicDriver,
'_node_resources_unavailable', return_value=False)
@mock.patch.object(ironic_driver.IronicDriver, '_node_resource')
@mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache')
def test_update_provider_tree_with_rc_occupied_workaround(self,
mock_nfc, mock_nr, mock_res_unavail, mock_res_used):
"""Ensure that when a node is used, we report the inventory matching
the consumed resources.
"""
self.flags(skip_reserve_in_use_ironic_nodes=True,
group="workarounds")
mock_nr.return_value = {
'vcpus': 24,
'vcpus_used': 24,
'memory_mb': 1024,
'memory_mb_used': 1024,
'local_gb': 100,
'local_gb_used': 100,
'resource_class': 'iron-nfv',
}

self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)

expected = {
'CUSTOM_IRON_NFV': {
'total': 1,
Expand All @@ -944,7 +986,7 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr,
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
self.assertFalse(mock_res_unavail.called)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).inventory
self.assertEqual(expected, result)

Expand Down Expand Up @@ -1015,7 +1057,7 @@ def test_update_provider_tree_no_traits(self, mock_nfc, mock_nr,
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
self.assertFalse(mock_res_unavail.called)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).traits
self.assertEqual(set(), result)

Expand Down Expand Up @@ -1047,7 +1089,7 @@ def test_update_provider_tree_with_traits(self, mock_nfc, mock_nr,
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
self.assertFalse(mock_res_unavail.called)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).traits
self.assertEqual(set(traits), result)

Expand Down
26 changes: 18 additions & 8 deletions nova/virt/ironic/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,15 +886,25 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None):
"""
# nodename is the ironic node's UUID.
node = self._node_from_cache(nodename)

reserved = False
if (not self._node_resources_used(node) and
self._node_resources_unavailable(node)):
LOG.debug('Node %(node)s is not ready for a deployment, '
'reporting resources as reserved for it. Node\'s '
'provision state is %(prov)s, power state is '
'%(power)s and maintenance is %(maint)s.',
{'node': node.uuid, 'prov': node.provision_state,
'power': node.power_state, 'maint': node.maintenance})
if self._node_resources_unavailable(node):
# Operators might mark a node as in maintainance,
# even when an instance is on the node,
# either way lets mark this as reserved
reserved = True

if (self._node_resources_used(node) and
not CONF.workarounds.skip_reserve_in_use_ironic_nodes):
# Make resources as reserved once we have
# and instance here.
# When the allocation is deleted, most likely
# automatic clean will start, so we keep the node
# reserved until it becomes available again.
# In the case without automatic clean, once
# the allocation is removed in placement it
# also stays as reserved until we notice on
# the next periodic its actually available.
reserved = True

info = self._node_resource(node)
Expand Down
11 changes: 11 additions & 0 deletions releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
fixes:
- |
Fixed when placement returns ironic nodes that have just started automatic
cleaning as possible valid candidates. This is done by marking all ironic
nodes with an instance on them as reserved, such that nova only makes them
available once we have double checked Ironic reports the node as available.
If you don't have automatic cleaning on, this might mean it takes longer
than normal for Ironic nodes to become available for new instances.
If you want the old behaviour use the following workaround config:
`[workarounds]skip_reserve_in_use_ironic_nodes=true`

0 comments on commit e40b516

Please sign in to comment.