diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eedb2d341..b78134f882 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ CHANGELOG - Reduce EFA installation time for Ubuntu by ~20 minutes by only holding kernel packages for the installed kernel. - Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. - Fix validation error messages when `DevSettings` is null or `DevSettings/InstanceTypesData` is missing required fields. +- Fix an issue where cfn-hup enters an endless loop on the head node after a rollback to a cluster state older than 24 hours, caused by cfn-signal failing to signal an expired wait condition handle. 3.14.0 ------ diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 4d771b5beb..67a573e6de 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -1496,6 +1496,12 @@ def _add_head_node(self): "chefUpdate": { "commands": { "chef": { + # This command runs the update recipe and signals CloudFormation with the result. + # The trailing "|| exit 0" ensures cfn-hup always updates its local metadata cache + # (metadata_db.json) regardless of whether cfn-signal succeeds or fails. + # Without this, if cfn-signal fails (e.g., due to an expired wait condition handle + # after a rollback to a state older than 24h), cfn-hup would not update its cache + # and would enter an endless loop, re-triggering the update recipe every minute. "command": ( ". /etc/parallelcluster/pcluster_cookbook_environment.sh; " "cinc-client --local-mode --config /etc/chef/client.rb --log_level info" @@ -1509,6 +1515,7 @@ def _add_head_node(self): f" $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-signal --exit-code=1 --reason='Update failed'" f" --region {self.stack.region} --url {cloudformation_url}" f" '{self.wait_condition_handle.ref}'" + " || exit 0" ), "cwd": "/etc/chef", }