diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index ce0226defb4..bd29c8caa36 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -266,6 +266,12 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool: def ha_recovery_for_consolidation_mode(): """Recovery logic for HA mode.""" + # Touch the signal file here to avoid conflict with + # update_managed_jobs_statuses. Although we run this first and then start + # the daemon, this function is also called in cancel_jobs_by_id. + signal_file = pathlib.Path( + constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser() + signal_file.touch() # No setup recovery is needed in consolidation mode, as the API server # already has all runtime installed. Directly start jobs recovery here. # Refers to sky/templates/kubernetes-ray.yml.j2 for more details. @@ -312,6 +318,7 @@ def ha_recovery_for_consolidation_mode(): f'{datetime.datetime.now()}\n') f.write(f'HA recovery completed at {datetime.datetime.now()}\n') f.write(f'Total recovery time: {time.time() - start} seconds\n') + signal_file.unlink() async def get_job_status(