 import argparse
+import os
+import sys  # assumed needed for sys.exit(main()) below; drop if already imported in elided lines
 from ams.ams_flux import AMSFluxExecutor
 import time
 from ams.ams_jobs import nested_instance_job_descr, get_echo_job
 import warnings
 from flux.job import FluxExecutor
 import flux
+from flux.job import kill as fkill


 def verify_arg(name, uri, nodes):
@@ -27,7 +29,7 @@ def get_partition_uri(root_executor, nnodes, cores_per_node, gpus_per_node, time
     uri = fut.uri()
     nested_instance = flux.Flux(uri)
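+    # The state-machine.wait RPC should return once the nested broker reaches its RUN state, i.e. the instance is ready for work.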
     nested_instance.rpc("state-machine.wait").get()
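+    # Return the submit future as well so callers can later cancel or kill the partition job.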
-    return uri
+    return uri, fut


 def main():
@@ -57,6 +59,17 @@ def main():

     args = parser.parse_args()

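+    # If --sleep-time is 0, fall back to the enclosing SLURM allocation window.
+    # (Assumes SLURM_JOB_START_TIME / SLURM_JOB_END_TIME are exported as epoch seconds, as recent Slurm versions do.)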
+    sleep_time = int(args.sleep_time)
+    if sleep_time == 0:
+        start = int(os.getenv("SLURM_JOB_START_TIME", "0"))
+        end = int(os.getenv("SLURM_JOB_END_TIME", "0"))
+        sleep_time = end - start
+
+    if sleep_time == 0:
+        print("Cannot create background job with 0 time")
+        return 1
+    print(f"Partitions will be allocated for {sleep_time} seconds")
+
     wf_manager = AMSWorkflowManager.from_descr(args.workflow_descr, args.credentials)
     print(wf_manager)

@@ -83,31 +96,58 @@ def main():
     # NOTE: We need an AMSFluxExecutor to easily get the flux uri because FluxExecutor does not provide the respective API
     # We set track_uri to true to enable the executor to generate futures tracking the uri of submitted jobs
     start = time.time()
-    with AMSFluxExecutor(True, threads=6, handle_args=(args.root_uri,)) as root_executor:
+    with AMSFluxExecutor(True, threads=1, handle_args=(args.root_uri,)) as root_executor:
         print("Spawning Flux executor for root took", time.time() - start)
         start = time.time()
-        domain_uri = get_partition_uri(root_executor, num_domain_nodes, cores_per_node, gpus_per_node, args.sleep_time)
+        domain_uri, domain_future = get_partition_uri(
+            root_executor, num_domain_nodes, cores_per_node, gpus_per_node, str(sleep_time)
+        )
         print("Resolving domain uri took", time.time() - start, domain_uri)
         start = time.time()
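+        # Futures for the partitions we launch ourselves; they stay None when a URI was supplied externally.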
+        ml_future = None
+        stage_future = None

         if ml_uri is None:
-            ml_uri = get_partition_uri(root_executor, num_ml_nodes, cores_per_node, gpus_per_node, args.sleep_time)
+            (
+                ml_uri,
+                ml_future,
+            ) = get_partition_uri(root_executor, num_ml_nodes, cores_per_node, gpus_per_node, str(sleep_time))
         print("Resolving ML uri took", time.time() - start, ml_uri)
         start = time.time()

         if stage_uri is None:
-            stage_uri = get_partition_uri(
-                root_executor, num_stage_nodes, cores_per_node, gpus_per_node, args.sleep_time
+            stage_uri, stage_future = get_partition_uri(
+                root_executor, num_stage_nodes, cores_per_node, gpus_per_node, str(sleep_time)
             )
         print("Resolving stage uri took", time.time() - start, stage_uri)

         # 1) We first schedule the ML training orchestrator.
-        print("Here")
         wf_manager.start(ml_uri, stage_uri, domain_uri)
-        print("Done")
-
-        return
+        # The root executor should not wait: the partitions have "infinite" allocation time, so we forcefully shut them down.
+        print("All internal executors are done ... moving to stopping root job")
+        # TODO: When I get here I need to kill all the jobs of the partitions and exit.
+        print("Stopping domain partition...")
+        domain_handle = flux.Flux(domain_uri)
+        domain_handle.rpc("state-machine.wait").get()
+        print("Cancel", domain_future.jobid())
+        # fkill(domain_handle, domain_future.jobid())
+
+        if ml_future is not None:
+            print("Stopping ML partition...")
+            ml_handle = flux.Flux(ml_uri)
+            ml_handle.rpc("state-machine.wait").get()
+        if stage_future is not None:
+            print("Stopping stager partition...")
+            stage_future.cancel()
+            stage_handle = flux.Flux(stage_uri)
+            stage_handle.rpc("state-machine.wait").get()
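+        # shutdown(wait=False, cancel_futures=True) returns immediately and cancels any still-pending
+        # futures rather than blocking on the long-lived partition jobs.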
+        print("Shutting down root executor")
+        root_executor.shutdown(wait=False, cancel_futures=True)
+        print("All daemons are down")
+        print("Exiting")
+
+        return 0


 if __name__ == "__main__":
-    main()
+    sys.exit(main())