update: dppo (jit-compile action sampling, add IsaacLab launch script)

tianyyiii · tianyyiii · commit dadd7ec05e6d · 2026-02-25T12:55:23.000-05:00
diff --git a/flowrl/agent/online/dppo.py b/flowrl/agent/online/dppo.py
@@ -24,8 +24,11 @@
 
 @partial(jax.jit, static_argnames=("steps", "min_logprob_std"))
 def jit_compute_chain_log_probs(
-    actor: ContinuousDDPM, obs: jnp.ndarray, chain: jnp.ndarray,
-    steps: int, min_logprob_std: float,
+    actor: ContinuousDDPM, 
+    obs: jnp.ndarray, 
+    chain: jnp.ndarray,
+    steps: int, 
+    min_logprob_std: float,
 ) -> jnp.ndarray:
     ts = quad_t_schedule(steps, n=actor.t_schedule_n,
                          tmin=actor.t_diffusion[0], tmax=actor.t_diffusion[1])
@@ -57,6 +60,24 @@ def step_fn(_, i):
     return jnp.transpose(step_lps, (1, 0))
 
 
+@partial(jax.jit, static_argnames=("steps", "min_logprob_std"))
+def jit_sample_actions(
+    rng: PRNGKey, actor: ContinuousDDPM, obs: jnp.ndarray,
+    steps: int, min_logprob_std: float,
+) -> Tuple[PRNGKey, jnp.ndarray, jnp.ndarray, jnp.ndarray]:
+    """Sample actions with the "ddpm" solver; return (rng, action, chain, log_prob)."""
+    rng, noise_key = jax.random.split(rng)
+    noise = jax.random.normal(noise_key, (obs.shape[0], actor.x_dim))
+    rng, action, history = actor.sample(
+        rng, noise, condition=obs, training=False, solver="ddpm")
+    # Append the final action after history[0], then swap the first two axes.
+    states = jnp.concatenate([history[0], action[jnp.newaxis]], axis=0)
+    chain = jnp.transpose(states, (1, 0, 2))
+    step_lps = jit_compute_chain_log_probs(actor, obs, chain, steps, min_logprob_std)
+    # Mean of the per-step log-probs over the last axis, kept as a size-1 axis.
+    return rng, action, chain, step_lps.mean(axis=-1, keepdims=True)
+
+
 @partial(jax.jit, static_argnames=(
     "gamma", "gae_lambda", "gamma_denoising",
     "clip_epsilon", "clip_epsilon_base", "clip_epsilon_rate",
@@ -246,20 +267,11 @@ def sample_actions(
         self, obs: jnp.ndarray, deterministic: bool = True, num_samples: int = 1,
     ) -> Tuple[jnp.ndarray, Metric]:
         assert num_samples == 1, "DPPO only supports num_samples=1"
-        B = obs.shape[0]
-        self.rng, xT_rng = jax.random.split(self.rng)
-        xT = jax.random.normal(xT_rng, (B, self.act_dim))
-
-        self.rng, action, history = self.actor.sample(
-            self.rng, xT, condition=obs, training=False, solver="ddpm",
-        )
-        chain = jnp.transpose(
-            jnp.concatenate([history[0], action[jnp.newaxis]], axis=0), (1, 0, 2))
-
-        step_lps = jit_compute_chain_log_probs(
-            self.actor, obs, chain,
-            self.cfg.diffusion.steps, self.cfg.diffusion.min_logprob_denoising_std,
+        self.rng, action, chain, log_prob = jit_sample_actions(
+            self.rng, 
+            self.actor, 
+            obs,
+            self.cfg.diffusion.steps, 
+            self.cfg.diffusion.min_logprob_denoising_std,
         )
-        log_prob = step_lps.mean(axis=-1, keepdims=True)
-
         return action, {"log_prob": log_prob, "action_chains": chain}
diff --git a/scripts/isaaclab/dppo.sh b/scripts/isaaclab/dppo.sh
@@ -0,0 +1,43 @@
+# GPU indices to run on; each job is pinned to one of them via CUDA_VISIBLE_DEVICES
+GPUS=(0 1 2 3 4 5 6 7)  # Modify this array to specify which GPUs to use
+SEEDS=(0 1 2 3)  # seeds passed to the training script as seed=<value>
+NUM_EACH_GPU=1  # number of concurrent jobs per GPU
+
+PARALLEL=$((NUM_EACH_GPU * ${#GPUS[@]}))  # total concurrent jobs (env_parallel -P)
+
+TASKS=(  # task names passed to the training script as task=<name>
+    "Isaac-Ant-v0"
+    "Isaac-Humanoid-v0"
+)
+
+SHARED_ARGS=(  # arguments appended to every training command
+    "algo=dppo"
+    "log.tag=default"
+)
+
+run_task() {
+    # Launch one (task, seed) training run on the GPU chosen from the job slot.
+    # Args: $1 = task name, $2 = seed, $3 = parallel job slot number ({%}).
+    local task=$1 seed=$2 slot=$3
+    # Map the job slot onto the configured GPUs round-robin.
+    local device=${GPUS[$((slot % ${#GPUS[@]}))]}
+    echo "Running $task $seed on GPU $device"
+    export CUDA_VISIBLE_DEVICES=$device
+    export XLA_PYTHON_CLIENT_PREALLOCATE="false"
+    local -a command=(
+        python3 examples/online/main_isaaclab_onpolicy.py
+        "task=$task" "seed=$seed" "${SHARED_ARGS[@]}"
+    )
+    echo "${command[*]}"
+    # With DRY_RUN set, only print the command; otherwise execute it as an argv array.
+    if [ -z "$DRY_RUN" ]; then
+        "${command[@]}"
+    fi
+}
+
+. env_parallel.bash  # defines env_parallel (GNU parallel wrapper that can run shell functions)
+if [ -n "$DRY_RUN" ]; then
+    env_parallel -P"${PARALLEL}" run_task {1} {2} {%} ::: "${TASKS[@]}" ::: "${SEEDS[@]}"
+else
+    env_parallel --bar --results "log/parallel/${name:-dppo}" -P"${PARALLEL}" run_task {1} {2} {%} ::: "${TASKS[@]}" ::: "${SEEDS[@]}"
+fi