fix: keep eval-all llm cuda usage within memory

rupurt · rupurt · commit d95345d5cc3c · 2026-03-27T07:55:49.000-07:00
diff --git a/CONFIGURATION.md b/CONFIGURATION.md
@@ -203,4 +203,9 @@ Uses the **Gemma 3** family of models for semantic reranking. Similar to the Qwe
 | `SIFT_BLOBS_CACHE` | Specific override for the blob store. |
 | `SIFT_MANIFESTS_CACHE` | Specific override for the project manifests. |
 | `SIFT_MODELS_CACHE` | Specific override for downloaded ML models. |
+| `SIFT_DENSE_DEVICE` | Dense embedding device override: `cpu` or `cuda`. |
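+| `SIFT_LLM_DEVICE` | Fallback device override for Candle-backed LLM paths, used only when the model-specific variable (`SIFT_QWEN_DEVICE`, `SIFT_JINA_DEVICE`, `SIFT_GEMMA_DEVICE`) is not set: `cpu` or `cuda`. |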
+| `SIFT_QWEN_DEVICE` | Qwen-specific device override: `cpu` or `cuda`. |
+| `SIFT_JINA_DEVICE` | Jina-specific device override: `cpu` or `cuda`. |
+| `SIFT_GEMMA_DEVICE` | Gemma-specific device override: `cpu` or `cuda`. |
 | `HF_TOKEN` | Hugging Face API token for downloading gated models (e.g., Jina Reranker v3 and Gemma 3). |
diff --git a/EVALUATIONS.md b/EVALUATIONS.md
@@ -119,7 +119,22 @@ just sift --cuda eval agentic \
 ```
 
 This `--cuda` switch is handled by the `just` recipe, not by the `sift` CLI itself.
-By default, that recipe keeps the dense embedder on CPU (`SIFT_DENSE_DEVICE=cpu`) so local GPUs can be reserved for Qwen/Jina/Gemma during evals. If you want dense embeddings on CUDA too, override it explicitly with `SIFT_DENSE_DEVICE=cuda just sift --cuda ...`.
+By default, that recipe keeps the dense embedder on CPU (`SIFT_DENSE_DEVICE=cpu`) so local GPUs can be reserved for LLM-backed eval paths. For `eval all`, it also keeps the heavier Jina and Gemma rerankers on CPU by default (`SIFT_JINA_DEVICE=cpu`, `SIFT_GEMMA_DEVICE=cpu`) to avoid CUDA OOM across back-to-back strategy runs.
+
+If you want to override those defaults, you can set:
+
+- `SIFT_DENSE_DEVICE=cuda`
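+- `SIFT_LLM_DEVICE=cuda|cpu` (fallback for models whose own `SIFT_<MODEL>_DEVICE` is unset; it does not override the `eval all` Jina/Gemma CPU defaults, so set those variables directly)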
+- `SIFT_QWEN_DEVICE=cuda|cpu`
+- `SIFT_JINA_DEVICE=cuda|cpu`
+- `SIFT_GEMMA_DEVICE=cuda|cpu`
+
+Example:
+
+```bash
+SIFT_JINA_DEVICE=cuda SIFT_GEMMA_DEVICE=cuda \
+  just sift --cuda eval all --dataset scifact
+```
 
 ---
 
diff --git a/justfile b/justfile
@@ -73,14 +73,23 @@ sift *args:
         cargo_args=(--release); \
         env_args=(); \
         sift_args=(); \
+        cuda_enabled=0; \
         for arg in "$@"; do \
             if [ "$arg" = "--cuda" ]; then \
+                cuda_enabled=1; \
                 cargo_args+=(--features cuda); \
                 env_args+=("SIFT_DENSE_DEVICE=${SIFT_DENSE_DEVICE:-cpu}"); \
             else \
                 sift_args+=("$arg"); \
             fi; \
         done; \
+        if [ "$cuda_enabled" -eq 1 ] \
+            && [ "${#sift_args[@]}" -ge 2 ] \
+            && [ "${sift_args[0]}" = "eval" ] \
+            && [ "${sift_args[1]}" = "all" ]; then \
+            env_args+=("SIFT_JINA_DEVICE=${SIFT_JINA_DEVICE:-cpu}"); \
+            env_args+=("SIFT_GEMMA_DEVICE=${SIFT_GEMMA_DEVICE:-cpu}"); \
+        fi; \
         env "${env_args[@]}" cargo run "${cargo_args[@]}" -- "${sift_args[@]}" \
     ' -- {{args}}
 
diff --git a/src/search/adapters/gemma.rs b/src/search/adapters/gemma.rs
@@ -74,7 +74,7 @@ impl GemmaReranker {
         let tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|m| anyhow!("failed to load tokenizer: {}", m))?;
 
-        let device = super::llm_utils::get_device()?;
+        let device = super::llm_utils::get_device_for("GEMMA")?;
         let vb = load_mmaped_safetensors_with_repair(
             &spec.model_id,
             &spec.revision,
diff --git a/src/search/adapters/jina.rs b/src/search/adapters/jina.rs
@@ -95,7 +95,7 @@ impl JinaReranker {
         let tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|m| anyhow!("failed to load tokenizer: {}", m))?;
 
-        let device = super::llm_utils::get_device()?;
+        let device = super::llm_utils::get_device_for("JINA")?;
         let vb = load_mmaped_safetensors_with_repair(
             &spec.model_id,
             &spec.revision,
diff --git a/src/search/adapters/llm_utils.rs b/src/search/adapters/llm_utils.rs
@@ -172,15 +172,48 @@ pub fn load_mmaped_safetensors_with_repair(
 }
 
 pub fn get_device() -> Result<Device> {
+    get_device_for("LLM")
+}
+
+pub fn get_device_for(kind: &str) -> Result<Device> {
+    let specific_env = format!("SIFT_{}_DEVICE", kind);
+    let requested_device = match std::env::var(&specific_env) {
+        Ok(value) => Some((specific_env.clone(), value)),
+        Err(_) => std::env::var("SIFT_LLM_DEVICE")
+            .ok()
+            .map(|value| ("SIFT_LLM_DEVICE".to_string(), value)),
+    };
+
+    if let Some((source, value)) = requested_device {
+        match value.to_ascii_lowercase().as_str() {
+            "cpu" => {
+                tracing::info!("Using CPU for {} via {}", kind, source);
+                return Ok(Device::Cpu);
+            }
+            "cuda" => {}
+            other => {
+                bail!(
+                    "unsupported device override '{}' in {} (expected 'cpu' or 'cuda')",
+                    other,
+                    source
+                );
+            }
+        }
+    }
+
     #[cfg(feature = "cuda")]
     {
         match Device::new_cuda(0) {
             Ok(d) => {
-                tracing::info!("Using CUDA device 0");
+                tracing::info!("Using CUDA device 0 for {}", kind);
                 Ok(d)
             }
             Err(e) => {
-                tracing::warn!("Failed to initialize CUDA, falling back to CPU: {:?}", e);
+                tracing::warn!(
+                    "Failed to initialize CUDA for {}, falling back to CPU: {:?}",
+                    kind,
+                    e
+                );
                 Ok(Device::Cpu)
             }
         }
diff --git a/src/search/adapters/qwen.rs b/src/search/adapters/qwen.rs
@@ -76,7 +76,7 @@ impl QwenReranker {
         let tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|m| anyhow!("failed to load tokenizer: {}", m))?;
 
-        let device = super::llm_utils::get_device()?;
+        let device = super::llm_utils::get_device_for("QWEN")?;
         let vb = load_mmaped_safetensors_with_repair(
             &spec.model_id,
             &spec.revision,