Skip to content

Commit ce68bb1

Browse files
committed
Address PR review: fix heap alloc, D2H copy, input name, EP check
1 parent ac3fbfe commit ce68bb1

4 files changed

Lines changed: 23 additions & 24 deletions

File tree

src/models/position_inputs.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -455,11 +455,10 @@ void DefaultPositionInputs::CreateAndInitializeCompactAttentionMask(DeviceSpan<i
455455
void DefaultPositionInputs::UpdateCompactAttentionMask() {
456456
// In compact mode, attention_mask has shape [batch_size, 1] containing total seq len per batch.
457457
// Each decode step adds one token, so increment each value by 1.
458+
// Use CpuSpan() as the source of truth — avoids per-token device-to-host readback.
459+
// This is safe because all non-fast-path updates go through CpuSpan() + CopyCpuToDevice().
458460
auto byte_span = attention_mask_->GetByteSpan();
459-
// CopyDeviceToCpu() ensures the CPU buffer reflects the current device contents before reading.
460-
// This is needed because the WebGPU fast path writes directly to GPU memory, so CpuSpan() alone
461-
// would read stale/uninitialized data.
462-
auto cpu_data = byte_span.CopyDeviceToCpu();
461+
auto cpu_data = byte_span.CpuSpan();
463462
if (type_ == Ort::TypeToTensorType<int32_t>) {
464463
auto* data = reinterpret_cast<int32_t*>(cpu_data.data());
465464
for (int64_t i = 0; i < attention_mask_shape_[0]; i++)

src/python/py/models/builder.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,13 @@ def check_extra_options(kv_pairs, execution_provider):
105105
)
106106
kv_pairs["enable_webgpu_graph"] = False
107107

108+
if kv_pairs.get("compact_attention_mask", False) and execution_provider != "webgpu":
109+
print(
110+
"WARNING: compact_attention_mask is currently only supported with WebGPU execution provider. Disabling compact_attention_mask."
111+
)
112+
kv_pairs["compact_attention_mask"] = False
113+
kv_pairs["enable_webgpu_graph"] = False
114+
108115

109116
def parse_extra_options(kv_items, execution_provider):
110117
"""

src/python/py/models/builders/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4463,7 +4463,7 @@ def make_attention_mask_compact_reformatting_for_gqa(self, attn_mask_basename):
44634463

44644464
# Cast from INT64 to INT32
44654465
cast_name = f"{attn_mask_basename}/Cast"
4466-
self.make_cast(cast_name, "attention_mask", dtype=ir.DataType.INT32, shape=["batch_size", 1])
4466+
self.make_cast(cast_name, self.input_names["attention_mask"], dtype=ir.DataType.INT32, shape=["batch_size", 1])
44674467

44684468
# Reshape from [batch_size, 1] to [batch_size]
44694469
reshape_name = f"{attn_mask_basename}/Reshape"

src/webgpu/interface.cpp

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -216,43 +216,36 @@ struct InterfaceImpl : DeviceInterface {
216216
return true;
217217
}
218218

219-
// Compact attention mask: write total_length into a [batch_beam_size, 1] tensor on WebGPU.
220-
// This avoids GPU->CPU->GPU round-trips by only doing a single CPU->GPU copy.
219+
// Compact attention mask: write total_length into a [1, 1] tensor on WebGPU.
220+
// Only supports batch_beam_size == 1 to avoid per-token heap allocation.
221+
// For batch_beam_size > 1, returns false to fall back to the CPU path.
221222
bool UpdateCompactAttentionMask(void* mask_data, int batch_beam_size, int total_length, ONNXTensorElementDataType type) override {
222-
if (!ort_allocator_) {
223-
throw std::runtime_error("WebGPU allocator not initialized");
223+
if (!ort_allocator_ || batch_beam_size != 1) {
224+
return false;
224225
}
225226

226-
// Prepare the values on CPU using properly aligned buffers and perform a single CPU->GPU copy
227+
// Single scalar on the stack — no heap allocation
227228
if (type == Ort::TypeToTensorType<int32_t>) {
228-
const size_t elem_size = sizeof(int32_t);
229-
const size_t byte_count = static_cast<size_t>(batch_beam_size) * elem_size;
230-
std::vector<int32_t> cpu_buffer(batch_beam_size);
231-
for (int i = 0; i < batch_beam_size; ++i) {
232-
cpu_buffer[i] = static_cast<int32_t>(total_length);
233-
}
229+
int32_t value = static_cast<int32_t>(total_length);
230+
const size_t byte_count = sizeof(int32_t);
234231

235232
int64_t shape_val = static_cast<int64_t>(byte_count);
236233
std::span<const int64_t> shape{&shape_val, 1};
237234
auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
238-
auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, cpu_buffer.data(), byte_count, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
235+
auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, &value, byte_count, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
239236
auto dst_tensor = OrtValue::CreateTensor(*ort_memory_info_, mask_data, byte_count, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
240237

241238
const std::vector<const OrtValue*> src_ptrs = {src_tensor.get()};
242239
const std::vector<OrtValue*> dst_ptrs = {dst_tensor.get()};
243240
GetOrtEnv().CopyTensors(src_ptrs, dst_ptrs, nullptr);
244241
} else {
245-
const size_t elem_size = sizeof(int64_t);
246-
const size_t byte_count = static_cast<size_t>(batch_beam_size) * elem_size;
247-
std::vector<int64_t> cpu_buffer(batch_beam_size);
248-
for (int i = 0; i < batch_beam_size; ++i) {
249-
cpu_buffer[i] = static_cast<int64_t>(total_length);
250-
}
242+
int64_t value = static_cast<int64_t>(total_length);
243+
const size_t byte_count = sizeof(int64_t);
251244

252245
int64_t shape_val = static_cast<int64_t>(byte_count);
253246
std::span<const int64_t> shape{&shape_val, 1};
254247
auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
255-
auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, cpu_buffer.data(), byte_count, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
248+
auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, &value, byte_count, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
256249
auto dst_tensor = OrtValue::CreateTensor(*ort_memory_info_, mask_data, byte_count, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
257250

258251
const std::vector<const OrtValue*> src_ptrs = {src_tensor.get()};

0 commit comments

Comments (0)