Revert D97166802 (#1326)

generatedunixname89002005232357 · meta-codesync[bot] · commit 9d7373bfd148 · 2026-03-24T19:05:49.000-07:00
Summary: Pull Request resolved: #1326. This diff reverts D97166802. (The context, such as a Sandcastle job, Task, SEV, etc., was not provided.) Depends on D97166802. Reviewed By: scotts. Differential Revision: D97960430. fbshipit-source-id: 3f76d725701fe0e7113fd7c95f5b9ac1008f9c5c
diff --git a/libkineto/src/CuptiActivity.h b/libkineto/src/CuptiActivity.h
@@ -8,7 +8,6 @@
 
 #pragma once
 
-#include <cuda_occupancy.h>
 #include <cupti.h>
 
 #include <fmt/format.h>
@@ -440,38 +439,11 @@ inline std::string getGraphNodeMetadata(const T& activity) {
 #endif
 }
 
-// Convert limitingFactors bitmask to human-readable string
-// Based on cudaOccLimitingFactor enum from cuda_occupancy.h
-// This can be found in the CUDA toolkit typically /usr/local/cuda/targets/x86_64-linux/include/cuda_occupancy.h
-inline std::string limitingFactorsToString(unsigned int factors) {
-  if (factors == 0) {
-    return "none";
-  }
-  constexpr std::pair<unsigned int, const char*> kFactors[] = {
-      {OCC_LIMIT_WARPS, "WARPS"},
-      {OCC_LIMIT_REGISTERS, "REGS"},
-      {OCC_LIMIT_SHARED_MEMORY, "SMEM"},
-      {OCC_LIMIT_BLOCKS, "BLOCKS"},
-      {OCC_LIMIT_BARRIERS, "BARRIERS"},
-  };
-  std::string result;
-  for (const auto& [mask, name] : kFactors) {
-    if (factors & mask) {
-      if (!result.empty()) {
-        result += "|";
-      }
-      result += name;
-    }
-  }
-  return result;
-}
-
 template <>
 inline const std::string GpuActivity<CUpti_ActivityKernelType>::metadataJson() const {
   const CUpti_ActivityKernelType& kernel = raw();
   float blocksPerSmVal = blocksPerSm(kernel);
   float warpsPerSmVal = warpsPerSm(kernel);
-  OccupancyMetrics occMetrics = computeOccupancyMetrics(kernel);
 
   // clang-format off
 
@@ -484,18 +456,7 @@ inline const std::string GpuActivity<CUpti_ActivityKernelType>::metadataJson() c
       "warps per SM": {},
       "grid": [{}, {}, {}],
       "block": [{}, {}, {}],
-      "est. achieved occupancy %": {},
-      "occupancy": {{
-        "activeBlocksPerMultiprocessor": {},
-        "limitingFactors": "{}",
-        "blockLimitRegs": {},
-        "blockLimitSharedMem": {},
-        "blockLimitWarps": {},
-        "blockLimitBlocks": {},
-        "blockLimitBarriers": {},
-        "allocatedRegistersPerBlock": {},
-        "allocatedSharedMemPerBlock": {}
-      }}{})JSON",
+      "est. achieved occupancy %": {}{})JSON",
       kernel.queued, kernel.deviceId, kernel.contextId,
       kernel.streamId, kernel.correlationId,
       kernel.registersPerThread,
@@ -504,16 +465,7 @@ inline const std::string GpuActivity<CUpti_ActivityKernelType>::metadataJson() c
       std::isinf(warpsPerSmVal) ? "\"inf\"" : std::to_string(warpsPerSmVal),
       kernel.gridX, kernel.gridY, kernel.gridZ,
       kernel.blockX, kernel.blockY, kernel.blockZ,
-      static_cast<int>(std::lround(occMetrics.occupancy * 100.0)),
-      occMetrics.result.activeBlocksPerMultiprocessor,
-      limitingFactorsToString(occMetrics.result.limitingFactors),
-      occMetrics.result.blockLimitRegs,
-      occMetrics.result.blockLimitSharedMem,
-      occMetrics.result.blockLimitWarps,
-      occMetrics.result.blockLimitBlocks,
-      occMetrics.result.blockLimitBarriers,
-      occMetrics.result.allocatedRegistersPerBlock,
-      occMetrics.result.allocatedSharedMemPerBlock,
+      (int) (0.5 + (kernelOccupancy(kernel) * 100.0)),
       getGraphNodeMetadata(kernel)
       );
   // clang-format on
diff --git a/libkineto/src/DeviceProperties.cpp b/libkineto/src/DeviceProperties.cpp
@@ -144,12 +144,8 @@ int smCount([[maybe_unused]] uint32_t deviceId) {
 
 #ifdef HAS_CUPTI
 float blocksPerSm(const CUpti_ActivityKernelType& kernel) {
-  int sm_count = smCount(kernel.deviceId);
-  if (sm_count == 0) {
-    return std::numeric_limits<float>::infinity();
-  }
   return (kernel.gridX * kernel.gridY * kernel.gridZ) /
-      static_cast<float>(sm_count);
+      static_cast<float>(smCount(kernel.deviceId));
 }
 
 float warpsPerSm(const CUpti_ActivityKernelType& kernel) {
@@ -158,52 +154,67 @@ float warpsPerSm(const CUpti_ActivityKernelType& kernel) {
       threads_per_warp;
 }
 
-OccupancyMetrics computeOccupancyMetrics(
-    const CUpti_ActivityKernelType& kernel) {
-  OccupancyMetrics metrics;
-  const std::vector<cudaDeviceProp>& props = deviceProps();
-  if (kernel.deviceId >= props.size()) {
-    LOG(ERROR) << "Invalid deviceId " << kernel.deviceId
-               << " exceeds available devices (" << props.size()
-               << "), skipping occupancy calculation";
-    return metrics;
-  }
-
-  float blocksPerSm = -1.0;
+float kernelOccupancy(const CUpti_ActivityKernelType& kernel) {
+  float blocks_per_sm = -1.0;
   int sm_count = smCount(kernel.deviceId);
-  if (sm_count != 0) {
-    blocksPerSm = (kernel.gridX * kernel.gridY * kernel.gridZ) /
-        static_cast<float>(sm_count);
+  if (sm_count) {
+    blocks_per_sm =
+        (kernel.gridX * kernel.gridY * kernel.gridZ) / (float)sm_count;
   }
+  return kernelOccupancy(
+      kernel.deviceId,
+      kernel.registersPerThread,
+      kernel.staticSharedMemory,
+      kernel.dynamicSharedMemory,
+      kernel.blockX,
+      kernel.blockY,
+      kernel.blockZ,
+      blocks_per_sm);
+}
 
-  cudaOccFuncAttributes occFuncAttr;
-  occFuncAttr.maxThreadsPerBlock = INT_MAX;
-  occFuncAttr.numRegs = kernel.registersPerThread;
-  occFuncAttr.sharedSizeBytes = kernel.staticSharedMemory;
-  occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
-  occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
-  occFuncAttr.maxDynamicSharedSizeBytes = 0;
-  const cudaOccDeviceState occDeviceState = {};
-  int blockSize = kernel.blockX * kernel.blockY * kernel.blockZ;
-  size_t dynamicSmemSize = kernel.dynamicSharedMemory;
-  cudaOccDeviceProp prop(props[kernel.deviceId]);
-  cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
-      &metrics.result,
-      &prop,
-      &occFuncAttr,
-      &occDeviceState,
-      blockSize,
-      dynamicSmemSize);
-  if (status == CUDA_OCC_SUCCESS) {
-    float effectiveBlocksPerSm = std::min<float>(
-        metrics.result.activeBlocksPerMultiprocessor, blocksPerSm);
-    metrics.occupancy = effectiveBlocksPerSm * blockSize /
-        static_cast<float>(props[kernel.deviceId].maxThreadsPerMultiProcessor);
-  } else {
-    LOG_EVERY_N(ERROR, 1000)
-        << "Failed to calculate occupancy, status = " << status;
+float kernelOccupancy(
+    uint32_t deviceId,
+    uint16_t registersPerThread,
+    int32_t staticSharedMemory,
+    int32_t dynamicSharedMemory,
+    int32_t blockX,
+    int32_t blockY,
+    int32_t blockZ,
+    float blocksPerSm) {
+  // Calculate occupancy
+  float occupancy = -1.0;
+  const std::vector<cudaDeviceProp>& props = deviceProps();
+  if (deviceId < props.size()) {
+    cudaOccFuncAttributes occFuncAttr;
+    occFuncAttr.maxThreadsPerBlock = INT_MAX;
+    occFuncAttr.numRegs = registersPerThread;
+    occFuncAttr.sharedSizeBytes = staticSharedMemory;
+    occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
+    occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
+    occFuncAttr.maxDynamicSharedSizeBytes = 0;
+    const cudaOccDeviceState occDeviceState = {};
+    int blockSize = blockX * blockY * blockZ;
+    size_t dynamicSmemSize = dynamicSharedMemory;
+    cudaOccResult occ_result;
+    cudaOccDeviceProp prop(props[deviceId]);
+    cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
+        &occ_result,
+        &prop,
+        &occFuncAttr,
+        &occDeviceState,
+        blockSize,
+        dynamicSmemSize);
+    if (status == CUDA_OCC_SUCCESS) {
+      blocksPerSm = std::min<float>(
+          occ_result.activeBlocksPerMultiprocessor, blocksPerSm);
+      occupancy = blocksPerSm * blockSize /
+          (float)props[deviceId].maxThreadsPerMultiProcessor;
+    } else {
+      LOG_EVERY_N(ERROR, 1000)
+          << "Failed to calculate occupancy, status = " << status;
+    }
   }
-  return metrics;
+  return occupancy;
 }
 #endif // HAS_CUPTI
 
diff --git a/libkineto/src/DeviceProperties.h b/libkineto/src/DeviceProperties.h
@@ -12,7 +12,6 @@
 #include <string>
 
 #ifdef HAS_CUPTI
-#include <cuda_occupancy.h>
 #include <cupti.h>
 #endif
 
@@ -42,15 +41,16 @@ using CUpti_ActivityMemsetType = CUpti_ActivityMemset;
 float blocksPerSm(const CUpti_ActivityKernelType& kernel);
 float warpsPerSm(const CUpti_ActivityKernelType& kernel);
 
-// Occupancy results from CUDA occupancy calculator
-// Returns cudaOccResult from cuda_occupancy.h plus a computed occupancy metric
-struct OccupancyMetrics {
-  float occupancy = -1.0f; // Computed effective occupancy in number of threads
-  cudaOccResult result = {}; // Raw results from cudaOccMaxActiveBlocksPerMultiprocessor
-};
-
-// Return detailed occupancy metrics including limiting factors
-OccupancyMetrics computeOccupancyMetrics(const CUpti_ActivityKernelType& kernel);
+// Return estimated achieved occupancy for a kernel
+float kernelOccupancy(const CUpti_ActivityKernelType& kernel);
+float kernelOccupancy(uint32_t deviceId,
+                      uint16_t registersPerThread,
+                      int32_t staticSharedMemory,
+                      int32_t dynamicSharedMemory,
+                      int32_t blockX,
+                      int32_t blockY,
+                      int32_t blockZ,
+                      float blocks_per_sm);
 #endif
 
 } // namespace KINETO_NAMESPACE
diff --git a/libkineto/test/DevicePropertiesTest.cpp b/libkineto/test/DevicePropertiesTest.cpp