Skip to content

Commit 9d7373b

Browse files
generatedunixname89002005232357 authored and meta-codesync[bot] committed
Revert D97166802 (#1326)
Summary:
Pull Request resolved: #1326

This diff reverts D97166802. (The context, such as a Sandcastle job, Task, SEV, etc., was not provided.)

Depends on D97166802

Reviewed By: scotts

Differential Revision: D97960430

fbshipit-source-id: 3f76d725701fe0e7113fd7c95f5b9ac1008f9c5c
1 parent 3a61657 commit 9d7373b

4 files changed

Lines changed: 70 additions & 160 deletions

File tree

libkineto/src/CuptiActivity.h

Lines changed: 2 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88

99
#pragma once
1010

11-
#include <cuda_occupancy.h>
1211
#include <cupti.h>
1312

1413
#include <fmt/format.h>
@@ -440,38 +439,11 @@ inline std::string getGraphNodeMetadata(const T& activity) {
440439
#endif
441440
}
442441

443-
// Convert limitingFactors bitmask to human-readable string
444-
// Based on cudaOccLimitingFactor enum from cuda_occupancy.h
445-
// This can be found in the CUDA toolkit typically /usr/local/cuda/targets/x86_64-linux/include/cuda_occupancy.h
446-
inline std::string limitingFactorsToString(unsigned int factors) {
447-
if (factors == 0) {
448-
return "none";
449-
}
450-
constexpr std::pair<unsigned int, const char*> kFactors[] = {
451-
{OCC_LIMIT_WARPS, "WARPS"},
452-
{OCC_LIMIT_REGISTERS, "REGS"},
453-
{OCC_LIMIT_SHARED_MEMORY, "SMEM"},
454-
{OCC_LIMIT_BLOCKS, "BLOCKS"},
455-
{OCC_LIMIT_BARRIERS, "BARRIERS"},
456-
};
457-
std::string result;
458-
for (const auto& [mask, name] : kFactors) {
459-
if (factors & mask) {
460-
if (!result.empty()) {
461-
result += "|";
462-
}
463-
result += name;
464-
}
465-
}
466-
return result;
467-
}
468-
469442
template <>
470443
inline const std::string GpuActivity<CUpti_ActivityKernelType>::metadataJson() const {
471444
const CUpti_ActivityKernelType& kernel = raw();
472445
float blocksPerSmVal = blocksPerSm(kernel);
473446
float warpsPerSmVal = warpsPerSm(kernel);
474-
OccupancyMetrics occMetrics = computeOccupancyMetrics(kernel);
475447

476448
// clang-format off
477449

@@ -484,18 +456,7 @@ inline const std::string GpuActivity<CUpti_ActivityKernelType>::metadataJson() c
484456
"warps per SM": {},
485457
"grid": [{}, {}, {}],
486458
"block": [{}, {}, {}],
487-
"est. achieved occupancy %": {},
488-
"occupancy": {{
489-
"activeBlocksPerMultiprocessor": {},
490-
"limitingFactors": "{}",
491-
"blockLimitRegs": {},
492-
"blockLimitSharedMem": {},
493-
"blockLimitWarps": {},
494-
"blockLimitBlocks": {},
495-
"blockLimitBarriers": {},
496-
"allocatedRegistersPerBlock": {},
497-
"allocatedSharedMemPerBlock": {}
498-
}}{})JSON",
459+
"est. achieved occupancy %": {}{})JSON",
499460
kernel.queued, kernel.deviceId, kernel.contextId,
500461
kernel.streamId, kernel.correlationId,
501462
kernel.registersPerThread,
@@ -504,16 +465,7 @@ inline const std::string GpuActivity<CUpti_ActivityKernelType>::metadataJson() c
504465
std::isinf(warpsPerSmVal) ? "\"inf\"" : std::to_string(warpsPerSmVal),
505466
kernel.gridX, kernel.gridY, kernel.gridZ,
506467
kernel.blockX, kernel.blockY, kernel.blockZ,
507-
static_cast<int>(std::lround(occMetrics.occupancy * 100.0)),
508-
occMetrics.result.activeBlocksPerMultiprocessor,
509-
limitingFactorsToString(occMetrics.result.limitingFactors),
510-
occMetrics.result.blockLimitRegs,
511-
occMetrics.result.blockLimitSharedMem,
512-
occMetrics.result.blockLimitWarps,
513-
occMetrics.result.blockLimitBlocks,
514-
occMetrics.result.blockLimitBarriers,
515-
occMetrics.result.allocatedRegistersPerBlock,
516-
occMetrics.result.allocatedSharedMemPerBlock,
468+
(int) (0.5 + (kernelOccupancy(kernel) * 100.0)),
517469
getGraphNodeMetadata(kernel)
518470
);
519471
// clang-format on

libkineto/src/DeviceProperties.cpp

Lines changed: 58 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -144,12 +144,8 @@ int smCount([[maybe_unused]] uint32_t deviceId) {
144144

145145
#ifdef HAS_CUPTI
146146
float blocksPerSm(const CUpti_ActivityKernelType& kernel) {
147-
int sm_count = smCount(kernel.deviceId);
148-
if (sm_count == 0) {
149-
return std::numeric_limits<float>::infinity();
150-
}
151147
return (kernel.gridX * kernel.gridY * kernel.gridZ) /
152-
static_cast<float>(sm_count);
148+
static_cast<float>(smCount(kernel.deviceId));
153149
}
154150

155151
float warpsPerSm(const CUpti_ActivityKernelType& kernel) {
@@ -158,52 +154,67 @@ float warpsPerSm(const CUpti_ActivityKernelType& kernel) {
158154
threads_per_warp;
159155
}
160156

161-
OccupancyMetrics computeOccupancyMetrics(
162-
const CUpti_ActivityKernelType& kernel) {
163-
OccupancyMetrics metrics;
164-
const std::vector<cudaDeviceProp>& props = deviceProps();
165-
if (kernel.deviceId >= props.size()) {
166-
LOG(ERROR) << "Invalid deviceId " << kernel.deviceId
167-
<< " exceeds available devices (" << props.size()
168-
<< "), skipping occupancy calculation";
169-
return metrics;
170-
}
171-
172-
float blocksPerSm = -1.0;
157+
float kernelOccupancy(const CUpti_ActivityKernelType& kernel) {
158+
float blocks_per_sm = -1.0;
173159
int sm_count = smCount(kernel.deviceId);
174-
if (sm_count != 0) {
175-
blocksPerSm = (kernel.gridX * kernel.gridY * kernel.gridZ) /
176-
static_cast<float>(sm_count);
160+
if (sm_count) {
161+
blocks_per_sm =
162+
(kernel.gridX * kernel.gridY * kernel.gridZ) / (float)sm_count;
177163
}
164+
return kernelOccupancy(
165+
kernel.deviceId,
166+
kernel.registersPerThread,
167+
kernel.staticSharedMemory,
168+
kernel.dynamicSharedMemory,
169+
kernel.blockX,
170+
kernel.blockY,
171+
kernel.blockZ,
172+
blocks_per_sm);
173+
}
178174

179-
cudaOccFuncAttributes occFuncAttr;
180-
occFuncAttr.maxThreadsPerBlock = INT_MAX;
181-
occFuncAttr.numRegs = kernel.registersPerThread;
182-
occFuncAttr.sharedSizeBytes = kernel.staticSharedMemory;
183-
occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
184-
occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
185-
occFuncAttr.maxDynamicSharedSizeBytes = 0;
186-
const cudaOccDeviceState occDeviceState = {};
187-
int blockSize = kernel.blockX * kernel.blockY * kernel.blockZ;
188-
size_t dynamicSmemSize = kernel.dynamicSharedMemory;
189-
cudaOccDeviceProp prop(props[kernel.deviceId]);
190-
cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
191-
&metrics.result,
192-
&prop,
193-
&occFuncAttr,
194-
&occDeviceState,
195-
blockSize,
196-
dynamicSmemSize);
197-
if (status == CUDA_OCC_SUCCESS) {
198-
float effectiveBlocksPerSm = std::min<float>(
199-
metrics.result.activeBlocksPerMultiprocessor, blocksPerSm);
200-
metrics.occupancy = effectiveBlocksPerSm * blockSize /
201-
static_cast<float>(props[kernel.deviceId].maxThreadsPerMultiProcessor);
202-
} else {
203-
LOG_EVERY_N(ERROR, 1000)
204-
<< "Failed to calculate occupancy, status = " << status;
175+
float kernelOccupancy(
176+
uint32_t deviceId,
177+
uint16_t registersPerThread,
178+
int32_t staticSharedMemory,
179+
int32_t dynamicSharedMemory,
180+
int32_t blockX,
181+
int32_t blockY,
182+
int32_t blockZ,
183+
float blocksPerSm) {
184+
// Calculate occupancy
185+
float occupancy = -1.0;
186+
const std::vector<cudaDeviceProp>& props = deviceProps();
187+
if (deviceId < props.size()) {
188+
cudaOccFuncAttributes occFuncAttr;
189+
occFuncAttr.maxThreadsPerBlock = INT_MAX;
190+
occFuncAttr.numRegs = registersPerThread;
191+
occFuncAttr.sharedSizeBytes = staticSharedMemory;
192+
occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
193+
occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
194+
occFuncAttr.maxDynamicSharedSizeBytes = 0;
195+
const cudaOccDeviceState occDeviceState = {};
196+
int blockSize = blockX * blockY * blockZ;
197+
size_t dynamicSmemSize = dynamicSharedMemory;
198+
cudaOccResult occ_result;
199+
cudaOccDeviceProp prop(props[deviceId]);
200+
cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
201+
&occ_result,
202+
&prop,
203+
&occFuncAttr,
204+
&occDeviceState,
205+
blockSize,
206+
dynamicSmemSize);
207+
if (status == CUDA_OCC_SUCCESS) {
208+
blocksPerSm = std::min<float>(
209+
occ_result.activeBlocksPerMultiprocessor, blocksPerSm);
210+
occupancy = blocksPerSm * blockSize /
211+
(float)props[deviceId].maxThreadsPerMultiProcessor;
212+
} else {
213+
LOG_EVERY_N(ERROR, 1000)
214+
<< "Failed to calculate occupancy, status = " << status;
215+
}
205216
}
206-
return metrics;
217+
return occupancy;
207218
}
208219
#endif // HAS_CUPTI
209220

libkineto/src/DeviceProperties.h

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
#include <string>
1313

1414
#ifdef HAS_CUPTI
15-
#include <cuda_occupancy.h>
1615
#include <cupti.h>
1716
#endif
1817

@@ -42,15 +41,16 @@ using CUpti_ActivityMemsetType = CUpti_ActivityMemset;
4241
float blocksPerSm(const CUpti_ActivityKernelType& kernel);
4342
float warpsPerSm(const CUpti_ActivityKernelType& kernel);
4443

45-
// Occupancy results from CUDA occupancy calculator
46-
// Returns cudaOccResult from cuda_occupancy.h plus a computed occupancy metric
47-
struct OccupancyMetrics {
48-
float occupancy = -1.0f; // Computed effective occupancy in number of threads
49-
cudaOccResult result = {}; // Raw results from cudaOccMaxActiveBlocksPerMultiprocessor
50-
};
51-
52-
// Return detailed occupancy metrics including limiting factors
53-
OccupancyMetrics computeOccupancyMetrics(const CUpti_ActivityKernelType& kernel);
44+
// Return estimated achieved occupancy for a kernel
45+
float kernelOccupancy(const CUpti_ActivityKernelType& kernel);
46+
float kernelOccupancy(uint32_t deviceId,
47+
uint16_t registersPerThread,
48+
int32_t staticSharedMemory,
49+
int32_t dynamicSharedMemory,
50+
int32_t blockX,
51+
int32_t blockY,
52+
int32_t blockZ,
53+
float blocks_per_sm);
5454
#endif
5555

5656
} // namespace KINETO_NAMESPACE

libkineto/test/DevicePropertiesTest.cpp

Lines changed: 0 additions & 53 deletions
This file was deleted.

0 commit comments

Comments
 (0)