@@ -144,12 +144,8 @@ int smCount([[maybe_unused]] uint32_t deviceId) {
144144
145145#ifdef HAS_CUPTI
146146float blocksPerSm (const CUpti_ActivityKernelType& kernel) {
147- int sm_count = smCount (kernel.deviceId );
148- if (sm_count == 0 ) {
149- return std::numeric_limits<float >::infinity ();
150- }
151147 return (kernel.gridX * kernel.gridY * kernel.gridZ ) /
152- static_cast <float >(sm_count );
148+ static_cast <float >(smCount (kernel. deviceId ) );
153149}
154150
155151float warpsPerSm (const CUpti_ActivityKernelType& kernel) {
@@ -158,52 +154,67 @@ float warpsPerSm(const CUpti_ActivityKernelType& kernel) {
158154 threads_per_warp;
159155}
160156
161- OccupancyMetrics computeOccupancyMetrics (
162- const CUpti_ActivityKernelType& kernel) {
163- OccupancyMetrics metrics;
164- const std::vector<cudaDeviceProp>& props = deviceProps ();
165- if (kernel.deviceId >= props.size ()) {
166- LOG (ERROR) << " Invalid deviceId " << kernel.deviceId
167- << " exceeds available devices (" << props.size ()
168- << " ), skipping occupancy calculation" ;
169- return metrics;
170- }
171-
172- float blocksPerSm = -1.0 ;
157+ float kernelOccupancy (const CUpti_ActivityKernelType& kernel) {
158+ float blocks_per_sm = -1.0 ;
173159 int sm_count = smCount (kernel.deviceId );
174- if (sm_count != 0 ) {
175- blocksPerSm = (kernel. gridX * kernel. gridY * kernel. gridZ ) /
176- static_cast < float >(sm_count) ;
160+ if (sm_count) {
161+ blocks_per_sm =
162+ (kernel. gridX * kernel. gridY * kernel. gridZ ) / ( float )sm_count ;
177163 }
164+ return kernelOccupancy (
165+ kernel.deviceId ,
166+ kernel.registersPerThread ,
167+ kernel.staticSharedMemory ,
168+ kernel.dynamicSharedMemory ,
169+ kernel.blockX ,
170+ kernel.blockY ,
171+ kernel.blockZ ,
172+ blocks_per_sm);
173+ }
178174
179- cudaOccFuncAttributes occFuncAttr;
180- occFuncAttr.maxThreadsPerBlock = INT_MAX;
181- occFuncAttr.numRegs = kernel.registersPerThread ;
182- occFuncAttr.sharedSizeBytes = kernel.staticSharedMemory ;
183- occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
184- occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
185- occFuncAttr.maxDynamicSharedSizeBytes = 0 ;
186- const cudaOccDeviceState occDeviceState = {};
187- int blockSize = kernel.blockX * kernel.blockY * kernel.blockZ ;
188- size_t dynamicSmemSize = kernel.dynamicSharedMemory ;
189- cudaOccDeviceProp prop (props[kernel.deviceId ]);
190- cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor (
191- &metrics.result ,
192- &prop,
193- &occFuncAttr,
194- &occDeviceState,
195- blockSize,
196- dynamicSmemSize);
197- if (status == CUDA_OCC_SUCCESS) {
198- float effectiveBlocksPerSm = std::min<float >(
199- metrics.result .activeBlocksPerMultiprocessor , blocksPerSm);
200- metrics.occupancy = effectiveBlocksPerSm * blockSize /
201- static_cast <float >(props[kernel.deviceId ].maxThreadsPerMultiProcessor );
202- } else {
203- LOG_EVERY_N (ERROR, 1000 )
204- << " Failed to calculate occupancy, status = " << status;
175+ float kernelOccupancy (
176+ uint32_t deviceId,
177+ uint16_t registersPerThread,
178+ int32_t staticSharedMemory,
179+ int32_t dynamicSharedMemory,
180+ int32_t blockX,
181+ int32_t blockY,
182+ int32_t blockZ,
183+ float blocksPerSm) {
184+ // Calculate occupancy
185+ float occupancy = -1.0 ;
186+ const std::vector<cudaDeviceProp>& props = deviceProps ();
187+ if (deviceId < props.size ()) {
188+ cudaOccFuncAttributes occFuncAttr;
189+ occFuncAttr.maxThreadsPerBlock = INT_MAX;
190+ occFuncAttr.numRegs = registersPerThread;
191+ occFuncAttr.sharedSizeBytes = staticSharedMemory;
192+ occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
193+ occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
194+ occFuncAttr.maxDynamicSharedSizeBytes = 0 ;
195+ const cudaOccDeviceState occDeviceState = {};
196+ int blockSize = blockX * blockY * blockZ;
197+ size_t dynamicSmemSize = dynamicSharedMemory;
198+ cudaOccResult occ_result;
199+ cudaOccDeviceProp prop (props[deviceId]);
200+ cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor (
201+ &occ_result,
202+ &prop,
203+ &occFuncAttr,
204+ &occDeviceState,
205+ blockSize,
206+ dynamicSmemSize);
207+ if (status == CUDA_OCC_SUCCESS) {
208+ blocksPerSm = std::min<float >(
209+ occ_result.activeBlocksPerMultiprocessor , blocksPerSm);
210+ occupancy = blocksPerSm * blockSize /
211+ (float )props[deviceId].maxThreadsPerMultiProcessor ;
212+ } else {
213+ LOG_EVERY_N (ERROR, 1000 )
214+ << " Failed to calculate occupancy, status = " << status;
215+ }
205216 }
206- return metrics ;
217+ return occupancy ;
207218}
208219#endif // HAS_CUPTI
209220
0 commit comments