Occupancy API

CUDA의 cudaOccupancyMaxPotentialBlockSize 함수는 kernel 함수를 실행하기에 적절한 block 크기와 grid 크기를 제안해 준다. 이 함수를 사용하면 GPU 기종마다 적절한 크기를 직접 바꿔 주어야 하는 문제를 해결할 수 있다.

// Minimal launch-configuration snippet: let the runtime suggest a block size
// for `kernel`, then derive a grid size that covers all N elements.
int N = 10000000;   // number of elements handed to the kernel
int blockSize, minGridSize, gridSize;
// Arguments 4 and 5 are dynamicSMemSize (0 bytes here) and blockSizeLimit;
// passing N caps the suggested block size at the element count.
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, kernel, 0, N );
// Round up so that a partially filled last block is still launched.
gridSize = ( N + blockSize - 1 ) / blockSize;
// Never launch fewer blocks than the suggested minimum grid size.
gridSize = max( minGridSize, gridSize );
kernel<<< gridSize, blockSize >>> ( array, N );


#include "stdio.h"

// Squares each of the first `arrayCount` elements of `array` in place.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index falls at or beyond `arrayCount` do nothing, so the grid may be
// rounded up past the element count.
__global__ void MyKernel(int *array, int arrayCount)
{
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < arrayCount)
  {
    array[idx] *= array[idx];
  }
}

// Reports a failed CUDA runtime call on stderr; returns true on cudaSuccess.
static bool cudaCallSucceeded(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Launches MyKernel over `arrayCount` elements of `array` using the block size
// suggested by the occupancy API, waits for completion, then prints the block
// size and the theoretical occupancy for that configuration.
//
// array:      pointer passed straight through to MyKernel
// arrayCount: number of elements to process; nothing is launched when <= 0
//
// On any CUDA runtime failure a message is written to stderr and the function
// returns early.
void launchMyKernel(int *array, int arrayCount)
{
  // A non-positive count would yield a zero-sized grid, which is an invalid
  // launch configuration.
  if (arrayCount <= 0)
    return;

  int blockSize;   // The launch configurator returned block size
  int minGridSize; // The minimum grid size needed to achieve the
                   // maximum occupancy for a full device launch
  int gridSize;    // The actual grid size needed, based on input size

  if (!cudaCallSucceeded(
          cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                             MyKernel, 0, 0),
          "cudaOccupancyMaxPotentialBlockSize"))
    return;

  // Round up according to array size
  gridSize = (arrayCount + blockSize - 1) / blockSize;

  MyKernel<<< gridSize, blockSize >>>(array, arrayCount);

  // A launch returns no status itself: configuration errors are picked up by
  // cudaGetLastError, execution errors by the synchronizing call.
  if (!cudaCallSucceeded(cudaGetLastError(), "MyKernel launch"))
    return;
  if (!cudaCallSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
    return;

  // calculate theoretical occupancy
  int maxActiveBlocks;
  if (!cudaCallSucceeded(
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks,
                                                        MyKernel, blockSize,
                                                        0),
          "cudaOccupancyMaxActiveBlocksPerMultiprocessor"))
    return;

  // Query the properties of the current device; `device` must be filled in
  // by cudaGetDevice before it is used.
  int device;
  cudaDeviceProp props;
  if (!cudaCallSucceeded(cudaGetDevice(&device), "cudaGetDevice"))
    return;
  if (!cudaCallSucceeded(cudaGetDeviceProperties(&props, device),
                         "cudaGetDeviceProperties"))
    return;

  // Active warps per multiprocessor divided by the maximum warps per
  // multiprocessor.
  float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                    (float)(props.maxThreadsPerMultiProcessor /
                            props.warpSize);

  printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
         blockSize, occupancy);
}


template < class T >
__host__ cudaError_t cudaOccupancyMaxPotentialBlockSize ( int* minGridSize, int* blockSize, T func, size_t dynamicSMemSize = 0, int  blockSizeLimit = 0 ) [inline] 

Returns grid and block size that achieves maximum potential occupancy for a device function.

  • Parameters
    • minGridSize: Returned minimum grid size needed to achieve the best potential occupancy
    • blockSize: Returned block size
    • func: Device function symbol
    • dynamicSMemSize: Per-block dynamic shared memory usage intended, in bytes
    • blockSizeLimit: The maximum block size func is designed to work with. 0 means no limit.
  • Description
    • Returns in minGridSize and blockSize a suggested grid / block size pair that achieves the best potential occupancy (i.e. the maximum number of active warps with the smallest number of blocks).

