Skip to content

CUDA:Indexing

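CUDA thread indexing cheatsheet: each function below returns the flattened (linear) global index of the calling thread for one combination of grid and block dimensionality.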

1D grid of 1D blocks

// Flattened global thread index for a 1D grid of 1D blocks.
// Result = (threads in all preceding blocks) + (this thread's position
// inside its own block).
__device__ int getGlobalIdx_1D_1D()
{
    const unsigned threadsBeforeThisBlock = blockIdx.x * blockDim.x;
    const unsigned offsetInBlock = threadIdx.x;
    return threadsBeforeThisBlock + offsetInBlock;
}

1D grid of 2D blocks

// Flattened global thread index for a 1D grid of 2D blocks.
// Threads inside a block are linearized with x varying fastest, then y.
__device__ int getGlobalIdx_1D_2D()
{
    const unsigned threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned blockBase = blockIdx.x * threadsPerBlock;  // threads in preceding blocks
    const unsigned rowBase = threadIdx.y * blockDim.x;        // threads in preceding rows of this block
    return blockBase + rowBase + threadIdx.x;
}

1D grid of 3D blocks

// Flattened global thread index for a 1D grid of 3D blocks.
// Threads inside a block are linearized with x varying fastest, then y,
// then z.
__device__ int getGlobalIdx_1D_3D()
{
    const unsigned threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    const unsigned threadsPerSlice = blockDim.y * blockDim.x;  // one z-slice of a block

    const unsigned globalBlockOffset = blockIdx.x * threadsPerBlock;
    const unsigned localThreadZOffset = threadIdx.z * threadsPerSlice;
    const unsigned localThreadYOffset = threadIdx.y * blockDim.x;
    const unsigned localThreadXOffset = threadIdx.x;

    return globalBlockOffset
           + localThreadZOffset
           + localThreadYOffset
           + localThreadXOffset;
}

2D grid of 1D blocks

// Flattened global thread index for a 2D grid of 1D blocks.
// Blocks are numbered across the grid with x varying fastest, then y.
__device__ int getGlobalIdx_2D_1D()
{
    const int linearBlock = blockIdx.x + gridDim.x * blockIdx.y;
    return linearBlock * blockDim.x + threadIdx.x;
}

2D grid of 2D blocks

 // Flattened global thread index for a 2D grid of 2D blocks.
 // Blocks are numbered with x varying fastest, then y; threads inside a
 // block are linearized the same way.
 __device__ int getGlobalIdx_2D_2D()
{
    const unsigned threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned linearBlock = gridDim.x * blockIdx.y + blockIdx.x;
    const unsigned offsetInBlock = blockDim.x * threadIdx.y + threadIdx.x;
    return linearBlock * threadsPerBlock + offsetInBlock;
}

2D grid of 3D blocks

// Flattened global thread index for a 2D grid of 3D blocks.
// Blocks are numbered with x varying fastest, then y; threads inside a
// block are linearized with x fastest, then y, then z.
__device__ int getGlobalIdx_2D_3D()
{
    const unsigned threadsPerSlice = blockDim.x * blockDim.y;        // one z-slice of a block
    const unsigned threadsPerBlock = threadsPerSlice * blockDim.z;
    const unsigned linearBlock = gridDim.x * blockIdx.y + blockIdx.x;

    unsigned linearThread = linearBlock * threadsPerBlock;
    linearThread += threadIdx.z * threadsPerSlice;
    linearThread += threadIdx.y * blockDim.x;
    linearThread += threadIdx.x;
    return linearThread;
}

3D grid of 1D blocks

// Flattened global thread index for a 3D grid of 1D blocks.
// Blocks are numbered across the grid with x varying fastest, then y,
// then z.
__device__ int getGlobalIdx_3D_1D()
{
    const unsigned blocksPerSlice = gridDim.x * gridDim.y;  // one z-slice of the grid
    const unsigned linearBlock = blockIdx.z * blocksPerSlice
                               + blockIdx.y * gridDim.x
                               + blockIdx.x;
    return linearBlock * blockDim.x + threadIdx.x;
}

3D grid of 2D blocks

// Flattened global thread index for a 3D grid of 2D blocks.
// Blocks are numbered with x varying fastest, then y, then z; threads
// inside a block are linearized with x fastest, then y.
__device__ int getGlobalIdx_3D_2D()
{
    const unsigned blocksPerSlice = gridDim.x * gridDim.y;  // one z-slice of the grid
    const unsigned threadsPerBlock = blockDim.x * blockDim.y;

    const unsigned linearBlock = blockIdx.z * blocksPerSlice
                               + blockIdx.y * gridDim.x
                               + blockIdx.x;
    const unsigned offsetInBlock = threadIdx.y * blockDim.x + threadIdx.x;

    return linearBlock * threadsPerBlock + offsetInBlock;
}

3D grid of 3D blocks

// Flattened global thread index for a 3D grid of 3D blocks.
// Both the block-within-grid and the thread-within-block coordinates are
// linearized with x varying fastest, then y, then z.
__device__ int getGlobalIdx_3D_3D()
{
    const unsigned blocksPerSlice = gridDim.x * gridDim.y;     // one z-slice of the grid
    const unsigned threadsPerSlice = blockDim.x * blockDim.y;  // one z-slice of a block
    const unsigned threadsPerBlock = threadsPerSlice * blockDim.z;

    const unsigned linearBlock = blockIdx.z * blocksPerSlice
                               + blockIdx.y * gridDim.x
                               + blockIdx.x;
    const unsigned offsetInBlock = threadIdx.z * threadsPerSlice
                                 + threadIdx.y * blockDim.x
                                 + threadIdx.x;

    return linearBlock * threadsPerBlock + offsetInBlock;
}

Warp ID & Lane ID

// Returns the calling thread's lane index within its warp, read from the
// PTX special register %laneid.
__forceinline__ __device__ unsigned lane_id()
{
    unsigned ret;
    // In an inline-PTX template a single '%' introduces an operand
    // reference (such as %0), so the literal '%' of the special register
    // name is escaped as "%%".
    asm volatile ("mov.u32 %0, %%laneid;" : "=r"(ret));
    return ret;
}

// Returns the value of the PTX special register %warpid for the calling
// thread.
//
// NOTE: per the PTX ISA, %warpid is volatile: it reports the warp slot at
// the moment it is read and may change during execution (for example after
// the thread is rescheduled). It is intended for profiling/diagnostics; if a
// stable warp index is required, derive it from threadIdx/blockDim instead.
__forceinline__ __device__ unsigned warp_id()
{
    unsigned ret;
    // In an inline-PTX template a single '%' introduces an operand
    // reference (such as %0), so the literal '%' of the special register
    // name is escaped as "%%".
    asm volatile ("mov.u32 %0, %%warpid;" : "=r"(ret));
    return ret;
}

See also

Favorite site

References


  1. Programming_NVIDIA_CUDA_-_1D_array_index.pdf