CUDA:Example

Simple example

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/**
 * Terminates the program if a CUDA runtime call failed.
 *
 * Does nothing when err is cudaSuccess. Otherwise prints the runtime's
 * description of the error together with the call site to stderr and exits
 * with EXIT_FAILURE.
 *
 * @param err   Status code returned by a CUDA runtime API call.
 * @param file  Source file of the call (HANDLE_ERROR passes __FILE__).
 * @param line  Source line of the call (HANDLE_ERROR passes __LINE__).
 */
static void HandleError(cudaError_t err, const char * file, int line)
{
    if (err != cudaSuccess) {
        // Report on stderr so the diagnostic is not mixed into (or lost
        // with) regular program output when stdout is redirected.
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

// Wraps a CUDA runtime call and reports a failure with the location of the
// wrapped call. The argument is parenthesized so any expression is passed
// to HandleError() as a single argument.
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))

// Number of int elements in each test vector. checkCuda() also uses it as
// the number of blocks in the kernel launch, and __add__ uses it as the
// upper bound of the indices it processes.
static int const TEST_BLOCK_SIZE = 10;

/**
 * Element-wise vector addition: c[i] = a[i] + b[i].
 *
 * Expects a 1D launch. Each thread handles at most one element, selected by
 * its flat global index; threads whose index is TEST_BLOCK_SIZE or larger do
 * nothing. Any 1D grid/block shape that supplies at least TEST_BLOCK_SIZE
 * threads therefore computes the full result.
 *
 * NOTE: identifiers containing a double underscore are reserved in C++; the
 * name is kept unchanged so existing callers keep working.
 *
 * @param a  Device pointer to the first input vector.
 * @param b  Device pointer to the second input vector.
 * @param c  Device pointer to the output vector.
 */
__global__ void __add__(int * a, int * b, int * c)
{
    // Flat global index. With one thread per block (blockDim.x == 1,
    // threadIdx.x == 0) this reduces to blockIdx.x, so a <<<N, 1>>> launch
    // behaves exactly as before, while multi-thread blocks no longer make
    // every thread of a block write the same element.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < TEST_BLOCK_SIZE) {
        c[tid] = a[tid] + b[tid];
    }
}

/**
 * Smoke test for the CUDA setup: adds two small vectors on the GPU and
 * verifies the result on the CPU.
 *
 * The inputs are a[i] = i and b[i] = 2 * i, so every output element is
 * expected to be 3 * i.
 *
 * @return true if every element of the result matches, false otherwise.
 *         Any failing CUDA runtime call terminates the process through
 *         HANDLE_ERROR instead of returning.
 */
bool checkCuda()
{
    size_t const bytes = TEST_BLOCK_SIZE * sizeof(int);

    int a[TEST_BLOCK_SIZE], b[TEST_BLOCK_SIZE], c[TEST_BLOCK_SIZE];
    int * dev_a = NULL;
    int * dev_b = NULL;
    int * dev_c = NULL;

    // Fill the input arrays on the CPU.
    for (int i = 0; i < TEST_BLOCK_SIZE; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    // Allocate the device buffers.
    HANDLE_ERROR(cudaMalloc((void**) &dev_a, bytes));
    HANDLE_ERROR(cudaMalloc((void**) &dev_b, bytes));
    HANDLE_ERROR(cudaMalloc((void**) &dev_c, bytes));

    // Copy the inputs to the GPU.
    HANDLE_ERROR(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    // Launch one block per element, one thread per block.
    __add__<<<TEST_BLOCK_SIZE, 1>>>(dev_a, dev_b, dev_c);

    // A kernel launch returns no status itself; an invalid launch
    // configuration is only visible through cudaGetLastError().
    HANDLE_ERROR(cudaGetLastError());

    // Copy the result back. This cudaMemcpy blocks until the kernel has
    // finished, so errors raised while the kernel ran are reported here.
    HANDLE_ERROR(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Compare against the expected a[i] + b[i] == 3 * i.
    bool result = true;
    for (int i = 0; i < TEST_BLOCK_SIZE; i++) {
        if (c[i] != i * 3) {
            result = false;
            break;
        }
    }

    // Release the device buffers.
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));

    return result;
}

Array example

이 예제는 이미지 데이터를 GPU상의 CUDA 배열(cudaArray)로 복사한 뒤, 그 배열을 텍스처에 연결하여 커널에서 읽어들인다.

// NOTE: this fragment uses the texture *reference* API (texture<>,
// cudaBindTextureToArray), which is deprecated and absent from CUDA 12+.
// Return codes are ignored here for brevity.
cudaArray* cu_array;
texture<float, 2> tex;

// Allocate a width x height CUDA array of floats. cudaMallocArray() takes
// the channel descriptor by pointer, so it needs a named object.
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
cudaMallocArray(&cu_array, &channel_desc, width, height);

// Copy the image data into the array. A cudaArray is an opaque layout and
// cannot be the target of a plain cudaMemcpy(); sizes are given in bytes.
// Assumes `image` is a tightly packed host buffer of width * height floats
// -- TODO confirm against the code that produces it.
cudaMemcpy2DToArray(cu_array, 0, 0, image,
                    width * sizeof(float),   // source pitch in bytes
                    width * sizeof(float),   // bytes per row to copy
                    height, cudaMemcpyHostToDevice);

// Bind the array to the texture reference.
cudaBindTextureToArray(tex, cu_array);

// Launch the kernel with 16x16 threads per block.
// The integer division covers the whole image only when width and height
// are multiples of 16; rounding the grid up is only safe if the kernel
// bounds-checks x and y.
dim3 block(16, 16, 1);
dim3 grid(width / block.x, height / block.y, 1);
// The kernel's parameter order is (odata, height, width).
kernel<<< grid, block, 0 >>>(d_odata, height, width);
cudaUnbindTexture(tex);

/**
 * Copies a 2D float texture into a linear, row-major output buffer.
 *
 * Reads from the file-scope texture reference `tex`. Expects a 2D launch;
 * each thread handles the texel at its global (x, y) position and threads
 * outside the width x height area do nothing.
 *
 * @param odata   Output buffer; the texel at (x, y) is written to
 *                odata[y * width + x].
 * @param height  Number of rows to copy.
 * @param width   Number of columns to copy; also the row stride of odata.
 */
__global__ void kernel(float* odata, int height, int width)
{
   unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
   unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

   // Guard the grid tail (and non-positive sizes) so a grid that is larger
   // than the image never writes outside odata.
   if (width <= 0 || height <= 0 ||
       x >= static_cast<unsigned int>(width) ||
       y >= static_cast<unsigned int>(height)) {
      return;
   }

   // tex2D() is the fetch function for a 2D texture reference. The +0.5f
   // offset addresses the centre of texel (x, y).
   float c = tex2D(tex, x + 0.5f, y + 0.5f);

   // Index in size_t so y * width cannot wrap for large images.
   odata[static_cast<size_t>(y) * width + x] = c;
}

CUDA Error

/**
 * Prints the most recent CUDA runtime error to stderr.
 *
 * The line has the form "<prefix> [<code>/<name>] <description>", where
 * <prefix> is `message`, or "CUDA" when `message` is NULL.
 *
 * The error is obtained with cudaGetLastError(), which also resets the
 * runtime's last-error state, so a second call reports the state after
 * that reset.
 *
 * @param message  Optional prefix for the output line; may be NULL.
 */
void printCudaError(char const * message)
{
    cudaError_t const err = cudaGetLastError();

    // Both original output variants differ only in the leading word, so one
    // literal format string covers them. A literal also lets the compiler
    // check the arguments and removes the need for std::string.
    char const * const prefix = (message == NULL) ? "CUDA" : message;

    fprintf(stderr, "%s [%d/%s] %s\n",
            prefix,
            static_cast<int>(err),
            cudaGetErrorName(err),
            cudaGetErrorString(err));
}

CUDA Library

CUDA를 사용하여 라이브러리 작성 후 GCC를 사용하여 LINK하는 방법을 정리한다.

우선 아래와 같이 세 개의 파일을 작성한다. 주의할 점은 GCC로 컴파일되는 코드(main.cpp)에서 __device__ 또는 __global__ 함수를 직접 호출하면 안 된다는 것이다.

// gpu.cu
#include <cuda.h>
#include <stdio.h>
/**
 * Prints the status of the most recent CUDA runtime error to stdout.
 *
 * Writes "No error." when the runtime reports cudaSuccess, otherwise the
 * runtime's description of the error. Declared extern "C" so it can be
 * called by its unmangled name.
 */
extern "C" void checkErroreCuda()
{
    cudaError_t const status = cudaGetLastError();

    // Success path first; everything below handles a real error.
    if (status == cudaSuccess) {
        printf("No error.\n");
        return;
    }

    printf("%s\n", cudaGetErrorString(status));
}

// gpu.h
#ifndef GPU_H
#define GPU_H

/* C linkage is requested only when compiled as C++, so the header can be
 * included from both C and C++ translation units. */
#ifdef __cplusplus
extern "C" {
#endif

/* Prints the status of the most recent CUDA runtime error to stdout. */
void checkErroreCuda(void);

#ifdef __cplusplus
}
#endif

#endif /* GPU_H */

// main.cpp
#include "gpu.h"
// Entry point of the host-only program: it is compiled without nvcc and
// reaches the CUDA code only through the C-linkage function from gpu.h.
int main()
{
    // Report the current CUDA error status.
    checkErroreCuda();
    return 0;
}

아래와 같이 컴파일하면 된다.

# Build the CUDA code into a static library, using g++ as the host compiler.
nvcc --cudart static -lib -ccbin g++ -o libgpu.a gpu.cu
# Compile the host-only program with plain g++.
g++ -c main.cpp
# Link against the static CUDA runtime. libcudart_static itself depends on
# libdl, librt and libpthread, so they must follow it on the command line.
g++ main.o libgpu.a -L/usr/local/cuda/lib64 -lcuda -lcudart_static -ldl -lrt -lpthread

Device Query

CUDA 샘플 프로젝트 중 deviceQuery라는 프로젝트가 존재한다. 빌드 후 실행하면 디바이스에 대한 정보를 확인할 수 있다.

CUDA 장치 정보를 확인하는 코드는 아래와 같다. (Caffe 프로젝트의 CMake 파일에 사용되었다.)

#include <cstdio>

// Prints the compute capability ("major.minor") of every CUDA device whose
// properties can be queried, separated by spaces.
//
// Returns -1 when the device count cannot be obtained or no device is
// present, and 0 otherwise.
int main()
{
  int count = 0;

  // A failing query and an empty device list both mean there is nothing
  // to report.
  if (cudaSuccess != cudaGetDeviceCount(&count) || count == 0) {
    return -1;
  }

  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaSuccess == cudaGetDeviceProperties(&prop, device)) {
      // The trailing space separates the entries; without it two devices
      // would print as e.g. "6.17.5" instead of "6.1 7.5 ".
      std::printf("%d.%d ", prop.major, prop.minor);
    }
  }
  return 0;
}