Skip to content

OpenCL:Example:ContextWrapper

OpenCL의 Context Wrapper 클래스 구현 예제.

/**
 * @file   OpenCLContext.hpp
 * @brief  OpenCLContext class prototype.
 * @author yourname
 * @date   2018-01-16
 */

#ifndef __INCLUDE_LIBTBAG__LIBTBAG_GPU_OPENCL_OPENCLCONTEXT_HPP__
#define __INCLUDE_LIBTBAG__LIBTBAG_GPU_OPENCL_OPENCLCONTEXT_HPP__

// MS compatible compilers support #pragma once
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
#pragma once
#endif

#include <libtbag/config.h>
#include <libtbag/predef.hpp>
#include <libtbag/gpu/details/GpuDetails.hpp>

// -------------------
NAMESPACE_LIBTBAG_OPEN
// -------------------

namespace gpu    {
namespace opencl {

/** The number of parallel compute cores on the OpenCL device. The minimum value is 1. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_COMPUTE_UNITS = "max_compute_units";
/** Maximum number of work-items in a work-group executing a kernel using the data parallel execution model. @n
 * (Refer to clEnqueueNDRangeKernel). The minimum value is 1. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_WORK_GROUP_SIZE = "max_work_group_size";
/** Maximum dimensions that specify the global and local work-item IDs used by the data parallel execution model. @n
 * (Refer to clEnqueueNDRangeKernel). The minimum value is 3. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = "max_work_item_dimensions";
/** Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel. @n
 * Returns n size_t entries, where n is the value returned by the query for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. @n
 * The minimum value is (1, 1, 1). */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_WORK_ITEM_SIZES = "max_work_item_sizes";

/** The OpenCL device type. Currently supported values are one of or a combination of:
 * - CL_DEVICE_TYPE_CPU
 * - CL_DEVICE_TYPE_GPU
 * - CL_DEVICE_TYPE_ACCELERATOR
 * - CL_DEVICE_TYPE_DEFAULT
 */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_TYPE             = "type";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_TYPE_CPU         = "cpu";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_TYPE_GPU         = "gpu";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_TYPE_ACCELERATOR = "accelerator";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_TYPE_DEFAULT     = "default";

TBAG_API bool isSupport() TBAG_NOEXCEPT;

TBAG_API int             getPlatformCount();
TBAG_API GpuPlatforms    getPlatformList ();
TBAG_API GpuPlatformInfo getPlatformInfo (GpuPlatform const & platform);

TBAG_API int             getDeviceCount(GpuPlatform const & platform);
TBAG_API GpuDevices      getDeviceList (GpuPlatform const & platform);
TBAG_API GpuDeviceInfo   getDeviceInfo (GpuDevice   const & device);

TBAG_API SharedGpuContext createContext(GpuDevice const & device);

/**
 * OpenCLContext class prototype.
 *
 * @author yourname
 * @date   2018-01-13
 */
class TBAG_API OpenCLContext : public GpuContext
{
public:
    struct Kernels
    {
        SharedGpuKernel i;
        SharedGpuKernel u;
        SharedGpuKernel f;
        SharedGpuKernel d;
    };

public:
    OpenCLContext(GpuDevice const & d, GpuId c);
    virtual ~OpenCLContext();

private:
    Err _write(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem,
               std::size_t size, bool blocking = true, GpuEvent * event = nullptr) const;
    Err _read(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem,
              std::size_t size, bool blocking = true, GpuEvent * event = nullptr) const;

public:
    virtual bool isSupport() const TBAG_NOEXCEPT override;
    virtual bool    isHost() const TBAG_NOEXCEPT override;
    virtual bool  isDevice() const TBAG_NOEXCEPT override;
    virtual bool  isStream() const TBAG_NOEXCEPT override;

    virtual Err  createStream(GpuStream & stream) const override;
    virtual Err releaseStream(GpuStream & stream) const override;

    virtual Err  createEvent(GpuStream const & stream, GpuEvent & event) const override;
    virtual Err    syncEvent(GpuEvent  const &  event) const override;
    virtual Err elapsedEvent(GpuEvent        &  event, float * millisec = nullptr) const override;
    virtual Err releaseEvent(GpuEvent        &  event) const override;

    virtual Err  createProgram(std::string const &  source, GpuProgram & program) const override;
    virtual Err   buildProgram(GpuProgram        & program) const override;
    virtual Err releaseProgram(GpuProgram        & program) const override;

    virtual Err  createKernel(GpuProgram const & program, std::string const & kernel_symbol, GpuKernel & kernel) const override;
    virtual Err releaseKernel(GpuKernel        & kernel) const override;

    virtual Err malloc(GpuMemory & mem, std::size_t size) const override;
    virtual Err   free(GpuMemory & mem) const override;

    virtual Err write(GpuStream const & stream, GpuMemory       & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;
    virtual Err  read(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory       & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;

    virtual Err writeAsync(GpuStream const & stream, GpuMemory       & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;
    virtual Err  readAsync(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory       & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;

    virtual Err      copy(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event = nullptr) const override;
    virtual Err copyAsync(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event = nullptr) const override;

    virtual Err  flush(GpuStream const & stream) const override;
    virtual Err finish(GpuStream const & stream) const override;

private:
    Kernels mutable _fill;
public:
    virtual Err fill(GpuStream const & stream, int      * out, int      data, int count, GpuEvent * event) const override;
    virtual Err fill(GpuStream const & stream, unsigned * out, unsigned data, int count, GpuEvent * event) const override;
    virtual Err fill(GpuStream const & stream, float    * out, float    data, int count, GpuEvent * event) const override;
    virtual Err fill(GpuStream const & stream, double   * out, double   data, int count, GpuEvent * event) const override;

private:
    Kernels mutable _add;
public:
    virtual Err add(GpuStream const & stream, int      const * in1, int      const * in2, int      * out, int count, GpuEvent * event) const override;
    virtual Err add(GpuStream const & stream, unsigned const * in1, unsigned const * in2, unsigned * out, int count, GpuEvent * event) const override;
    virtual Err add(GpuStream const & stream, float    const * in1, float    const * in2, float    * out, int count, GpuEvent * event) const override;
    virtual Err add(GpuStream const & stream, double   const * in1, double   const * in2, double   * out, int count, GpuEvent * event) const override;
};

} // namespace opencl
} // namespace gpu

// --------------------
NAMESPACE_LIBTBAG_CLOSE
// --------------------

#endif // __INCLUDE_LIBTBAG__LIBTBAG_GPU_OPENCL_OPENCLCONTEXT_HPP__

Source

/**
 * @file   OpenCLContext.cpp
 * @brief  OpenCLContext class implementation.
 * @author yourname
 * @date   2018-01-16
 */

#include <libtbag/gpu/opencl/OpenCLContext.hpp>
#include <libtbag/log/Log.hpp>
#include <libtbag/string/Format.hpp>
#include <libtbag/string/Environments.hpp>

#include <cassert>
#include <vector>
#include <string>

#if defined(USE_OPENCL)
# if defined(TBAG_PLATFORM_MACOS)
#  include <OpenCL/cl.h>
# else
#  include <CL/cl.h>
# endif
#else
# include <libtbag/dummy/Cl.hpp>
using namespace libtbag::dummy::cl;
#endif

#define TBAG_OPENCL_BACKEND_PROFILE

// -------------------
NAMESPACE_LIBTBAG_OPEN
// -------------------

namespace gpu    {
namespace opencl {

// ---------------
namespace __impl {
// ---------------

//TBAG_CONSTEXPR static char const * const OPENCL_DEFAULT_BUILD_OPTION = "-x clc++"; // AMD ONLY.
TBAG_CONSTEXPR   static char const * const OPENCL_DEFAULT_BUILD_OPTION = "";

TBAG_CONSTEXPR static bool isOpenCLBackendProfile() TBAG_NOEXCEPT
{
#if defined(TBAG_OPENCL_BACKEND_PROFILE)
    return true;
#else
    return false;
#endif
}

template <typename T>
static std::string getOpenCLSource(std::string const & source)
{
    return string::Environments(string::fformat("type={}", GpuMemoryTypeSuffix<T>::getPrefix())).convert(source);
}

template <typename T>
static std::string getOpenCLSymbol(std::string const & name)
{
    return string::fformat("{}_{}", name, GpuMemoryTypeSuffix<T>::getPrefix());
}

template <typename T>
static Err testKernelOrInit(GpuContext const & context, SharedGpuKernel & kernel,
                            std::string const & source_template, std::string const & symbol_name)
{
    if (static_cast<bool>(kernel)) {
        return Err::E_SUCCESS;
    }

    SharedGpuProgram program(GpuProgram::newInstance(context, getOpenCLSource<T>(source_template)));
    Err const BUILD_RESULT = program->build();
    if (isFailure(BUILD_RESULT)) {
        return BUILD_RESULT;
    }

    try {
        kernel.reset(GpuKernel::newInstance(*program, getOpenCLSymbol<T>(symbol_name)));
    } catch (std::bad_alloc & e) {
        return Err::E_BADALLOC;
    }
    return Err::E_SUCCESS;
}

static std::string getBuildLog(cl_program program, cl_device_id device)
{
    std::size_t log_buffer_size = 0;
    std::size_t opt_buffer_size = 0;
    cl_int log_code = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG    , 0, nullptr, &log_buffer_size);
    cl_int opt_code = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_OPTIONS, 0, nullptr, &opt_buffer_size);

    if (log_code != CL_SUCCESS || opt_code != CL_SUCCESS) {
        tDLogW("printBuildLog() OpenCL clGetProgramBuildInfo(nullptr) error: log({}), options({})",
               log_code, opt_code);
        return std::string();
    }

    std::vector<char> log_buffer(log_buffer_size, 0);
    std::vector<char> opt_buffer(opt_buffer_size, 0);

    log_code = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG    , log_buffer.size(), log_buffer.data(), nullptr);
    opt_code = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_OPTIONS, opt_buffer.size(), opt_buffer.data(), nullptr);

    if (log_code != CL_SUCCESS || opt_code != CL_SUCCESS) {
        tDLogW("printBuildLog() OpenCL clGetProgramBuildInfo() error: log({}), options({})",
               log_code, opt_code);
        return std::string();
    }

    return string::fformat("ProgramId({}), DeviceId({}), Options({})\n{}",
                           (std::size_t)program, (std::size_t)device,
                           std::string(opt_buffer.data()), std::string(log_buffer.data()));
}

template <typename KernelType>
static bool setKernelMemories(KernelType kernel, std::vector<container::AnyPod> const & mems)
{
    if (mems.empty()) {
        return false;
    }

    using TypeTable = type::TypeTable;

    cl_uint const MEMS_SIZE = (cl_uint)mems.size();
    cl_int code;

    for (cl_uint i = 0; i < MEMS_SIZE; ++i) {
        // @formatter:off
        switch (mems[i].type()) {
#define _TBAG_XX(n, s, t) case TypeTable::TT_##n: code = clSetKernelArg(kernel, i, mems[i].size(), &(mems[i].data().s)); break;
        TBAG_TYPE_TABLE_MAP(_TBAG_XX)
#undef _TBAG_XX
        case TypeTable::TT_UNKNOWN:
            code = clSetKernelArg(kernel, i, sizeof(cl_mem), (void const *)&(mems[i].data().vp));
            break;
        default:
            code = CL_INVALID_VALUE;
            break;
        }
        // @formatter:on

        if (code != CL_SUCCESS) {
            tDLogE("setKernelMemories() OpenCL clSetKernelArg() index({}) error code: {}", (unsigned)i, code);
            return false;
        }
    }
    return true;
}

template <typename KernelType, typename ... Args>
static bool setKernelArguments(KernelType kernel, Args && ... mems)
{
    return setKernelMemories(kernel, {std::forward<Args>(mems) ...});
}

static std::string getDeviceInfoByString(cl_device_id id, cl_device_info info)
{
    size_t value_size = 0;
    if (clGetDeviceInfo(id, info, 0, nullptr, &value_size) == CL_SUCCESS) {
        std::vector<char> buffer(value_size, '\0');
        if (clGetDeviceInfo(id, info, buffer.size(), buffer.data(), nullptr) == CL_SUCCESS) {
            return std::string(buffer.data());
        }
    }
    return std::string();
}

template <typename NativeType, typename ResultType>
static ResultType getDeviceInfo(cl_device_id id, cl_device_info info, ResultType default_value = ResultType())
{
    NativeType value;
    if (clGetDeviceInfo(id, info, sizeof(NativeType), &value, nullptr) == CL_SUCCESS) {
        return static_cast<ResultType>(value);
    }
    return default_value;
}

static std::size_t getMaxPotentialBlockSize(cl_kernel kernel, cl_device_id device)
{
    size_t group_size = 0;
    cl_int code = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                                           sizeof(size_t), &group_size, nullptr);
    if (code != CL_SUCCESS) {
        group_size = __impl::getDeviceInfo<size_t, size_t>(device, CL_DEVICE_MAX_WORK_GROUP_SIZE);
    }
    return group_size;
}

// ------------------
} // namespace __impl
// ------------------

bool isSupport() TBAG_NOEXCEPT
{
#if defined(USE_OPENCL)
    return true;
#else
    return false;
#endif
}

int getPlatformCount()
{
    int result = 0;
    cl_uint num_platforms;
    cl_int code = clGetPlatformIDs(0, nullptr, &num_platforms);
    if (code != CL_SUCCESS) {
        tDLogE("getPlatformCount() OpenCL clGetPlatformIDs() error code: {}", code);
        return 0;
    }
    result = static_cast<int>(num_platforms);
    return result;
}

GpuPlatforms getPlatformList()
{
    GpuPlatforms result;
    std::vector<cl_platform_id> platforms((std::size_t)getPlatformCount());
    cl_int code = clGetPlatformIDs((cl_uint)platforms.size(), platforms.data(), nullptr);
    if (code != CL_SUCCESS) {
        tDLogE("getPlatformList() OpenCL clGetPlatformIDs() error code: {}", code);
        return result;
    }
    for (auto & id : platforms) {
        result.emplace_back(GpuType::GT_OPENCL, (std::size_t)id);
    }
    return result;
}

GpuPlatformInfo getPlatformInfo(GpuPlatform const & platform)
{
    checkOpenCLGpuType(platform);
    GpuPlatformInfo info;
    auto get_platform_info = [](cl_platform_id id, cl_platform_info info) -> std::string {
        size_t value_size = 0;
        if (clGetPlatformInfo(id, info, 0, nullptr, &value_size) == CL_SUCCESS) {
            std::vector<char> buffer(value_size, '\0');
            if (clGetPlatformInfo(id, info, buffer.size(), buffer.data(), nullptr) == CL_SUCCESS) {
                return std::string(buffer.data());
            }
        }
        return std::string();
    };

    info.profile    = get_platform_info((cl_platform_id)platform.PLATFORM_ID, CL_PLATFORM_PROFILE);
    info.version    = get_platform_info((cl_platform_id)platform.PLATFORM_ID, CL_PLATFORM_VERSION);
    info.name       = get_platform_info((cl_platform_id)platform.PLATFORM_ID, CL_PLATFORM_NAME);
    info.vendor     = get_platform_info((cl_platform_id)platform.PLATFORM_ID, CL_PLATFORM_VENDOR);
    info.extensions = get_platform_info((cl_platform_id)platform.PLATFORM_ID, CL_PLATFORM_EXTENSIONS);
    return info;
}

int getDeviceCount(GpuPlatform const & platform)
{
    checkOpenCLGpuType(platform);
    int result = 0;
    cl_uint num_devices;
    cl_int code = clGetDeviceIDs((cl_platform_id)platform.PLATFORM_ID, CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices);
    if (code != CL_SUCCESS) {
        tDLogE("getDeviceCount() OpenCL clGetDeviceIDs() error code: {}", code);
        return 0;
    }
    result = static_cast<int>(num_devices);
    return result;
}

GpuDevices getDeviceList(GpuPlatform const & platform)
{
    checkOpenCLGpuType(platform);
    GpuDevices result;
    std::vector<cl_device_id> devices((std::size_t)opencl::getDeviceCount(platform));
    cl_int code = clGetDeviceIDs((cl_platform_id)platform.PLATFORM_ID, CL_DEVICE_TYPE_ALL,
                                 (cl_uint)devices.size(), devices.data(), nullptr);
    if (code != CL_SUCCESS) {
        tDLogE("getDeviceList() OpenCL clGetDeviceIDs() error code: {}", code);
        return result;
    }
    for (auto & id : devices) {
        result.emplace_back(platform, (std::size_t)id);
    }
    return result;
}

GpuDeviceInfo getDeviceInfo(GpuDevice const & device)
{
    checkOpenCLGpuType(device);
    GpuDeviceInfo info;

    cl_device_id const DEVICE_ID = (cl_device_id)device.DEVICE_ID;
    info.name           = __impl::getDeviceInfoByString(DEVICE_ID, CL_DEVICE_NAME);
    info.device_version = __impl::getDeviceInfoByString(DEVICE_ID, CL_DEVICE_VERSION);
    info.driver_version = __impl::getDeviceInfoByString(DEVICE_ID, CL_DRIVER_VERSION);
    info.global_memory  = __impl::getDeviceInfo<cl_ulong, std::size_t>(DEVICE_ID, CL_DEVICE_GLOBAL_MEM_SIZE);

    auto max_compute_units        = __impl::getDeviceInfo<cl_uint, unsigned>(DEVICE_ID, CL_DEVICE_MAX_COMPUTE_UNITS);
    auto max_work_group_size      = __impl::getDeviceInfo< size_t, size_t>(DEVICE_ID, CL_DEVICE_MAX_WORK_GROUP_SIZE);
    auto max_work_item_dimensions = __impl::getDeviceInfo<cl_uint, unsigned>(DEVICE_ID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);

    info.insert(TBAG_GPU_DEVICE_INFO_MAX_COMPUTE_UNITS       , max_compute_units);
    info.insert(TBAG_GPU_DEVICE_INFO_MAX_WORK_GROUP_SIZE     , max_work_group_size);
    info.insert(TBAG_GPU_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS, max_work_item_dimensions);

    std::vector<size_t> max_work_item_sizes(max_work_item_dimensions);
    if (clGetDeviceInfo(DEVICE_ID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                        sizeof(size_t) * max_work_item_dimensions,
                        max_work_item_sizes.data(), nullptr) == CL_SUCCESS) {
        for (unsigned i = 0; i < max_work_item_dimensions; ++i) {
            std::string const key = std::string(TBAG_GPU_DEVICE_INFO_MAX_WORK_ITEM_SIZES) + "_" + std::to_string(i);
            info.insert(key, std::to_string(max_work_item_sizes[i]));
        }
    }

    auto type = __impl::getDeviceInfo<cl_device_type, cl_device_type>(DEVICE_ID, CL_DEVICE_TYPE);
    if (type == CL_DEVICE_TYPE_CPU) {
        info.insert(TBAG_GPU_DEVICE_TYPE, TBAG_GPU_DEVICE_TYPE_CPU);
    } else if (type == CL_DEVICE_TYPE_GPU) {
        info.insert(TBAG_GPU_DEVICE_TYPE, TBAG_GPU_DEVICE_TYPE_GPU);
    } else if (type == CL_DEVICE_TYPE_ACCELERATOR) {
        info.insert(TBAG_GPU_DEVICE_TYPE, TBAG_GPU_DEVICE_TYPE_ACCELERATOR);
    } else {
        assert(type == CL_DEVICE_TYPE_DEFAULT);
        info.insert(TBAG_GPU_DEVICE_TYPE, TBAG_GPU_DEVICE_TYPE_DEFAULT);
    }

    return info;
}

SharedGpuContext createContext(GpuDevice const & device)
{
    checkOpenCLGpuType(device);
    cl_int code;
    cl_context context = clCreateContext(nullptr, 1, (cl_device_id const *)&device.DEVICE_ID, nullptr, nullptr, &code);
    if (code == CL_SUCCESS) {
        return SharedGpuContext(new OpenCLContext(device, (GpuId)context), [](OpenCLContext * context){
            cl_int code = clReleaseContext((cl_context)context->CONTEXT_ID);
            if (code != CL_SUCCESS) {
                tDLogE("createContext() OpenCL clReleaseContext() error code: {}", code);
            }
        });
    } else {
        tDLogE("createContext() OpenCL clCreateContext() error code: {}", code);
    }
    return SharedGpuContext();
}

// -----------------------------
// OpenCLContext implementation.
// -----------------------------

OpenCLContext::OpenCLContext(GpuDevice const & d, GpuId c) : GpuContext(d, c)
{
    // EMPTY.
}

OpenCLContext::~OpenCLContext()
{
    // EMPTY.
}

Err OpenCLContext::_write(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem,
                          std::size_t size, bool blocking, GpuEvent * event) const
{
    cl_int code = clEnqueueWriteBuffer(stream.castId<cl_command_queue>(), gpu_mem.cast<cl_mem>(),
                                       (blocking ? CL_TRUE : CL_FALSE),
                                       0, host_mem.size(), host_mem.data(), 0, nullptr,
                                       (cl_event*)(event == nullptr ? nullptr : &event->atId()));
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::_write({}) OpenCL clEnqueueWriteBuffer() error code: {}",
               (blocking ? "BLOCKING" : "NON-BLOCKING"), code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::_read(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem,
                         std::size_t size, bool blocking, GpuEvent * event) const
{
    cl_int code = clEnqueueReadBuffer(stream.castId<cl_command_queue>(), gpu_mem.cast<cl_mem>(),
                                      (blocking ? CL_TRUE : CL_FALSE),
                                      0, host_mem.size(), host_mem.data(), 0, nullptr,
                                      (cl_event*)(event == nullptr ? nullptr : &event->atId()));
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::_read({}) OpenCL clEnqueueReadBuffer() error code: {}",
               (blocking ? "BLOCKING" : "NON-BLOCKING"), code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

bool OpenCLContext::isSupport() const TBAG_NOEXCEPT
{
    return opencl::isSupport();
}

bool OpenCLContext::isHost() const TBAG_NOEXCEPT
{
    return !isDevice();
}

bool OpenCLContext::isDevice() const TBAG_NOEXCEPT
{
    return __impl::getDeviceInfo<cl_device_type, cl_device_type>((cl_device_id)getDeviceId(),
                                                                 CL_DEVICE_TYPE, CL_DEVICE_TYPE_DEFAULT) == CL_DEVICE_TYPE_GPU;
}

bool OpenCLContext::isStream() const TBAG_NOEXCEPT
{
    return true;
}

Err OpenCLContext::createStream(GpuStream & stream) const
{
    if (stream.isSameContext(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code;
    cl_command_queue_properties properties = 0;
    if (__impl::isOpenCLBackendProfile()) {
        properties |= CL_QUEUE_PROFILING_ENABLE;
    }
    cl_command_queue native_stream = clCreateCommandQueue((cl_context)CONTEXT_ID,
                                                          (cl_device_id)DEVICE_ID,
                                                          properties, &code);
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::createStream() OpenCL clCreateCommandQueue() error code: {}", code);
        return Err::E_OPENCL;
    }

    stream.setId(native_stream);
    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::createStream({})", getContextId(), stream.getId());
    return Err::E_SUCCESS;
}

Err OpenCLContext::releaseStream(GpuStream & stream) const
{
    if (stream.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::releaseStream({})", getContextId(), stream.getId());
    cl_int code = clReleaseCommandQueue(stream.castId<cl_command_queue>());
    stream.clearId();
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::releaseStream() OpenCL clReleaseCommandQueue() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::createEvent(GpuStream const & stream, GpuEvent & event) const
{
    if (stream.validate(*this) == false || event.isSameContext(*this) == false) {
        return Err::E_ILLARGS;
    }
    event.setStart(0);
    event.setStop(0);
    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::createEvent(s:{}, e:{})",
             getContextId(), stream.getId(), event.getId());
    return Err::E_SUCCESS;
}

Err OpenCLContext::syncEvent(GpuEvent const & event) const
{
    if (event.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code = clWaitForEvents(1, (cl_event const *)&event.atId());
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::syncEvent() OpenCL clWaitForEvents() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::elapsedEvent(GpuEvent & event, float * millisec) const
{
    if (event.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_ulong start_nano, stop_nano;
    cl_int start_code = clGetEventProfilingInfo(event.castId<cl_event>(), CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_nano, nullptr);
    cl_int  stop_code = clGetEventProfilingInfo(event.castId<cl_event>(), CL_PROFILING_COMMAND_END  , sizeof(cl_ulong),  &stop_nano, nullptr);

    if (start_code != CL_SUCCESS || stop_code != CL_SUCCESS) {
        tDLogE("OpenCLContext::elapsedEvent() OpenCL clGetEventProfilingInfo() error code: start({}), stop({})",
               start_code, stop_code);
        return Err::E_OPENCL;
    }

    if (millisec != nullptr) {
        *millisec = (stop_nano - start_nano) * 1.0e-6f;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::releaseEvent(GpuEvent & event) const
{
    if (event.validate(*this) == false) {
        return Err::E_ILLARGS;
    }
    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::releaseEvent({})", getContextId(), event.getId());
    event.clearIds();
    return Err::E_SUCCESS;
}

Err OpenCLContext::createProgram(std::string const & source, GpuProgram & program) const
{
    if (source.empty() || program.isSameContext(*this) == false) {
        return Err::E_ILLARGS;
    }

    char const * c_source = source.c_str();
    cl_int code;
    cl_program native_program = clCreateProgramWithSource((cl_context)CONTEXT_ID, 1, &c_source, nullptr, &code);
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::createProgram() OpenCL clCreateProgramWithSource() error code: {}", code);
        return Err::E_OPENCL;
    }

    program.setId(native_program);
    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::createProgram({})", getContextId(), program.getId());
    return Err::E_SUCCESS;
}

Err OpenCLContext::buildProgram(GpuProgram & program) const
{
    if (program.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code = clBuildProgram(program.castId<cl_program>(), 1, (cl_device_id const *)&DEVICE_ID,
                                 __impl::OPENCL_DEFAULT_BUILD_OPTION, nullptr, nullptr);
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::buildProgram() OpenCL clBuildProgram() error code: {}", code);
        tDLogW("Build Log: {}", __impl::getBuildLog(program.castId<cl_program>(), castDeviceId<cl_device_id>()));
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::releaseProgram(GpuProgram & program) const
{
    if (program.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::releaseProgram({})", getContextId(), program.getId());
    cl_int code = clReleaseProgram(program.castId<cl_program>());
    program.clearId();
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::releaseProgram() OpenCL clReleaseProgram() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::createKernel(GpuProgram const & program, std::string const & kernel_symbol, GpuKernel & kernel) const
{
    if (program.validate(*this) == false || kernel_symbol.empty() || kernel.isSameContext(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code;
    cl_kernel native_kernel = clCreateKernel(program.castId<cl_program>(), kernel_symbol.c_str(), &code);
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::createKernel() OpenCL clCreateKernel() error code: {}", code);
        return Err::E_OPENCL;
    }

    kernel.setId(native_kernel);
    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::createKernel(p:{}, k:{}) Symbol: {}",
             getContextId(), program.getId(), kernel.getId(), kernel_symbol);
    return Err::E_SUCCESS;
}

Err OpenCLContext::releaseKernel(GpuKernel & kernel) const
{
    if (kernel.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::releaseKernel({})", getContextId(), kernel.getId());
    cl_int code = clReleaseKernel(kernel.castId<cl_kernel>());
    kernel.clearId();
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::releaseKernel() OpenCL clReleaseKernel() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::malloc(GpuMemory & mem, std::size_t size) const
{
    if (mem.isSameContext(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code;
    cl_mem native_memory = clCreateBuffer((cl_context)CONTEXT_ID, CL_MEM_READ_WRITE, size, nullptr, &code);
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::malloc({}) OpenCL clCreateBuffer() error code: {}", size, code);
        return Err::E_OPENCL;
    }

    mem.set((void*)native_memory, size, size);
    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::malloc({}) OpenCL clCreateBuffer() MEM:{} CAP:{} SIZE:{}",
             getContextId(), size, mem.data(), mem.capacity(), mem.size());
    return Err::E_SUCCESS;
}

Err OpenCLContext::free(GpuMemory & mem) const
{
    if (mem.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    tDLogIfD(isGpuVerbose(), "OpenCLContext({})::free() OpenCL clReleaseMemObject() MEM:{} CAP:{} SIZE:{}",
             getContextId(), mem.data(), mem.capacity(), mem.size());

    cl_int code = clReleaseMemObject(mem.cast<cl_mem>());
    mem.clear();
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::free() OpenCL clReleaseMemObject() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::write(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event) const
{
    if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
        return Err::E_ILLARGS;
    }
    return _write(stream, gpu_mem, host_mem, size, true, event);
}

Err OpenCLContext::read(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem, std::size_t size, GpuEvent * event) const
{
    if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
        return Err::E_ILLARGS;
    }
    return _read(stream, gpu_mem, host_mem, size, true, event);
}

Err OpenCLContext::writeAsync(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event) const
{
    if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
        return Err::E_ILLARGS;
    }
    return _write(stream, gpu_mem, host_mem, size, false, event);
}

Err OpenCLContext::readAsync(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem, std::size_t size, GpuEvent * event) const
{
    if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
        return Err::E_ILLARGS;
    }
    return _read(stream, gpu_mem, host_mem, size, false, event);
}

Err OpenCLContext::copy(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event) const
{
    if (validateMemory(stream, src, dest, size) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code = clEnqueueCopyBuffer(stream.castId<cl_command_queue>(), src.cast<cl_mem>(), dest.cast<cl_mem>(),
                                      0, 0, size, 0, nullptr, (cl_event*)(event == nullptr ? nullptr : &event->atId()));
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::copy() OpenCL clEnqueueCopyBuffer() error code: {}", code);
        return Err::E_OPENCL;
    }

    cl_int finish_code = clFinish(stream.castId<cl_command_queue>());
    if (finish_code != CL_SUCCESS) {
        tDLogE("OpenCLContext::copy() OpenCL clFinish() error code: {}", finish_code);
        return Err::E_OPENCL;
    }

    return Err::E_SUCCESS;
}

Err OpenCLContext::copyAsync(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event) const
{
    if (validateMemory(stream, src, dest, size) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code = clEnqueueCopyBuffer(stream.castId<cl_command_queue>(), src.cast<cl_mem>(), dest.cast<cl_mem>(),
                                      0, 0, size, 0, nullptr, (cl_event*)(event == nullptr ? nullptr : &event->atId()));
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::copyAsync() OpenCL clEnqueueCopyBuffer() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::flush(GpuStream const & stream) const
{
    if (stream.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code = clFlush(stream.castId<cl_command_queue>());
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::flush() OpenCL clFlush() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

Err OpenCLContext::finish(GpuStream const & stream) const
{
    if (stream.validate(*this) == false) {
        return Err::E_ILLARGS;
    }

    cl_int code = clFinish(stream.castId<cl_command_queue>());
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::finish() OpenCL clFinish() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

TBAG_CONSTEXPR static char const * const _TBAG_OPENCL_SOURCE_FILL = R"(
__kernel void fill_${type}(__global ${type} * out, ${type} data)
{
    uint w = get_global_size(0);
    uint y = get_global_id(1);
    uint x = get_global_id(0);
    uint i = y * w + x;
    out[i] = data;
}
)";

template <typename T>
static Err fillByOpenCL(GpuContext const & context, GpuStream const & stream, SharedGpuKernel & kernel,
                       T * out, T data, int count, GpuEvent * event)
{
    if (stream.validate(context) == false) {
        return Err::E_ILLARGS;
    }

    Err const INIT_CODE = __impl::testKernelOrInit<T>(context, kernel, _TBAG_OPENCL_SOURCE_FILL, "fill");
    if (isFailure(INIT_CODE)) {
        return INIT_CODE;
    }
    assert(static_cast<bool>(kernel));

    if (__impl::setKernelArguments(kernel->castId<cl_kernel>(), out, data) == false) {
        tDLogE("OpenCLContext::fillByOpenCL() OpenCL argument error.");
        return Err::E_OPENCL;
    }

    std::size_t global_work_size[1] = { static_cast<std::size_t>(count) };
    // local_work_size can also be a NULL value in which case the OpenCL implementation
    // will determine how to be break the global work-items into appropriate work-group instances.
    cl_int code = clEnqueueNDRangeKernel(stream.castId<cl_command_queue>(), kernel->castId<cl_kernel>(),
                                         1, nullptr, global_work_size, nullptr, 0, nullptr,
                                         (cl_event*)(event == nullptr ? nullptr : &event->atId()));
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::fillByOpenCL() OpenCL clEnqueueNDRangeKernel() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

// @formatter:off
Err OpenCLContext::fill(GpuStream const & stream, int * out, int data, int count, GpuEvent * event) const
{ return fillByOpenCL(*this, stream, _fill.i, out, data, count, event); }
Err OpenCLContext::fill(GpuStream const & stream, unsigned * out, unsigned data, int count, GpuEvent * event) const
{ return fillByOpenCL(*this, stream, _fill.u, out, data, count, event); }
Err OpenCLContext::fill(GpuStream const & stream, float * out, float data, int count, GpuEvent * event) const
{ return fillByOpenCL(*this, stream, _fill.f, out, data, count, event); }
Err OpenCLContext::fill(GpuStream const & stream, double * out, double data, int count, GpuEvent * event) const
{ return fillByOpenCL(*this, stream, _fill.d, out, data, count, event); }
// @formatter:on

TBAG_CONSTEXPR static char const * const _TBAG_OPENCL_SOURCE_ADD = R"(
__kernel void add_${type}(__global ${type} * in1, __global ${type} * in2, __global ${type} * out)
{
    uint w = get_global_size(0);
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint i = (y * w) + x;
    out[i] = in1[i] + in2[i];
}
)";

template <typename T>
static Err addByOpenCL(GpuContext const & context, GpuStream const & stream, SharedGpuKernel & kernel,
                       T const * in1, T const * in2, T * out, int count, GpuEvent * event)
{
    if (stream.validate(context) == false) {
        return Err::E_ILLARGS;
    }

    Err const INIT_CODE = __impl::testKernelOrInit<T>(context, kernel, _TBAG_OPENCL_SOURCE_ADD, "add");
    if (isFailure(INIT_CODE)) {
        return INIT_CODE;
    }
    assert(static_cast<bool>(kernel));

    if (__impl::setKernelArguments(kernel->castId<cl_kernel>(), in1, in2, out) == false) {
        tDLogE("OpenCLContext::fillByOpenCL() OpenCL argument error.");
        return Err::E_OPENCL;
    }

    std::size_t global_work_size[1] = { static_cast<std::size_t>(count) };
    // local_work_size can also be a NULL value in which case the OpenCL implementation
    // will determine how to be break the global work-items into appropriate work-group instances.
    cl_int code = clEnqueueNDRangeKernel(stream.castId<cl_command_queue>(), kernel->castId<cl_kernel>(),
                                         1, nullptr, global_work_size, nullptr, 0, nullptr,
                                         (cl_event*)(event == nullptr ? nullptr : &event->atId()));
    if (code != CL_SUCCESS) {
        tDLogE("OpenCLContext::addByOpenCL() OpenCL clEnqueueNDRangeKernel() error code: {}", code);
        return Err::E_OPENCL;
    }
    return Err::E_SUCCESS;
}

// @formatter:off
Err OpenCLContext::add(GpuStream const & stream, int const * in1, int const * in2, int * out, int count, GpuEvent * event) const
{ return addByOpenCL(*this, stream, _add.i, in1, in2, out, count, event); }
Err OpenCLContext::add(GpuStream const & stream, unsigned const * in1, unsigned const * in2, unsigned * out, int count, GpuEvent * event) const
{ return addByOpenCL(*this, stream, _add.u, in1, in2, out, count, event); }
Err OpenCLContext::add(GpuStream const & stream, float const * in1, float const * in2, float * out, int count, GpuEvent * event) const
{ return addByOpenCL(*this, stream, _add.f, in1, in2, out, count, event); }
Err OpenCLContext::add(GpuStream const & stream, double const * in1, double const * in2, double * out, int count, GpuEvent * event) const
{ return addByOpenCL(*this, stream, _add.d, in1, in2, out, count, event); }
// @formatter:on

} // namespace opencl
} // namespace gpu

// --------------------
NAMESPACE_LIBTBAG_CLOSE
// --------------------

See also