CUDA:Example:ContextWrapper
An example implementation of a Context Wrapper class for CUDA.
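A minimal usage sketch (illustrative only; it assumes that NAMESPACE_LIBTBAG_OPEN opens namespace libtbag, that the platform/device containers returned below are range-iterable, and that GpuDeviceInfo::name and device_version are printable strings):

#include <libtbag/gpu/cuda/CudaContext.hpp>
#include <iostream>

void printCudaDevices()
{
    using namespace libtbag::gpu::cuda;
    if (isSupport() == false) {
        std::cout << "Built without USE_CUDA support." << std::endl;
        return;
    }
    for (auto const & platform : getPlatformList()) {
        for (auto const & device : getDeviceList(platform)) {
            auto const info = getDeviceInfo(device);
            std::cout << info.name << " (Compute " << info.device_version << ")" << std::endl;
            // Create a context for further work: streams, events, device/host memory, ...
            auto context = createContext(device);
        }
    }
}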
Header
/**
* @file CudaContext.hpp
* @brief CudaContext class prototype.
* @author username
* @date 2018-01-16
*/
#ifndef __INCLUDE_LIBTBAG__LIBTBAG_GPU_CUDA_CUDACONTEXT_HPP__
#define __INCLUDE_LIBTBAG__LIBTBAG_GPU_CUDA_CUDACONTEXT_HPP__
// MS compatible compilers support #pragma once
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
#pragma once
#endif
#include <libtbag/config.h>
#include <libtbag/predef.hpp>
#include <libtbag/gpu/details/GpuDetails.hpp>
// -------------------
NAMESPACE_LIBTBAG_OPEN
// -------------------
namespace gpu {
namespace cuda {
/** is the maximum amount of shared memory available to a thread block in bytes; @n
* this amount is shared by all thread blocks simultaneously resident on a multiprocessor; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_SHARED_MEM_PER_BLOCK = "sharedMemPerBlock";
/** is the maximum number of 32-bit registers available to a thread block; @n
* this number is shared by all thread blocks simultaneously resident on a multiprocessor; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_REGS_PER_BLOCK = "regsPerBlock";
/** is the warp size in threads; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_WARP_SIZE = "warpSize";
/** is the maximum pitch in bytes allowed by the memory copy functions @n
* that involve memory regions allocated through cudaMallocPitch(); */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MEM_PITCH = "memPitch";
/** is the maximum number of threads per block; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_THREADS_PER_BLOCK = "maxThreadsPerBlock";
/** contains the maximum size of each dimension of a block; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_THREADS_DIM_0 = "maxThreadsDim0";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_THREADS_DIM_1 = "maxThreadsDim1";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_THREADS_DIM_2 = "maxThreadsDim2";
/** contains the maximum size of each dimension of a grid; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_GRID_SIZE_0 = "maxGridSize0";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_GRID_SIZE_1 = "maxGridSize1";
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MAX_GRID_SIZE_2 = "maxGridSize2";
/** is the clock frequency in kilohertz; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_CLOCK_RATE = "clockRate";
/** is the total amount of constant memory available on the device in bytes; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_TOTAL_CONST_MEM = "totalConstMem";
/** is the alignment requirement; texture base addresses that are aligned @n
* to textureAlignment bytes do not need an offset applied to texture fetches; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_TEXTURE_ALIGNMENT = "textureAlignment";
/** is 1 if the device can concurrently copy memory between host and device while executing a kernel, or 0 if not; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_DEVICE_OVERLAP = "deviceOverlap";
/** is the number of multiprocessors on the device; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_MULTI_PROCESSOR_COUNT = "multiProcessorCount";
/** is 1 if there is a run time limit for kernels executed on the device, or 0 if not. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_KERNEL_EXEC_TIMEOUT_ENABLED = "kernelExecTimeoutEnabled";
/** is 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_INTEGRATED = "integrated";
/** is 1 if the device can map host memory into the CUDA address space @n
* for use with cudaHostAlloc()/cudaHostGetDevicePointer(), or 0 if not; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_CAN_MAP_HOST_MEMORY = "canMapHostMemory";
/** is the compute mode that the device is currently in. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_COMPUTE_MODE = "computeMode";
/** Default mode - Device is not restricted and multiple threads can use cudaSetDevice() with this device. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_COMPUTE_MODE_DEFAULT = "cudaComputeModeDefault";
/** Compute-exclusive mode - Only one thread will be able to use cudaSetDevice() with this device. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_COMPUTE_MODE_EXCLUSIVE = "cudaComputeModeExclusive";
/** Compute-prohibited mode - No threads can use cudaSetDevice() with this device. @n
* Any errors from calling cudaSetDevice() with an exclusive (and occupied) or prohibited device @n
* will only show up after a non-device management runtime function is called. @n
* At that time, cudaErrorNoDevice will be returned. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_COMPUTE_MODE_PROHIBITED = "cudaComputeModeProhibited";
/** is 1 if the device supports executing multiple kernels within the same context simultaneously, or 0 if not. @n
* It is not guaranteed that multiple kernels will be resident on the device concurrently @n
* so this feature should not be relied upon for correctness; */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_CONCURRENT_KERNELS = "concurrentKernels";
/** is 1 if the device has ECC support turned on, or 0 if not. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_ECC_ENABLED = "ECCEnabled";
/** is the PCI bus identifier of the device. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_PCI_BUS_ID = "pciBusID";
/** is the PCI device (sometimes called slot) identifier of the device. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_PCI_DEVICE_ID = "pciDeviceID";
/** is 1 if the device is using a TCC driver or 0 if not. */
TBAG_CONSTEXPR char const * const TBAG_GPU_DEVICE_INFO_TCC_DRIVER = "tccDriver";
TBAG_API bool isSupport() TBAG_NOEXCEPT;
TBAG_API int getPlatformCount();
TBAG_API GpuPlatforms getPlatformList ();
TBAG_API GpuPlatformInfo getPlatformInfo (GpuPlatform const & platform);
TBAG_API int getDeviceCount(GpuPlatform const & platform);
TBAG_API GpuDevices getDeviceList (GpuPlatform const & platform);
TBAG_API GpuDeviceInfo getDeviceInfo (GpuDevice const & device);
TBAG_API SharedGpuContext createContext(GpuDevice const & device);
/**
* CudaContext class prototype.
*
* @author username
* @date 2018-01-13
*/
class TBAG_API CudaContext : public GpuContext
{
public:
CudaContext(GpuDevice const & d, GpuId c);
virtual ~CudaContext();
public:
/**
* Occupancy limits on the maximum number of resident thread blocks per multiprocessor.
*/
struct OccupancyInfo
{
std::size_t max_group; ///< Limited by Max Warps or Max Blocks per Multiprocessor.
std::size_t registers; ///< Limited by Registers per Multiprocessor.
std::size_t shared_memory; ///< Limited by Shared Memory per Multiprocessor.
};
public:
Err calcOccupancy(int thread_per_block,
int registers_per_thread,
int shared_memory_per_block,
OccupancyInfo & result);
public:
/**
* Theoretical bandwidth can be calculated using hardware specifications available in the product literature.
*
* @return
* Theoretical bandwidth (GByte/s).
*
* @remarks
* - Note: Some calculations use 1024^3 instead of 10^9 for the final calculation. @n
* In such a case, the bandwidth would be 165.4 GB/s. It is important to @n
* use the same divisor when calculating theoretical and effective @n
* bandwidth so that the comparison is valid. @n
* - Note: When ECC is enabled, the effective maximum bandwidth is reduced by @n
* approximately 20% due to the additional traffic for the memory @n
* checksums, though the exact impact of ECC on bandwidth depends on the @n
* memory access pattern.
*
* @see <http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#theoretical-bandwidth-calculation>
*/
static double calcTheoreticalBandwidth(double memory_clock_rate_ghz,
int memory_interface_bit,
int memory_interface_lane,
bool enable_ecc = false);
/**
* Effective bandwidth is calculated by timing specific program activities
* and by knowing how data is accessed by the program.
*
* @return
* Effective bandwidth (GByte/s).
*
* @remarks
* - Note: Some calculations use 1024^3 instead of 10^9 for the final calculation. @n
* In such a case, the bandwidth would be 165.4 GB/s. It is important to @n
* use the same divisor when calculating theoretical and effective @n
* bandwidth so that the comparison is valid. @n
*
* @see <http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#effective-bandwidth-calculation>
*/
static double calcEffectiveBandwidth(int read_byte_by_kernel, int write_byte_by_kernel, int seconds);
public:
static bool setBankSizeByDefault();
static bool setBankSizeBy4Byte();
static bool setBankSizeBy8Byte();
public:
virtual bool isSupport() const TBAG_NOEXCEPT override;
virtual bool isHost() const TBAG_NOEXCEPT override;
virtual bool isDevice() const TBAG_NOEXCEPT override;
virtual bool isStream() const TBAG_NOEXCEPT override;
virtual Err createStream(GpuStream & stream) const override;
virtual Err releaseStream(GpuStream & stream) const override;
virtual Err createEvent(GpuStream const & stream, GpuEvent & event) const override;
virtual Err syncEvent(GpuEvent const & event) const override;
virtual Err elapsedEvent(GpuEvent & event, float * millisec = nullptr) const override;
virtual Err releaseEvent(GpuEvent & event) const override;
virtual Err malloc(GpuMemory & memory, std::size_t size) const override;
virtual Err free(GpuMemory & memory) const override;
virtual Err mallocHost(HostMemory & memory, std::size_t size, HostMemoryFlag flag = HostMemoryFlag::HMF_DEFAULT) const override;
virtual Err freeHost(HostMemory & memory) const override;
virtual Err write(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err read(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err writeAsync(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err readAsync(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err copy(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err copy(GpuStream const & stream, HostMemory const & src, HostMemory & dest, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err copyAsync(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err copyAsync(GpuStream const & stream, HostMemory const & src, HostMemory & dest, std::size_t size, GpuEvent * event = nullptr) const override;
virtual Err flush(GpuStream const & stream) const override;
virtual Err finish(GpuStream const & stream) const override;
public:
virtual Err fill(GpuStream const & stream, int * out, int data, int count, GpuEvent * event) const override;
virtual Err fill(GpuStream const & stream, unsigned * out, unsigned data, int count, GpuEvent * event) const override;
virtual Err fill(GpuStream const & stream, float * out, float data, int count, GpuEvent * event) const override;
virtual Err fill(GpuStream const & stream, double * out, double data, int count, GpuEvent * event) const override;
virtual Err add(GpuStream const & stream, int const * in1, int const * in2, int * out, int count, GpuEvent * event) const override;
virtual Err add(GpuStream const & stream, unsigned const * in1, unsigned const * in2, unsigned * out, int count, GpuEvent * event) const override;
virtual Err add(GpuStream const & stream, float const * in1, float const * in2, float * out, int count, GpuEvent * event) const override;
virtual Err add(GpuStream const & stream, double const * in1, double const * in2, double * out, int count, GpuEvent * event) const override;
};
} // namespace cuda
} // namespace gpu
// --------------------
NAMESPACE_LIBTBAG_CLOSE
// --------------------
#endif // __INCLUDE_LIBTBAG__LIBTBAG_GPU_CUDA_CUDACONTEXT_HPP__
Source
/**
* @file CudaContext.cpp
* @brief CudaContext class implementation.
* @author username
* @date 2018-01-16
*/
#include <libtbag/gpu/cuda/CudaContext.hpp>
#include <libtbag/log/Log.hpp>
#include <libtbag/string/Format.hpp>
#include <libtbag/string/StringUtils.hpp>
#include <libtbag/gpu/cuda/CudaRaw.h>
#if defined(USE_CUDA)
# include <cuda.h>
# include <cuda_runtime.h>
#else
# include <libtbag/dummy/Cuda.hpp>
using namespace libtbag::dummy::cuda;
#endif
// -------------------
NAMESPACE_LIBTBAG_OPEN
// -------------------
namespace gpu {
namespace cuda {
// ---------------
namespace __impl {
// ---------------
static void startEvent(GpuStream const & stream, GpuEvent * event)
{
cudaError_t code = cudaEventRecord(event->castStart<cudaEvent_t>(), stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogW("startEvent() CUDA cudaEventRecord() error: {}", cudaGetErrorString(code));
}
}
static void stopEvent(GpuStream const & stream, GpuEvent * event)
{
cudaError_t code = cudaEventRecord(event->castStop<cudaEvent_t>(), stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogW("stopEvent() CUDA cudaEventRecord() error: {}", cudaGetErrorString(code));
}
}
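/** RAII helper: records the start event on construction and the stop event on destruction when an event is supplied. */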
struct CudaEventGuard : private Noncopyable
{
GpuStream const & stream;
GpuEvent * event;
CudaEventGuard(GpuStream const & q, GpuEvent * e = nullptr) : stream(q), event(e)
{
if (event != nullptr) {
startEvent(stream, event);
}
}
~CudaEventGuard()
{
if (event != nullptr) {
stopEvent(stream, event);
}
}
};
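/** RAII helper: switches to the requested CUDA device on construction and restores the previous device on destruction. */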
struct CudaDeviceGuard : private Noncopyable
{
int current_id = 0;
int change_id = 0;
bool exchange = false;
explicit CudaDeviceGuard(GpuDevice const & device) : CudaDeviceGuard(device.getDeviceId()) { /* EMPTY. */ }
explicit CudaDeviceGuard(GpuContext const & context) : CudaDeviceGuard(context.getDeviceId()) { /* EMPTY. */ }
CudaDeviceGuard(GpuId const & device_id) : current_id(0), change_id(0), exchange(false)
{
if (device_id == UNKNOWN_ID) {
return;
}
cudaError_t get_code = cudaGetDevice(&current_id);
if (get_code != cudaSuccess) {
tDLogE("CudaDeviceGuard::CudaDeviceGuard() CUDA cudaGetDevice() error: {}", cudaGetErrorString(get_code));
return;
}
change_id = (int)device_id;
if (current_id == change_id) {
return;
}
cudaError_t set_code = cudaSetDevice(change_id);
if (set_code == cudaSuccess) {
exchange = true;
} else {
tDLogE("CudaDeviceGuard::CudaDeviceGuard() CUDA cudaSetDevice() error: {}", cudaGetErrorString(set_code));
}
}
~CudaDeviceGuard()
{
if (exchange == false) {
return;
}
cudaError_t set_code = cudaSetDevice(current_id);
if (set_code != cudaSuccess) {
tDLogE("CudaDeviceGuard::~CudaDeviceGuard() CUDA cudaSetDevice() error: {}", cudaGetErrorString(set_code));
}
}
};
// ------------------
} // namespace __impl
// ------------------
bool isSupport() TBAG_NOEXCEPT
{
#if defined(USE_CUDA)
return true;
#else
return false;
#endif
}
int getPlatformCount()
{
return 1;
}
GpuPlatforms getPlatformList()
{
return {GpuPlatform(GpuType::GT_CUDA, 0)};
}
GpuPlatformInfo getPlatformInfo(GpuPlatform const & platform)
{
checkCudaGpuType(platform);
GpuPlatformInfo info;
info.name = "CUDA";
info.vendor = "NVIDIA";
int driver_version = 0;
cudaError_t driver_code = cudaDriverGetVersion(&driver_version);
if (driver_code == cudaSuccess) {
info.version += string::fformat("DRIVER({})", driver_version);
} else {
tDLogE("getPlatformInfo() CUDA cudaDriverGetVersion() error: {}", cudaGetErrorString(driver_code));
}
int runtime_version = 0;
cudaError_t runtime_code = cudaRuntimeGetVersion(&runtime_version);
if (runtime_code == cudaSuccess) {
info.version += string::fformat("RUNTIME({})", runtime_version);
} else {
tDLogE("getPlatformInfo() CUDA cudaRuntimeGetVersion() error: {}", cudaGetErrorString(runtime_code));
}
#if defined(CUDA_VERSION)
info.version += string::fformat("API({})", TO_STRING(CUDA_VERSION));
#endif
return info;
}
int getDeviceCount(GpuPlatform const & platform)
{
checkCudaGpuType(platform);
int result = 0;
cudaError_t code = cudaGetDeviceCount(&result);
if (code != cudaSuccess) {
tDLogE("getDeviceCount() CUDA cudaGetDeviceCount() error: {}", cudaGetErrorString(code));
return 0;
}
return result;
}
GpuDevices getDeviceList(GpuPlatform const & platform)
{
checkCudaGpuType(platform);
GpuDevices result;
for (int i = 0; i < cuda::getDeviceCount(platform); ++i) {
result.emplace_back(GpuDevice(platform, i));
}
return result;
}
GpuDeviceInfo getDeviceInfo(GpuDevice const & device)
{
checkCudaGpuType(device);
GpuDeviceInfo info;
cudaDeviceProp prop;
cudaError_t code = cudaGetDeviceProperties(&prop, device.getDeviceId());
if (code != cudaSuccess) {
tDLogE("getDeviceInfo() CUDA cudaGetDeviceProperties() error: {}", cudaGetErrorString(code));
return info;
}
info.name = prop.name;
info.device_version = string::fformat("{}.{}", prop.major, prop.minor);
info.global_memory = prop.totalGlobalMem;
int driver_version = 0;
cudaError_t driver_code = cudaDriverGetVersion(&driver_version);
if (driver_code == cudaSuccess) {
info.driver_version = std::to_string(driver_version);
} else {
tDLogE("getDeviceInfo() CUDA cudaDriverGetVersion() error: {}", cudaGetErrorString(driver_code));
}
info.insert(TBAG_GPU_DEVICE_INFO_SHARED_MEM_PER_BLOCK , prop.sharedMemPerBlock);
info.insert(TBAG_GPU_DEVICE_INFO_REGS_PER_BLOCK , prop.regsPerBlock);
info.insert(TBAG_GPU_DEVICE_INFO_WARP_SIZE , prop.warpSize);
info.insert(TBAG_GPU_DEVICE_INFO_MEM_PITCH , prop.memPitch);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_THREADS_PER_BLOCK , prop.maxThreadsPerBlock);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_THREADS_DIM_0 , prop.maxThreadsDim[0]);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_THREADS_DIM_1 , prop.maxThreadsDim[1]);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_THREADS_DIM_2 , prop.maxThreadsDim[2]);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_GRID_SIZE_0 , prop.maxGridSize[0]);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_GRID_SIZE_1 , prop.maxGridSize[1]);
info.insert(TBAG_GPU_DEVICE_INFO_MAX_GRID_SIZE_2 , prop.maxGridSize[2]);
info.insert(TBAG_GPU_DEVICE_INFO_CLOCK_RATE , prop.clockRate);
info.insert(TBAG_GPU_DEVICE_INFO_TOTAL_CONST_MEM , prop.totalConstMem);
info.insert(TBAG_GPU_DEVICE_INFO_TEXTURE_ALIGNMENT , prop.textureAlignment);
info.insert(TBAG_GPU_DEVICE_INFO_DEVICE_OVERLAP , prop.deviceOverlap);
info.insert(TBAG_GPU_DEVICE_INFO_MULTI_PROCESSOR_COUNT , prop.multiProcessorCount);
info.insert(TBAG_GPU_DEVICE_INFO_KERNEL_EXEC_TIMEOUT_ENABLED, prop.kernelExecTimeoutEnabled);
info.insert(TBAG_GPU_DEVICE_INFO_INTEGRATED , prop.integrated);
info.insert(TBAG_GPU_DEVICE_INFO_CAN_MAP_HOST_MEMORY , prop.canMapHostMemory);
if (prop.computeMode == cudaComputeModeExclusive) {
info.insert(TBAG_GPU_DEVICE_INFO_COMPUTE_MODE, TBAG_GPU_DEVICE_INFO_COMPUTE_MODE_EXCLUSIVE);
} else if (prop.computeMode == cudaComputeModeProhibited) {
info.insert(TBAG_GPU_DEVICE_INFO_COMPUTE_MODE, TBAG_GPU_DEVICE_INFO_COMPUTE_MODE_PROHIBITED);
} else {
assert(prop.computeMode == cudaComputeModeDefault);
info.insert(TBAG_GPU_DEVICE_INFO_COMPUTE_MODE, TBAG_GPU_DEVICE_INFO_COMPUTE_MODE_DEFAULT);
}
info.insert(TBAG_GPU_DEVICE_INFO_CONCURRENT_KERNELS , prop.concurrentKernels);
info.insert(TBAG_GPU_DEVICE_INFO_ECC_ENABLED , prop.ECCEnabled);
info.insert(TBAG_GPU_DEVICE_INFO_PCI_BUS_ID , prop.pciBusID);
info.insert(TBAG_GPU_DEVICE_INFO_PCI_DEVICE_ID , prop.pciDeviceID);
info.insert(TBAG_GPU_DEVICE_INFO_TCC_DRIVER , prop.tccDriver);
return info;
}
SharedGpuContext createContext(GpuDevice const & device)
{
checkCudaGpuType(device);
return SharedGpuContext(new CudaContext(device, 0));
}
// ---------------------------
// CudaContext implementation.
// ---------------------------
CudaContext::CudaContext(GpuDevice const & d, GpuId c) : GpuContext(d, c)
{
// EMPTY.
}
CudaContext::~CudaContext()
{
// EMPTY.
}
Err CudaContext::calcOccupancy(int thread_per_block,
int registers_per_thread,
int shared_memory_per_block,
OccupancyInfo & result)
{
// Streaming Processor (SP) == {CUDA CORE}
// Streaming Multiprocessors (SM) == Multiple SP
// GPC (Graphics Processing Clusters) == Multiple SM
// Graphic Card == Multiple SM
cudaDeviceProp prop;
cudaError_t code = cudaGetDeviceProperties(&prop, castDeviceId<int>());
if (code != cudaSuccess) {
tDLogE("CudaContext::calcOccupancy() CUDA cudaGetDeviceProperties() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
int const NUMBER_OF_SM = prop.multiProcessorCount;
int const NUMBER_OF_SP = 0; // Number of CUDA cores.
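// Illustrative sketch only: assuming the standard cudaDeviceProp fields, the
// OccupancyInfo limits could be derived along these lines:
//   result.max_group     = prop.maxThreadsPerMultiProcessor / thread_per_block;
//   result.registers     = prop.regsPerMultiprocessor / (registers_per_thread * thread_per_block);
//   result.shared_memory = prop.sharedMemPerMultiprocessor / shared_memory_per_block;
// The achievable number of resident blocks per SM is the minimum of the three limits.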
return Err::E_SUCCESS;
}
double CudaContext::calcTheoreticalBandwidth(double memory_clock_rate_ghz,
int memory_interface_bit,
int memory_interface_lane,
bool enable_ecc)
{
double const MEMORY_CLOCK_RATE_HZ = memory_clock_rate_ghz * 1.0e+9; // GHz -> Hz
double const MEMORY_INTERFACE_BYTE = memory_interface_bit / (double)8; // Bit -> Byte
double const BANDWIDTH_BYTE = MEMORY_CLOCK_RATE_HZ * MEMORY_INTERFACE_BYTE * memory_interface_lane;
double const BANDWIDTH_GBYTE = BANDWIDTH_BYTE / 1.0e+9; // Byte -> GByte
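// Illustrative example (figures from the NVIDIA best-practices guide, assuming
// memory_interface_lane models the DDR factor of 2): a 1.85 GHz memory clock with
// a 384-bit interface gives 1.85e9 * (384 / 8) * 2 / 1e9 = 177.6 GB/s,
// or about 165.4 GB/s when dividing by 1024^3 instead of 10^9.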
if (enable_ecc) {
return BANDWIDTH_GBYTE * 0.8; // Reduced by approximately 20% when ECC is enabled.
} else {
return BANDWIDTH_GBYTE;
}
}
double CudaContext::calcEffectiveBandwidth(int read_byte_by_kernel, int write_byte_by_kernel, int seconds)
{
int const RW_BYTE = read_byte_by_kernel + write_byte_by_kernel;
double const RW_GBYTE = RW_BYTE / 1.0e+9; // Byte -> GByte
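// Illustrative example: a kernel that reads 1e9 bytes and writes 1e9 bytes in
// 1 second has an effective bandwidth of (2e9 / 1e9) / 1 = 2 GB/s.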
return RW_GBYTE / seconds;
}
bool CudaContext::setBankSizeByDefault()
{
return cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeDefault) == cudaSuccess;
}
bool CudaContext::setBankSizeBy4Byte()
{
return cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte) == cudaSuccess;
}
bool CudaContext::setBankSizeBy8Byte()
{
return cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) == cudaSuccess;
}
// @formatter:off
bool CudaContext::isSupport() const TBAG_NOEXCEPT { return cuda::isSupport(); }
bool CudaContext::isHost () const TBAG_NOEXCEPT { return false; }
bool CudaContext::isDevice () const TBAG_NOEXCEPT { return true; }
// @formatter:on
bool CudaContext::isStream() const TBAG_NOEXCEPT
{
cudaDeviceProp prop;
cudaError_t code = cudaGetDeviceProperties(&prop, castDeviceId<int>());
if (code != cudaSuccess) {
tDLogE("CudaContext::isStream() CUDA cudaGetDeviceProperties() error: {}", cudaGetErrorString(code));
return false;
}
return prop.deviceOverlap == 1;
}
Err CudaContext::createStream(GpuStream & stream) const
{
if (stream.isSameContext(*this) == false) {
return Err::E_ILLARGS;
}
cudaStream_t native_stream;
cudaError_t code = cudaStreamCreate(&native_stream);
if (code != cudaSuccess) {
tDLogE("CudaContext::createStream() CUDA cudaStreamCreate() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
stream.setId(native_stream);
return Err::E_SUCCESS;
}
Err CudaContext::releaseStream(GpuStream & stream) const
{
if (stream.validate(*this) == false) {
return Err::E_ILLARGS;
}
cudaError_t code = cudaStreamDestroy(stream.castId<cudaStream_t>());
stream.clearId();
if (code != cudaSuccess) {
tDLogE("CudaContext::releaseStream() CUDA cudaStreamDestroy() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::createEvent(GpuStream const & stream, GpuEvent & event) const
{
if (stream.validate(*this) == false || event.isSameContext(*this) == false) {
return Err::E_ILLARGS;
}
cudaEvent_t native_start;
cudaError_t start_code = cudaEventCreate(&native_start);
if (start_code != cudaSuccess) {
tDLogE("CudaContext::createEvent() CUDA cudaEventCreate() start error: {}", cudaGetErrorString(start_code));
return Err::E_CUDA;
}
cudaEvent_t native_stop;
cudaError_t stop_code = cudaEventCreate(&native_stop);
if (stop_code != cudaSuccess) {
cudaEventDestroy(native_start);
tDLogE("CudaContext::createEvent() CUDA cudaEventCreate() stop error: {}", cudaGetErrorString(stop_code));
return Err::E_CUDA;
}
event.setStart(native_start);
event.setStop(native_stop);
return Err::E_SUCCESS;
}
Err CudaContext::syncEvent(GpuEvent const & event) const
{
if (event.validate(*this) == false) {
return Err::E_ILLARGS;
}
cudaError_t start_code = cudaEventSynchronize(event.castStart<cudaEvent_t>());
cudaError_t stop_code = cudaEventSynchronize(event.castStop<cudaEvent_t>());
if (start_code != cudaSuccess || stop_code != cudaSuccess) {
tDLogE("CudaContext::syncEvent() CUDA cudaEventSynchronize() error: start({}), stop({})",
cudaGetErrorString(start_code), cudaGetErrorString(stop_code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::elapsedEvent(GpuEvent & event, float * millisec) const
{
if (event.validate(*this) == false) {
return Err::E_ILLARGS;
}
float elapsed_time = 0.0f;
cudaError_t code = cudaEventElapsedTime(&elapsed_time, event.castStart<cudaEvent_t>(), event.castStop<cudaEvent_t>());
if (code != cudaSuccess) {
tDLogE("CudaContext::elapsedEvent() CUDA cudaEventElapsedTime() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
if (millisec != nullptr) {
*millisec = elapsed_time;
}
return Err::E_SUCCESS;
}
Err CudaContext::releaseEvent(GpuEvent & event) const
{
if (event.validate(*this) == false) {
return Err::E_ILLARGS;
}
cudaError_t start_code = cudaEventDestroy(event.castStart<cudaEvent_t>());
cudaError_t stop_code = cudaEventDestroy(event.castStop<cudaEvent_t>());
event.clearIds();
if (start_code != cudaSuccess || stop_code != cudaSuccess) {
tDLogE("CudaContext::releaseEvent() CUDA cudaEventDestroy() error: start({}), stop({})",
cudaGetErrorString(start_code), cudaGetErrorString(stop_code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::malloc(GpuMemory & memory, std::size_t size) const
{
if (memory.isSameContext(*this) == false) {
return Err::E_ILLARGS;
}
__impl::CudaDeviceGuard const LOCK(*this);
void * data = nullptr;
cudaError_t code = cudaMalloc(&data, size);
if (code != cudaSuccess) {
tDLogE("CudaContext::malloc({}) CUDA cudaMalloc() error: {}", size, cudaGetErrorString(code));
return Err::E_CUDA;
}
memory.set(data, size, size);
tDLogIfD(isGpuVerbose(), "CudaContext::malloc({}) CUDA cudaMalloc() MEM:{} CAP:{} SIZE:{}",
size, memory.data(), memory.capacity(), memory.size());
return Err::E_SUCCESS;
}
Err CudaContext::free(GpuMemory & memory) const
{
if (memory.validate(*this) == false) {
return Err::E_ILLARGS;
}
tDLogIfD(isGpuVerbose(), "CudaContext::free() CUDA cudaFree() MEM:{} CAP:{} SIZE:{}",
memory.data(), memory.capacity(), memory.size());
__impl::CudaDeviceGuard const LOCK(*this);
cudaError_t code = cudaFree(memory.data());
memory.clear();
if (code != cudaSuccess) {
tDLogE("CudaContext::free() CUDA cudaFree() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::mallocHost(HostMemory & memory, std::size_t size, HostMemoryFlag flag) const
{
if (memory.isSameContext(*this) == false) {
return Err::E_ILLARGS;
}
if (HostMemoryFlag::HMF_DEFAULT == flag) {
return GpuContext::mallocHost(memory, size, HostMemoryFlag::HMF_DEFAULT);
}
if (HostMemoryFlag::HMF_PINNED != flag) {
tDLogE("CudaContext::mallocHost() Unsupported flag: {}", static_cast<int>(flag));
return Err::E_ILLARGS;
}
assert(flag == HostMemoryFlag::HMF_PINNED);
__impl::CudaDeviceGuard const LOCK(*this);
void * data = nullptr;
cudaError_t code = cudaMallocHost(&data, size);
if (code != cudaSuccess) {
tDLogE("CudaContext::mallocHost({}) CUDA cudaMallocHost() error: {}", size, cudaGetErrorString(code));
return Err::E_CUDA;
}
memory.set(data, size, size, flag);
tDLogIfD(isGpuVerbose(), "CudaContext::mallocHost({}) CUDA cudaMallocHost() MEM:{} CAP:{} SIZE:{}",
size, memory.data(), memory.capacity(), memory.size());
return Err::E_SUCCESS;
}
Err CudaContext::freeHost(HostMemory & memory) const
{
if (memory.validate(*this) == false) {
return Err::E_ILLARGS;
}
tDLogIfD(isGpuVerbose(), "CudaContext::freeHost() CUDA cudaFreeHost() MEM:{} CAP:{} SIZE:{}",
memory.data(), memory.capacity(), memory.size());
__impl::CudaDeviceGuard const LOCK(*this);
cudaError_t code = cudaFreeHost(memory.data());
memory.clear();
if (code != cudaSuccess) {
tDLogE("CudaContext::free() CUDA cudaFreeHost() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::write(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpy(gpu_mem.data(), host_mem.data(), size, cudaMemcpyHostToDevice);
if (code != cudaSuccess) {
tDLogE("CudaContext::write() CUDA cudaMemcpy() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::read(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpy(host_mem.data(), gpu_mem.data(), size, cudaMemcpyDeviceToHost);
if (code != cudaSuccess) {
tDLogE("CudaContext::read() CUDA cudaMemcpy() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::writeAsync(GpuStream const & stream, GpuMemory & gpu_mem, HostMemory const & host_mem, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpyAsync(gpu_mem.data(), host_mem.data(), size, cudaMemcpyHostToDevice,
stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogE("CudaContext::writeAsync() CUDA cudaMemcpyAsync() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::readAsync(GpuStream const & stream, GpuMemory const & gpu_mem, HostMemory & host_mem, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, gpu_mem, host_mem, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpyAsync(host_mem.data(), gpu_mem.data(), size, cudaMemcpyDeviceToHost,
stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogE("CudaContext::readAsync() CUDA cudaMemcpyAsync() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::copy(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, src, dest, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpy(dest.data(), src.data(), size, cudaMemcpyDeviceToDevice);
if (code != cudaSuccess) {
tDLogE("CudaContext::copy() CUDA cudaMemcpy(D2D) error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::copy(GpuStream const & stream, HostMemory const & src, HostMemory & dest, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, src, dest, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpy(dest.data(), src.data(), size, cudaMemcpyHostToHost);
if (code != cudaSuccess) {
tDLogE("CudaContext::copy() CUDA cudaMemcpy(H2H) error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::copyAsync(GpuStream const & stream, GpuMemory const & src, GpuMemory & dest, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, src, dest, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpyAsync(dest.data(), src.data(), size, cudaMemcpyDeviceToDevice,
stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogE("CudaContext::copyAsync() CUDA cudaMemcpyAsync(D2D) error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::copyAsync(GpuStream const & stream, HostMemory const & src, HostMemory & dest, std::size_t size, GpuEvent * event) const
{
if (validateMemory(stream, src, dest, size) == false) {
return Err::E_ILLARGS;
}
__impl::CudaEventGuard const EVENT_LOCK(stream, event);
cudaError_t code = cudaMemcpyAsync(dest.data(), src.data(), size, cudaMemcpyHostToHost,
stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogE("CudaContext::copyAsync() CUDA cudaMemcpyAsync(H2H) error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
Err CudaContext::flush(GpuStream const & stream) const
{
if (stream.validate(*this) == false) {
return Err::E_ILLARGS;
}
return Err::E_SUCCESS;
}
Err CudaContext::finish(GpuStream const & stream) const
{
if (stream.validate(*this) == false) {
return Err::E_ILLARGS;
}
cudaError_t code = cudaStreamSynchronize(stream.castId<cudaStream_t>());
if (code != cudaSuccess) {
tDLogE("CudaContext::finish() CUDA cudaDeviceSynchronize() error: {}", cudaGetErrorString(code));
return Err::E_CUDA;
}
return Err::E_SUCCESS;
}
// @formatter:off
Err CudaContext::fill(GpuStream const & stream, int * out, int data, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaFill_i(out, data, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
Err CudaContext::fill(GpuStream const & stream, unsigned * out, unsigned data, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaFill_u(out, data, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
Err CudaContext::fill(GpuStream const & stream, float * out, float data, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaFill_f(out, data, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
Err CudaContext::fill(GpuStream const & stream, double * out, double data, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaFill_d(out, data, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
// @formatter:on
// @formatter:off
Err CudaContext::add(GpuStream const & stream, int const * in1, int const * in2, int * out, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaAdd_i(in1, in2, out, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
Err CudaContext::add(GpuStream const & stream, unsigned const * in1, unsigned const * in2, unsigned * out, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaAdd_u(in1, in2, out, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
Err CudaContext::add(GpuStream const & stream, float const * in1, float const * in2, float * out, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaAdd_f(in1, in2, out, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
Err CudaContext::add(GpuStream const & stream, double const * in1, double const * in2, double * out, int count, GpuEvent * event) const
{ __impl::CudaEventGuard g(stream, event); return tbCudaAdd_d(in1, in2, out, count) == TB_TRUE ? Err::E_SUCCESS : Err::E_UNKNOWN; }
// @formatter:on
} // namespace cuda
} // namespace gpu
// --------------------
NAMESPACE_LIBTBAG_CLOSE
// --------------------