diff --git a/src/core/cuda_kernel.cpp b/src/core/cuda_kernel.cpp
index 5a798f5..458417c 100644
--- a/src/core/cuda_kernel.cpp
+++ b/src/core/cuda_kernel.cpp
@@ -146,3 +146,42 @@ void Kernel::setSizes(dim3 globalSize, dim3 localSize)
    _gridDim = globalSize;
    _blockDim = localSize;
 }
+
+
+
+
+
+
+/*!
+ * Sets the amount of dynamic shared memory per thread block to allocate.
+ *
+ * @param size Dynamic shared memory size per thread block in bytes.
+ */
+void Kernel::setSharedMemory(unsigned int size)
+{
+   _sharedMemBytes = size;
+}
+
+
+
+
+
+
+/*!
+ * Get the occupancy of this kernel for a given block size.
+ *
+ * @param blockSize Number of threads per block.
+ * @return Maximum number of active thread blocks per multiprocessor.
+ */
+int Kernel::getMaxActiveBlocksPerMultiprocessor(int blockSize) const
+{
+   int numBlocks;
+   CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, _kernel, blockSize, _sharedMemBytes);
+   if ( result != CUDA_SUCCESS )
+   {
+      E_MAKE_EXCEPTION(e);
+      throwError(&e,result);
+   }
+
+   return numBlocks;
+}
diff --git a/src/core/cuda_kernel.h b/src/core/cuda_kernel.h
index e015162..b0500e6 100644
--- a/src/core/cuda_kernel.h
+++ b/src/core/cuda_kernel.h
@@ -30,7 +30,8 @@ namespace CUDA
       void setSizes(dim3 globalSize, dim3 localSize);
       template<class T> void setArgument(int index, T value);
       template<class T> void setBuffer(int index, Buffer* buffer);
-      void setSharedMemory(unsigned int size) { _sharedMemBytes = size; }
+      void setSharedMemory(unsigned int size);
+      int getMaxActiveBlocksPerMultiprocessor(int blockSize) const;
    private:
       /*!
        * The CUDA kernel of this object.
diff --git a/src/example/core/mathtransform_cuda_kernel.cpp b/src/example/core/mathtransform_cuda_kernel.cpp
index ff03374..9137589 100644
--- a/src/example/core/mathtransform_cuda_kernel.cpp
+++ b/src/example/core/mathtransform_cuda_kernel.cpp
@@ -53,10 +53,10 @@ ::CUDA::Event MathTransform::CUDA::Kernel::execute(const ::CUDA::Stream& stream,
    // Set the work sizes. The global work size is determined by the row size, but
    // it must also be a multiple of the local work size, so it is rounded up
    // accordingly.
-   int localWorkSize = 1;
-   int workgroupSize = (buffer->size() + localWorkSize - 1) / localWorkSize;
+   int blockSize = 1;
+   int gridSize = (buffer->size() + blockSize - 1) / blockSize;
 
-   setSizes(workgroupSize * localWorkSize, localWorkSize);
+   setSizes(gridSize, blockSize);
 
    // Execute this object's CUDA kernel with the given stream, returning its
    // generated CUDA event.