38 changes: 38 additions & 0 deletions src/core/cuda_kernel.cpp
@@ -146,3 +146,41 @@ void Kernel::setSizes(dim3 globalSize, dim3 localSize)
_gridDim = globalSize;
_blockDim = localSize;
}






/*!
* Sets the amount of dynamic shared memory to allocate per thread block.
*
* @param size Dynamic shared memory size per thread block in bytes.
*/
void Kernel::setSharedMemory(unsigned int size)
{
_sharedMemBytes = size;
}






/*!
* Get the maximum number of active blocks per multiprocessor for this kernel
* at a given block size, which measures the kernel's occupancy.
*
* @param blockSize Number of threads per block.
*
* @return Maximum number of active thread blocks per multiprocessor.
*/
int Kernel::getMaxActiveBlocksPerMultiprocessor(int blockSize) const
{
int numBlocks;
CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, _kernel, blockSize, _sharedMemBytes);
if ( result != CUDA_SUCCESS )
{
E_MAKE_EXCEPTION(e);
throwError(&e,result);
}

return numBlocks;
}
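The two additions pair naturally: a caller sets the dynamic shared memory first, then asks how many blocks of a candidate size can be resident on one multiprocessor. A minimal sketch of that usage, assuming it runs inside a CUDA::Kernel subclass (for example an execute() override) where the methods are accessible, with an initialized CUDA driver context and illustrative numbers only:

setSharedMemory(4096);                              // request 4 KiB of dynamic shared memory per block

int blockSize = 256;                                // candidate threads per block
int blocksPerSM = getMaxActiveBlocksPerMultiprocessor(blockSize);

// Theoretical occupancy: resident threads versus the per-SM hardware limit.
// 2048 is a typical CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR value;
// real code should query it with cuDeviceGetAttribute().
float occupancy = float(blocksPerSM * blockSize) / 2048.0f;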
3 changes: 2 additions & 1 deletion src/core/cuda_kernel.h
@@ -30,7 +30,8 @@ namespace CUDA
void setSizes(dim3 globalSize, dim3 localSize);
template<class T> void setArgument(int index, T value);
template<class T> void setBuffer(int index, Buffer<T>* buffer);
- void setSharedMemory(unsigned int size) { _sharedMemBytes = size; }
+ void setSharedMemory(unsigned int size);
+ int getMaxActiveBlocksPerMultiprocessor(int blockSize) const;
private:
/*!
* The CUDA kernel of this object.
6 changes: 3 additions & 3 deletions src/example/core/mathtransform_cuda_kernel.cpp
@@ -53,10 +53,10 @@ ::CUDA::Event MathTransform::CUDA::Kernel::execute(const ::CUDA::Stream& stream,
// Set the work sizes. The grid size is the buffer size divided by the block
// size, rounded up so that the total number of threads covers every element.
- int localWorkSize = 1;
- int workgroupSize = (buffer->size() + localWorkSize - 1) / localWorkSize;
+ int blockSize = 1;
+ int gridSize = (buffer->size() + blockSize - 1) / blockSize;

- setSizes(workgroupSize * localWorkSize, localWorkSize);
+ setSizes(gridSize, blockSize);

// Execute this object's CUDA kernel with the given stream, returning its
// generated CUDA event.
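For reference, the round-up in the grid-size computation above guarantees the launch covers every buffer element even when the size is not a multiple of the block size. A worked sketch with hypothetical numbers (the 1000-element buffer and 256-thread block are illustrative, not values from the example kernel):

int elements = 1000;                                     // hypothetical buffer->size()
int blockSize = 256;                                     // hypothetical threads per block
int gridSize = (elements + blockSize - 1) / blockSize;   // (1000 + 255) / 256 = 4 blocks
// 4 * 256 = 1024 threads cover all 1000 elements; the kernel body is expected
// to bounds-check the 24 surplus threads.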