diff --git a/src/core/cuda_kernel.cpp b/src/core/cuda_kernel.cpp
index 5a798f5..458417c 100644
--- a/src/core/cuda_kernel.cpp
+++ b/src/core/cuda_kernel.cpp
@@ -146,3 +146,42 @@ void Kernel::setSizes(dim3 globalSize, dim3 localSize)
    _gridDim = globalSize;
    _blockDim = localSize;
 }
+
+
+
+
+
+
+/*!
+ * Sets the amount of dynamic shared memory per thread block to allocate.
+ *
+ * @param size Dynamic shared memory size per thread block in bytes.
+ */
+void Kernel::setSharedMemory(unsigned int size)
+{
+   _sharedMemBytes = size;
+}
+
+
+
+
+
+
+/*!
+ * Get the occupancy of this kernel for a given block size.
+ *
+ * @param blockSize Number of threads per block.
+ * @return Maximum number of active thread blocks per multiprocessor.
+ */
+int Kernel::getMaxActiveBlocksPerMultiprocessor(int blockSize) const
+{
+   int numBlocks;
+   CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, _kernel, blockSize, _sharedMemBytes);
+   if ( result != CUDA_SUCCESS )
+   {
+      E_MAKE_EXCEPTION(e);
+      throwError(&e,result);
+   }
+
+   return numBlocks;
+}
diff --git a/src/core/cuda_kernel.h b/src/core/cuda_kernel.h
index e015162..b0500e6 100644
--- a/src/core/cuda_kernel.h
+++ b/src/core/cuda_kernel.h
@@ -30,7 +30,8 @@ namespace CUDA
       void setSizes(dim3 globalSize, dim3 localSize);
       template<class T> void setArgument(int index, T value);
       template<class T> void setBuffer(int index, Buffer* buffer);
-      void setSharedMemory(unsigned int size) { _sharedMemBytes = size; }
+      void setSharedMemory(unsigned int size);
+      int getMaxActiveBlocksPerMultiprocessor(int blockSize) const;
    private:
       /*!
        * The CUDA kernel of this object.
diff --git a/src/example/core/mathtransform_cuda_kernel.cpp b/src/example/core/mathtransform_cuda_kernel.cpp
index ff03374..9137589 100644
--- a/src/example/core/mathtransform_cuda_kernel.cpp
+++ b/src/example/core/mathtransform_cuda_kernel.cpp
@@ -53,10 +53,10 @@ ::CUDA::Event MathTransform::CUDA::Kernel::execute(const ::CUDA::Stream& stream,
    // Set the work sizes. The global work size is determined by the row size, but
    // it must also be a multiple of the local work size, so it is rounded up
    // accordingly.
-   int localWorkSize = 1;
-   int workgroupSize = (buffer->size() + localWorkSize - 1) / localWorkSize;
+   int blockSize = 1;
+   int gridSize = (buffer->size() + blockSize - 1) / blockSize;
 
-   setSizes(workgroupSize * localWorkSize, localWorkSize);
+   setSizes(gridSize, blockSize);
 
    // Execute this object's CUDA kernel with the given stream, returning its
    // generated CUDA event.