Hello,
I need to process data stored in a three-dimensional array.
When I processed it plane by plane, there was no problem:
texture< uchar, 3, cudaReadModeNormalizedFloat > tex;

const dim3 blockSize( 16, 16, 1 );

__global__ void my_kernel( float *d_output, uint imageW, uint imageH, uint imageD, uint z )
{
    uint x = __umul24( blockIdx.x, blockDim.x ) + threadIdx.x;
    uint y = __umul24( blockIdx.y, blockDim.y ) + threadIdx.y;
    if ( ( x >= imageW ) || ( y >= imageH ) || ( z >= imageD ) ) return;
    .....
    .....
    uint i = z * imageW * imageH + y * imageW + x;
    d_output[ i ] = something;
}

extern "C"
{
    int run_kernel( cudaArray * dInArray, void * dOutData, int nWidth, int nHeight, int nDepth )
    {
        // one extra block per dimension covers sizes that are not multiples of the block size
        dim3 gridSize( nWidth / blockSize.x + 1, nHeight / blockSize.y + 1, 1 );
        // launch the kernel once per z-plane
        for ( int z = 0; z < nDepth; z++ )
        {
            my_kernel <<< gridSize, blockSize >>> ( (float *)( dOutData ), nWidth, nHeight, nDepth, z );
        }
        return 0;
    }
}
However, when I tried to use a three-dimensional grid, the kernel was not executed:
texture< uchar, 3, cudaReadModeNormalizedFloat > tex;

const dim3 blockSize( 8, 8, 8 );

__global__ void my_kernel( float *d_output, uint imageW, uint imageH, uint imageD )
{
    uint x = __umul24( blockIdx.x, blockDim.x ) + threadIdx.x;
    uint y = __umul24( blockIdx.y, blockDim.y ) + threadIdx.y;
    uint z = __umul24( blockIdx.z, blockDim.z ) + threadIdx.z;
    if ( ( x >= imageW ) || ( y >= imageH ) || ( z >= imageD ) ) return;
    .....
    .....
    uint i = z * imageW * imageH + y * imageW + x;
    d_output[ i ] = something;
}

extern "C"
{
    int run_kernel( cudaArray * dInArray, void * dOutData, int nWidth, int nHeight, int nDepth )
    {
        // three-dimensional grid: a single launch covers the whole volume
        dim3 gridSize( nWidth / blockSize.x + 1, nHeight / blockSize.y + 1, nDepth / blockSize.z + 1 );
        my_kernel <<< gridSize, blockSize >>> ( (float *)( dOutData ), nWidth, nHeight, nDepth );
        return 0;
    }
}
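To rule out a silent launch failure, I suppose a check could be added right after the launch, something like this (a minimal sketch, not my exact code; it needs <cstdio> for printf):

// right after the my_kernel <<< gridSize, blockSize >>> ( ... ) call:
cudaError_t err = cudaGetLastError();          // reports launch errors, e.g. an invalid configuration
if ( err != cudaSuccess )
    printf( "kernel launch failed: %s\n", cudaGetErrorString( err ) );
err = cudaDeviceSynchronize();                 // waits for the kernel and reports execution errors
if ( err != cudaSuccess )
    printf( "kernel execution failed: %s\n", cudaGetErrorString( err ) );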
I do not understand the problem. I have heard that the maximum number of threads per block on the GTX 480 is 1024, and in both cases the block is smaller (16 x 16 x 1 = 256 threads and 8 x 8 x 8 = 512 threads).
Are there also any grid size restrictions?
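If it helps, this is how I would query the limits of my card (a small standalone sketch using cudaGetDeviceProperties; device 0 is assumed):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    // query the block and grid limits of device 0
    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, 0 );
    printf( "maxThreadsPerBlock: %d\n", prop.maxThreadsPerBlock );
    printf( "maxThreadsDim:      %d x %d x %d\n",
            prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2] );
    printf( "maxGridSize:        %d x %d x %d\n",
            prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2] );
    return 0;
}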
Thank you for your help.
Jakub