[beginner] memory access

if I put width = 65535 and height = 1 execution time ~ 0.47 sec
if I put width = 1 and height =65535 execution time ~ 0.06 sec WHY?

and then…also this:

if I use nThreads(16,16) with dimBlockW = (sizeW+15)/16; dimBlockH = (sizeH+15)/16; it is faster than nThreads(32,16) with dimBlockW = (sizeW+31)/32; dimBlockH = (sizeH+15)/16; WHY? With nThreads(16,16), isn't memory access coalesced? Or is it? :wallbash:

for example :

// Toy kernel: each thread handles one pixel (x, y) of a w x h row-major
// image, accumulates `step` scaled copies of its input sample, and writes
// the result into the output image.
// Expected launch: a 2D grid of 2D blocks covering the image; threads
// outside the image exit via the bounds guard.
// NOTE(review): dataIn and dataOut are assumed to be device pointers to at
// least w*h elements each — confirm against the caller.
__global__ void kernel(float *dataIn, uchar4 *dataOut, int w, int h, int step)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < w) && (y < h))
    {
        int idx = y * w + x;   // row-major linear index

        // BUG FIX: `data` was accumulated without ever being initialized
        // (undefined behavior). Start the accumulator at zero.
        float data = 0.0f;

        // The product is loop-invariant, so compute it once; use a float
        // literal (0.2f) instead of the double literal 0.2.
        float v = __fmul_rn(dataIn[idx], 0.2f);
        for (int k = 0; k < step; k++)
            data += v;

        dataOut[idx] = make_uchar4(data, 44, 44, 44);
    }
}

// Host driver: allocates a sizeW x sizeH image on the device, times a
// single kernel launch with the cutil timers, and releases the buffers.
// BUG FIX: `char* argv` is not a valid signature for main; it must be
// `char **argv` (or `char *argv[]`).
int main(int argc, char **argv)
{
    float timeCuda;        // elapsed kernel time, in seconds

    int sizeW = 65535;     // image width  (pixels)
    int sizeH = 1;         // image height (pixels)
    int step  = 15000;     // per-pixel accumulation count

    float  *dataIn  = NULL;
    uchar4 *dataOut = NULL;

    // Check allocation results — an unchecked failed cudaMalloc makes the
    // kernel fault silently later.
    cutilSafeCall(cudaMalloc(&dataOut, sizeW * sizeH * sizeof(uchar4)));
    cutilSafeCall(cudaMalloc(&dataIn,  sizeW * sizeH * sizeof(float)));

    // BUG FIX: dataIn was read by the kernel but never written, so the
    // timing ran on uninitialized device memory. Zero it first.
    cutilSafeCall(cudaMemset(dataIn, 0, sizeW * sizeH * sizeof(float)));

    // One thread per pixel; ceil-divide so partial blocks cover the edges.
    dim3 nThreads(32, 16);
    int dimBlockW = (sizeW + 31) / 32;
    int dimBlockH = (sizeH + 15) / 16;
    dim3 nBlocks(dimBlockW, dimBlockH);

    unsigned int timer = 0;

    printf("\n CUDA start\n");

    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));

    kernel<<< nBlocks, nThreads >>>(dataIn, dataOut, sizeW, sizeH, step);

    // Kernel launches are asynchronous: synchronize before stopping the
    // timer so the measurement covers the whole kernel execution.
    cutilSafeCall(cudaThreadSynchronize());

    cutilCheckError(cutStopTimer(timer));

    timeCuda = cutGetTimerValue(timer) / 1000;   // ms -> s

    printf("\n CUDA end time: %f (s)\n", timeCuda);

    cutilCheckError(cutDeleteTimer(timer));

    cudaFree(dataOut);
    cudaFree(dataIn);

    return 0;
}

if I put width = 65535 and height = 1 execution time ~ 0.47 sec
if I put width = 1 and height =65535 execution time ~ 0.06 sec WHY?

and then…also this:

if I use nThreads(16,16) with dimBlockW = (sizeW+15)/16; dimBlockH = (sizeH+15)/16; it is faster than nThreads(32,16) with dimBlockW = (sizeW+31)/32; dimBlockH = (sizeH+15)/16; WHY? With nThreads(16,16), isn't memory access coalesced? Or is it? :wallbash:

for example :

// Toy kernel: each thread handles one pixel (x, y) of a w x h row-major
// image, accumulates `step` scaled copies of its input sample, and writes
// the result into the output image.
// Expected launch: a 2D grid of 2D blocks covering the image; threads
// outside the image exit via the bounds guard.
// NOTE(review): dataIn and dataOut are assumed to be device pointers to at
// least w*h elements each — confirm against the caller.
__global__ void kernel(float *dataIn, uchar4 *dataOut, int w, int h, int step)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < w) && (y < h))
    {
        int idx = y * w + x;   // row-major linear index

        // BUG FIX: `data` was accumulated without ever being initialized
        // (undefined behavior). Start the accumulator at zero.
        float data = 0.0f;

        // The product is loop-invariant, so compute it once; use a float
        // literal (0.2f) instead of the double literal 0.2.
        float v = __fmul_rn(dataIn[idx], 0.2f);
        for (int k = 0; k < step; k++)
            data += v;

        dataOut[idx] = make_uchar4(data, 44, 44, 44);
    }
}

// Host driver: allocates a sizeW x sizeH image on the device, times a
// single kernel launch with the cutil timers, and releases the buffers.
// BUG FIX: `char* argv` is not a valid signature for main; it must be
// `char **argv` (or `char *argv[]`).
int main(int argc, char **argv)
{
    float timeCuda;        // elapsed kernel time, in seconds

    int sizeW = 65535;     // image width  (pixels)
    int sizeH = 1;         // image height (pixels)
    int step  = 15000;     // per-pixel accumulation count

    float  *dataIn  = NULL;
    uchar4 *dataOut = NULL;

    // Check allocation results — an unchecked failed cudaMalloc makes the
    // kernel fault silently later.
    cutilSafeCall(cudaMalloc(&dataOut, sizeW * sizeH * sizeof(uchar4)));
    cutilSafeCall(cudaMalloc(&dataIn,  sizeW * sizeH * sizeof(float)));

    // BUG FIX: dataIn was read by the kernel but never written, so the
    // timing ran on uninitialized device memory. Zero it first.
    cutilSafeCall(cudaMemset(dataIn, 0, sizeW * sizeH * sizeof(float)));

    // One thread per pixel; ceil-divide so partial blocks cover the edges.
    dim3 nThreads(32, 16);
    int dimBlockW = (sizeW + 31) / 32;
    int dimBlockH = (sizeH + 15) / 16;
    dim3 nBlocks(dimBlockW, dimBlockH);

    unsigned int timer = 0;

    printf("\n CUDA start\n");

    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));

    kernel<<< nBlocks, nThreads >>>(dataIn, dataOut, sizeW, sizeH, step);

    // Kernel launches are asynchronous: synchronize before stopping the
    // timer so the measurement covers the whole kernel execution.
    cutilSafeCall(cudaThreadSynchronize());

    cutilCheckError(cutStopTimer(timer));

    timeCuda = cutGetTimerValue(timer) / 1000;   // ms -> s

    printf("\n CUDA end time: %f (s)\n", timeCuda);

    cutilCheckError(cutDeleteTimer(timer));

    cudaFree(dataOut);
    cudaFree(dataIn);

    return 0;
}

I think this is problem of effective bandwidth.

for example, width = 1, then according to your setting

int x = (blockIdx.x) * blockDim.x + threadIdx.x; 

int y = (blockIdx.y) * blockDim.y + threadIdx.y;

first half-warp is threadIdx.y =0, threadIdx.x = 0,1,2,3,…,15

However, only the thread with threadIdx.x = 0 is within the array bounds; that is,

first half-warp loads a cache line to L2 cache, but only one thread fetches the data,

so effective bandwidth is 1/16.

this is not correct, global memory accessing is based on half-warp, even on Fermi.

I think this is problem of effective bandwidth.

for example, width = 1, then according to your setting

int x = (blockIdx.x) * blockDim.x + threadIdx.x; 

int y = (blockIdx.y) * blockDim.y + threadIdx.y;

first half-warp is threadIdx.y =0, threadIdx.x = 0,1,2,3,…,15

However, only the thread with threadIdx.x = 0 is within the array bounds; that is,

first half-warp loads a cache line to L2 cache, but only one thread fetches the data,

so effective bandwidth is 1/16.

this is not correct, global memory accessing is based on half-warp, even on Fermi.

I have a gtx 460, and then compute with
capacity 2.1.
I read the SDK (with difficulty)
help me to clarify basic concepts.

A silly example, which only serves as an example to understand how to maximize access to global memory
(Setting the number of threads per block, grid size for access coalescing):

__global__ void kernel(uchar4 *dataOut, int width, int height)
{
    float red;

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < width) && (y < height))
    {
        ....................
        ....................
        compute the variable "red", which depends on the indices x and y
        ....................
        ....................

        dataOut[y * width + x] = make_uchar4(red, 0, 0, 255);
    }
}

int main(int argc, char **argv)
{
    int sizeW = atoi(argv[1]); <------ (any value ... so not necessarily a multiple of some fixed number)

    int sizeH = atoi(argv[2]); <------ (any value ... so not necessarily a multiple of some fixed number)

    uchar4 *dataOut;

    cudaMalloc(&dataOut, sizeW * sizeH * sizeof(uchar4));

    dim3 nThreads(?, ?);

    dim3 nBlocks(?, ?);

    kernel<<<nBlocks, nThreads>>>(dataOut, sizeW, sizeH);
    ..............

    ...............

    ..................

    cudaFree(dataOut);

}

then having a video card with compute capacity 2.1 what is the best setting (again for
coalescence)? What are the values to take the place of question marks? And you can also motivate the choice?

I have a gtx 460, and then compute with
capacity 2.1.
I read the SDK (with difficulty)
help me to clarify basic concepts.

A silly example, which only serves as an example to understand how to maximize access to global memory
(Setting the number of threads per block, grid size for access coalescing):

__global__ void kernel(uchar4 *dataOut, int width, int height)
{
    float red;

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < width) && (y < height))
    {
        ....................
        ....................
        compute the variable "red", which depends on the indices x and y
        ....................
        ....................

        dataOut[y * width + x] = make_uchar4(red, 0, 0, 255);
    }
}

int main(int argc, char **argv)
{
    int sizeW = atoi(argv[1]); <------ (any value ... so not necessarily a multiple of some fixed number)

    int sizeH = atoi(argv[2]); <------ (any value ... so not necessarily a multiple of some fixed number)

    uchar4 *dataOut;

    cudaMalloc(&dataOut, sizeW * sizeH * sizeof(uchar4));

    dim3 nThreads(?, ?);

    dim3 nBlocks(?, ?);

    kernel<<<nBlocks, nThreads>>>(dataOut, sizeW, sizeH);
    ..............

    ...............

    ..................

    cudaFree(dataOut);

}

then having a video card with compute capacity 2.1 what is the best setting (again for
coalescence)? What are the values to take the place of question marks? And you can also motivate the choice?

Suppose that you follow page 8 of the programming guide — that is, threadIdx.x sweeps the columns of the matrix, and your matrix is row-major. Then you should set the first dimension of the thread block to a multiple of 16, for example, block(16,16) or block(32,8) or ...

However, the width of the matrix should be a multiple of 16, or much bigger than 16.

Suppose that you follow page 8 of the programming guide — that is, threadIdx.x sweeps the columns of the matrix, and your matrix is row-major. Then you should set the first dimension of the thread block to a multiple of 16, for example, block(16,16) or block(32,8) or ...

However, the width of the matrix should be a multiple of 16, or much bigger than 16.