Why are 1D and 2D call configs not giving the same results? Check this code...

Hello,

Paste the code below into the template project in the SDK samples and run it.

Basically, the code checks whether the 1D and 2D call configs produce the same results.

TestKernel1 uses a 1D call config and TestKernel2 uses a 2D call config.

But the results of the two kernels are NOT the same.

[codebox]
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil_inline.h>

// includes, kernels
#include <template_kernel.cu>

__global__ void TestKernel1( int* array1, int limit )
{
    int idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;

    if ( idx < limit )
        array1[idx] = idx;
}

__global__ void TestKernel2( int* array2, int limit )
{
    int idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
    int idy = __umul24(blockIdx.y,blockDim.y) + threadIdx.y;
    int index = __umul24(idx,idy);

    if ( index < limit )
        array2[index] = index;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
    int* dArray1 = NULL;
    int* dArray2 = NULL;
    int width  = 300;
    int height = 200;

    cudaMalloc( (void**)&dArray1, sizeof(int)*width*height );
    cudaMalloc( (void**)&dArray2, sizeof(int)*width*height );
    cudaMemset( dArray1, 0, sizeof(int)*width*height );
    cudaMemset( dArray2, 0, sizeof(int)*width*height );

    dim3 grid1( ((width*height)+255)/256, 1, 1 );
    dim3 block1( 256, 1, 1 );
    TestKernel1<<<grid1,block1>>>( dArray1, width*height );

    dim3 grid2( (width+15)/16, (height+15)/16, 1 );
    dim3 block2( 16, 16, 1 );
    TestKernel2<<<grid2,block2>>>( dArray2, width*height );

    int* hArray1 = NULL;
    int* hArray2 = NULL;
    hArray1 = (int*)malloc( sizeof(int)*width*height );
    hArray2 = (int*)malloc( sizeof(int)*width*height );

    cudaMemcpy( hArray1, dArray1, sizeof(int)*width*height, cudaMemcpyDeviceToHost );
    cudaMemcpy( hArray2, dArray2, sizeof(int)*width*height, cudaMemcpyDeviceToHost );

    int isSame = true;
    for( int y = 0; y < height; ++y )
    {
        for( int x = 0; x < width; ++x )
        {
            int index = y*width+x;
            if( hArray1[index] != hArray2[index] )
                isSame = false;
        }
    }

    if ( isSame == true )
        printf("\nBoth Kernels are same\n");
    else
        printf("\nBoth Kernels are NOT same\n");

    free(hArray1);
    free(hArray2);
    cudaFree(dArray1);
    cudaFree(dArray2);

    cutilExit(argc, argv);
}
[/codebox]

Makes sense, looks like your indexing is wrong in the 2D case. Here’s my edited code which runs correctly.

[codebox]
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil_inline.h>

__global__ void TestKernel1( int* array1, int limit )
{
    int idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;

    if ( idx < limit )
        array1[idx] = idx;
}

__global__ void TestKernel2( int* array2, int w, int h )
{
    int idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
    int idy = __umul24(blockIdx.y,blockDim.y) + threadIdx.y;
    int index = __umul24(w,idy)+idx;

    if ( (idx < w) & (idy < h) )
        array2[index] = index;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
    int* dArray1 = NULL;
    int* dArray2 = NULL;
    int width  = 300;
    int height = 200;

    cudaMalloc( (void**)&dArray1, sizeof(int)*width*height );
    cudaMalloc( (void**)&dArray2, sizeof(int)*width*height );
    cudaMemset( dArray1, 0, sizeof(int)*width*height );
    cudaMemset( dArray2, 0, sizeof(int)*width*height );

    dim3 grid1( ((width*height)+255)/256, 1, 1 );
    dim3 block1( 256, 1, 1 );
    TestKernel1<<<grid1,block1>>>( dArray1, width*height );

    dim3 grid2( (width+15)/16, (height+15)/16, 1 );
    dim3 block2( 16, 16, 1 );
    TestKernel2<<<grid2,block2>>>( dArray2, width, height );

    int* hArray1 = NULL;
    int* hArray2 = NULL;
    hArray1 = (int*)malloc( sizeof(int)*width*height );
    hArray2 = (int*)malloc( sizeof(int)*width*height );

    cudaMemcpy( hArray1, dArray1, sizeof(int)*width*height, cudaMemcpyDeviceToHost );
    cudaMemcpy( hArray2, dArray2, sizeof(int)*width*height, cudaMemcpyDeviceToHost );

    int isSame = true;
    for( int y = 0; y < height; ++y )
    {
        for( int x = 0; x < width; ++x )
        {
            int index = y*width+x;
            if( hArray1[index] != hArray2[index] )
                isSame = false;
        }
    }

    if ( isSame == true )
        printf("\nBoth Kernels are same\n");
    else
        printf("\nBoth Kernels are NOT same\n");

    free(hArray1);
    free(hArray2);
    cudaFree(dArray1);
    cudaFree(dArray2);

    cutilExit(argc, argv);
}
[/codebox]

N.

@Nico,

Thanks Nico, yes, the indexing in the 2D call config is wrong.

We can use the code below as well…

[codebox]
__global__ void TestKernel2( int* array2, int limit, int width )
{
    int idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
    int idy = __umul24(blockIdx.y,blockDim.y) + threadIdx.y;
    int index = __umul24(idy,width)+idx;

    if ( index < limit )
        array2[index] = index;
}
[/codebox]

You could use that code, but then you'd be writing some values twice unless your width is a multiple of 16.
You've got a grid of (19,13) blocks with (16,16) threads each, so for example:

Thread (12,0) of block (18,0) will write to:

idx = (18*16)+12 = 300
idy = 0
index = (0*300)+300 = 300

And thread (0,1) of block (0,0) will also write to:

idx = (0*16)+0 = 0
idy = 1
index = (1*300)+0 = 300
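
If it helps, here is a small host-side sketch (just an illustration, not code from the program above) that replays the same grid/block configuration on the CPU and counts how many threads end up computing the same index when the only guard is index < limit. With width = 300, which isn't a multiple of 16, it reports a non-zero number of duplicated indices:

[codebox]
// Illustrative host-side check: replay the 2D launch configuration on the
// CPU and count how many threads compute the same "index = idy*width + idx"
// when the only guard is "index < limit".
#include <stdio.h>
#include <stdlib.h>

int main( void )
{
    int width  = 300;
    int height = 200;
    int limit  = width*height;

    int gridX  = (width+15)/16,  gridY  = (height+15)/16;  // (19,13) blocks
    int blockX = 16,             blockY = 16;              // (16,16) threads

    int* hits = (int*)calloc( limit, sizeof(int) );

    for( int by = 0; by < gridY; ++by )
      for( int bx = 0; bx < gridX; ++bx )
        for( int ty = 0; ty < blockY; ++ty )
          for( int tx = 0; tx < blockX; ++tx )
          {
              int idx   = bx*blockX + tx;
              int idy   = by*blockY + ty;
              int index = idy*width + idx;
              if ( index < limit )          // same guard as the posted kernel
                  hits[index]++;
          }

    int duplicates = 0;
    for( int i = 0; i < limit; ++i )
        if ( hits[i] > 1 )
            duplicates++;

    printf("indices written more than once: %d\n", duplicates);
    free(hits);
    return 0;
}
[/codebox]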

N.

Yes, you are correct. Writing twice into the same memory location is bad.

But when I run your code, the timings are…

TestKernel1 exe time: 0.038467

TestKernel2 exe time: 0.032263

Both Kernels are same

Press ENTER to exit…

When I run the second version of my TestKernel2(), the timings are…

TestKernel1 exe time: 0.039194

TestKernel2 exe time: 0.027828

Both Kernels are same

Press ENTER to exit…
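
(The timer code isn't shown in the listings above. For anyone who wants to reproduce the numbers, a minimal sketch using CUDA events around one of the launches in main() would look something like this; the exact timing method used for the figures above may differ.)

[codebox]
// Illustrative timing sketch using CUDA events; drop it around a kernel
// launch in the main() posted above. Elapsed time is reported in ms.
cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
TestKernel1<<<grid1,block1>>>( dArray1, width*height );
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);              // wait for the kernel to finish
cudaEventElapsedTime(&ms, start, stop);  // milliseconds between the events
printf("TestKernel1 exe time: %f\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);
[/codebox]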

True, it'll be a bit slower because of the extra bounds test that needs to be performed. All I'm saying is that you'll run into trouble when you write code that generates values based on the contents of threadIdx and/or blockIdx. In that case, two different values get written to the same location, and the result will depend on which thread accesses it last.
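
Something like this, just as an illustration (not code from this thread): the two colliding threads worked out above would then store different values into array2[300], and whichever write lands last wins.

[codebox]
// Illustrative sketch only: with the "index < limit" guard, two threads
// from different blocks can compute the same index (e.g. index 300 above).
// Because the stored value depends on blockIdx, the final contents of
// array2[300] depend on which thread happens to write last.
__global__ void TestKernel2_racy( int* array2, int limit, int width )
{
    int idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
    int idy = __umul24(blockIdx.y,blockDim.y) + threadIdx.y;
    int index = __umul24(idy,width)+idx;

    if ( index < limit )
        array2[index] = blockIdx.x;   // block (18,0) writes 18, block (0,0) writes 0
}
[/codebox]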

N.

Yes, you are 100% right. Thanks.