simple thraed calculation going wrong

Hi, This is my kernel

global void initialize(int *Smatrix,int length)

{

int threadx = blockDim.x * blocIdx.x + threadIdx.x;

    int thready = blockDim.y * blockIdx.y + threadIdxy;

    int idx = threadx + thradY * length

if (idx < length)

           dev_matrix[idx] = threadx * -2;

    else if((idx%length)==0)

           dev_matrix[idx] = thready * -2;

    else

           dev_matrix[idx] = 0;

}

Here is my kernel call:

initialize<<<blocks,threads>>>(Scorematrixdevice,length);

I have done cuda malloc and stuff. The kernel should make the left and and top border of the matrix(it is 1Dimensional but logically I am using the length as the column size of the matrix) in multiple of negative odd numbers.

Ex-> if the length= 4 and size of Smatrix =20 then the output should be,

0 -2 -4 -6

-2 0 0 0

-4 0 0 0

-6 0 0 0

-8 0 0 0

And it is giving the same output which is expected for this input but when the

length is 7 and the size of Smatrix is 119 then the out put is wired like

some times it gives this kind of output

0 -2 -4 -6 -8 -10 -12

-2 0 0 0 0 0 0

-4 0 0 0 0 0 0

-6 0 0 0 0 0 0

-8 0 0 0 0 0 0

-10 0 0 0 0 0 0

-10 0 0 0 0 0 0

-10 0 0 0 0 0 0

-16 0 0 0 0 0 0

-18 0 0 0 0 0 0

-20 0 0 0 0 0 0

-22 0 0 0 0 0 0

-22 0 0 0 0 0 0

-22 0 0 0 0 0 0

-28 0 0 0 0 0 0

-30 0 0 0 0 0 0

-30 0 0 0 0 0 0

what can be the problem in this?

try following code (natural index on 2-D data with boundary condition)

kernel function

[codebox]

// dev_matrix(1:n1, 1:n2) is a 2-D matrix

static global void initialize_v2(int *dev_matrix, unsigned int n1, unsigned int n2 )

{

unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;

if ( (idx < n2 ) && ( idy < n1) ){

// (idy, idx) = (i-1,j-1)

// row-major(i, j) = (j-1)*n2 + (i-1) = idy * n2 + idx

	unsigned int  index_in = idy * n2 + idx ;	

	if ( 0 == idy ){ // first row

		dev_matrix[ index_in ] = idx * -2;

	}else if ( 0 == idx ){ // first column

		dev_matrix[ index_in ] = idy * -2;

	}else{

		dev_matrix[ index_in ] = 0 ;

	}

}// legal index

}

[/codebox]

C-wrapper

[codebox]define BLOCK_DIM 16

void initialize_v2_device(int *Smatrix, unsigned int n1, unsigned int n2 ) ;

static global void initialize_v2(int *Smatrix, unsigned int n1, unsigned int n2 ) ;

void initialize_v2_device(int *Smatrix, unsigned int n1, unsigned int n2 )

{

dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

dim3 grid( (n2 + BLOCK_DIM-1)/BLOCK_DIM , (n1 + BLOCK_DIM-1)/BLOCK_DIM, 1 );

initialize_v2<<< grid, threads >>>( Smatrix, n1, n2 ) ;

}[/codebox]

driver

[codebox]void test( void )

{

unsigned int n1 = 17 ;

unsigned int n2	= 7 ;

size_t  size = n1*n2 ;

int *h_odata ;

int *d_idata ;

// allocate host memory

h_odata = (int*) malloc( sizeof(int)*size );

assert( h_odata ) ;

// allocate device memory

cutilSafeCall( cudaMalloc( (void**) &d_idata, sizeof(int)*size) );

initialize_v2_device( d_idata , n1, n2 ) ;

cutilSafeCall( cudaMemcpy( h_odata, d_idata, sizeof(int)*size, cudaMemcpyDeviceToHost) );

int i, j, index_in ;

for ( i = 0 ; i < n1 ; i++){

	for ( j = 0 ; j < n2 ; j++){

		index_in = i*n2 + j ;

		printf("%d  ", h_odata[index_in] ) ;

	}// for j

	printf("\n");

}// for i

}[/codebox]

Amazing man… I really appreciate that you responded so promptly…I will surely go forwarded with the implementation you suggested… Thank you…
And ya I guess the problem which I am having is mainly because I am not allocating the grid and block dimensions properly.

Thank you again!

cheers!

hey when i implemented it… it is giving me wrong output… it gives,

0 -1 -2 -3 -4 -5 -6
-1 0 0 0 0 0 0
-2 0 0 0 0 0 0
-3 0 0 0 0 0 0
-4 0 0 0 0 0 0
-5 0 0 0 0 0 0
-6 0 0 0 0 0 0
-7 0 0 0 0 0 0
-8 0 0 0 0 0 0
-9 0 0 0 0 0 0
-10 0 0 0 0 0 0
-11 0 0 0 0 0 0
-12 0 0 0 0 0 0
-13 0 0 0 0 0 0
-14 0 0 0 0 0 0
-15 0 0 0 0 0 0
-16 0 0 0 0 0 0

and also any guess why the one which I typed is going wrong ?

Ah…
At my University I use 2 different machines one having a gtx 8800 and other gtx 260.
It works fine on 260… but gives the above o/p on gtx8800… Don’t know why it happens.
But as my submission are closer I guess I will switch over to 260 instead of 8800… but I am curious why is such discrepancy between the two…

Cheers!