Execution Configuration (4, 4) vs. (1, 16)

Hi,

I have a kernel that multiplies two 4 x 4 matrices. The result is correct when I choose the execution configuration 1, 16, i.e. 1 block and 16 threads per block. However, when I change the execution configuration to 4, 4, it prints wrong results. I would appreciate your help here.

Thanks,

The kernel and the relevant part of main() are given here:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void Shar(float *a, float *b, float *c, int n)
{
    __shared__ float aTile[4][4], bTile[4][4];
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    aTile[threadIdx.y][threadIdx.x] = a[row * n + threadIdx.x];
    bTile[threadIdx.y][threadIdx.x] = b[threadIdx.y * n + col];
    __syncthreads();
    for (int i = 0; i < n; i++)
    {
        sum += aTile[threadIdx.y][i] * bTile[i][threadIdx.x];
        //printf("\n%f", sum);
    }
    c[row * n + col] = sum;
}

int main()
{
    float *a_h, *b_h, *c_h, *a_d, *b_d, *c_d;
    int i, n;
    n = 4;
    size_t size = sizeof(float) * (n * n);
    a_h = (float*)malloc(size);
    b_h = (float*)malloc(size);
    c_h = (float*)malloc(size);
    cudaMalloc((void**)&a_d, size);
    cudaMalloc((void**)&b_d, size);
    cudaMalloc((void**)&c_d, size);
    for (i = 0; i < (n * n); i++)
    {
        a_h[i] = 2;
    }
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    for (i = 0; i < (n * n); i++)
    {
        b_h[i] = 2;
    }
    cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice);
    Shar<<<1, 16>>>(a_d, b_d, c_d, n);
    cudaMemcpy(c_h, c_d, size, cudaMemcpyDeviceToHost);
    printf("\nMultiplication Of Matrix");
    for (i = 0; i < (n * n); i++)
    {
        printf("\n%f", c_h[i]);
    }
    free(a_h);
    free(b_h);
    free(c_h);
    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
    return 0;
}
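For reference, my understanding is that the two configurations compare like this (plain integers in <<< >>> give a one-dimensional grid and one-dimensional blocks):

Shar<<<1, 16>>>(a_d, b_d, c_d, n);   // gridDim = (1,1,1), blockDim = (16,1,1)
Shar<<<4, 4>>>(a_d, b_d, c_d, n);    // gridDim = (4,1,1), blockDim = (4,1,1)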

As you say, the execution configuration is 1, 16, i.e. 1 block and 16 threads per block:

Shar<<<1, 16>>>(a_d, b_d, c_d, n);

With this launch, gridDim is (1, 1, 1) and blockDim is (16, 1, 1), so blockIdx.x, blockIdx.y, and threadIdx.y are all 0 for every thread. In your kernel that means

row = blockIdx.y * blockDim.y + threadIdx.y = 0
col = blockIdx.x * blockDim.x + threadIdx.x = threadIdx.x

so your kernel is equivalent to the following code:

__global__ void Shar(float *a, float *b, float *c, int n)
{
    __shared__ float aTile[4][4], bTile[4][4];
    float sum = 0.0f;
    aTile[0][threadIdx.x] = a[threadIdx.x];
    bTile[0][threadIdx.x] = b[threadIdx.x];
    __syncthreads();
    for (int i = 0; i < n; i++)
    {
        sum += aTile[0][i] * bTile[i][threadIdx.x];
        //printf("\n%f", sum);
    }
    c[threadIdx.x] = sum;
}
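Note that your tile indexing assumes a two-dimensional thread block. If the "4, 4" configuration was meant to be one 4 x 4 block of threads, the launch needs dim3; a sketch using your existing pointers:

dim3 block(4, 4);                       // blockDim = (4, 4, 1), so threadIdx.y runs 0..3
Shar<<<1, block>>>(a_d, b_d, c_d, n);   // one block covering the whole 4 x 4 matrix

With this configuration, row = threadIdx.y and col = threadIdx.x, which matches how aTile and bTile are indexed. Shar<<<4, 4>>> instead launches 4 one-dimensional blocks of 4 threads each, so threadIdx.y is again always 0 and the tiles are loaded incorrectly.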

I think that even with “1 block and 16 threads per block” your kernel does not really compute the right thing: threadIdx.x runs from 0 to 15, so aTile[0][threadIdx.x] and bTile[i][threadIdx.x] index past the 4-element rows of the 4 x 4 shared arrays, and every thread sums against row 0 of a. The output only looks correct because every input element is 2.
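One way to check this: with every input equal to 2, almost any wrong indexing still prints 16 everywhere. A minimal host-side check, reusing the arrays and names from your main() and filling the inputs with distinct values (the reference computation is a plain CPU triple loop):

for (i = 0; i < n * n; i++) { a_h[i] = (float)i; b_h[i] = (float)(i + 1); }
/* ... copy to device, launch the kernel, copy c_h back as before ... */
for (int r = 0; r < n; r++)
    for (int k = 0; k < n; k++) {
        float ref = 0.0f;
        for (int j = 0; j < n; j++)
            ref += a_h[r * n + j] * b_h[j * n + k];   /* c[r][k] = sum_j a[r][j] * b[j][k] */
        if (c_h[r * n + k] != ref)
            printf("\nmismatch at (%d,%d): got %f, expected %f", r, k, c_h[r * n + k], ref);
    }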

The following kernel comes from the SDK matrixMul example; you can read through it:

// Assumed supporting definitions (not shown in the original post; the SDK
// defines these elsewhere in the example source):
#define BLOCK_SIZE 16           // tile width; the SDK default
typedef double doublereal;      // f2c-style alias for double
#define AS(i, j) As[i][j]       // shared-tile accessors used below
#define BS(i, j) Bs[i][j]

__global__ void
matrixMul(doublereal* C, doublereal* A, doublereal* B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    doublereal Csub = 0.0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep) {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ doublereal As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ doublereal Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += AS(ty, k) * BS(k, tx);
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
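For completeness, a typical launch for this kernel looks like the following (this mirrors the SDK sample; hA, the height of A, does not appear in the kernel itself, and BLOCK_SIZE must divide both matrix dimensions):

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);          // one thread per element of a C tile
dim3 grid(wB / BLOCK_SIZE, hA / BLOCK_SIZE);   // one block per BLOCK_SIZE x BLOCK_SIZE tile of C
matrixMul<<<grid, threads>>>(C_d, A_d, B_d, wA, wB);   // C_d, A_d, B_d are the device buffers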