shared memory

Hi,

If we declare, for example, an array with the `__shared__` qualifier inside a `__global__` function (a kernel), is it going to be defined again for each thread, or is it defined only once? I know shared memory belongs to the whole block, but I don't know where I should declare it.

In the code below, it is declared inside a loop within the `__global__` function, and I'm wondering whether this is how it should be done.

// Tiled matrix multiply: every thread accumulates one value in a register and
// stores it through SetElement at (threadIdx.y, threadIdx.x) of the C sub-matrix
// selected by (blockIdx.y, blockIdx.x).
// NOTE(review): Matrix, GetSubMatrix, GetElement, SetElement and BLOCK_SIZE are
// defined elsewhere; presumably each sub-matrix is a BLOCK_SIZE x BLOCK_SIZE tile
// and the block is launched with BLOCK_SIZE x BLOCK_SIZE threads -- confirm.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	// Coordinates of this thread inside its block.
	int ty = threadIdx.y;
	int tx = threadIdx.x;

	// Coordinates of this block inside the grid.
	int tileRow = blockIdx.y;
	int tileCol = blockIdx.x;

	Matrix cTile = GetSubMatrix(C, tileRow, tileCol);

	// Per-thread running sum for the single output element this thread owns.
	float acc = 0.0f;

	// NOTE(review): the division means a trailing partial tile would be skipped;
	// presumably A.width is a multiple of BLOCK_SIZE -- verify against the caller.
	for (int t = 0; t < (A.width / BLOCK_SIZE); ++t) {
		// Declared inside the loop, but __shared__ storage is allocated once per
		// block: the placement only limits the names' scope. The arrays are not
		// re-created per iteration or per thread.
		__shared__ float aTile[BLOCK_SIZE][BLOCK_SIZE];
		__shared__ float bTile[BLOCK_SIZE][BLOCK_SIZE];

		Matrix aSub = GetSubMatrix(A, tileRow, t);
		Matrix bSub = GetSubMatrix(B, t, tileCol);

		// Each thread stages one element of each sub-matrix into shared memory.
		aTile[ty][tx] = GetElement(aSub, ty, tx);
		bTile[ty][tx] = GetElement(bSub, ty, tx);

		// Barrier: both tiles must be fully written before any thread reads them.
		__syncthreads();

		for (int k = 0; k < BLOCK_SIZE; ++k) {
			acc += aTile[ty][k] * bTile[k][tx];
		}

		// Barrier: nobody may overwrite the tiles in the next iteration while
		// another thread is still reading them.
		__syncthreads();
	}

	SetElement(cTile, ty, tx, acc);
}

Thanks for your help!

Hi,

If we declare, for example, an array with the `__shared__` qualifier inside a `__global__` function (a kernel), is it going to be defined again for each thread, or is it defined only once? I know shared memory belongs to the whole block, but I don't know where I should declare it.

In the code below, it is declared inside a loop within the `__global__` function, and I'm wondering whether this is how it should be done.

// Tiled matrix multiply: every thread accumulates one value in a register and
// stores it through SetElement at (threadIdx.y, threadIdx.x) of the C sub-matrix
// selected by (blockIdx.y, blockIdx.x).
// NOTE(review): Matrix, GetSubMatrix, GetElement, SetElement and BLOCK_SIZE are
// defined elsewhere; presumably each sub-matrix is a BLOCK_SIZE x BLOCK_SIZE tile
// and the block is launched with BLOCK_SIZE x BLOCK_SIZE threads -- confirm.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	// Coordinates of this thread inside its block.
	int ty = threadIdx.y;
	int tx = threadIdx.x;

	// Coordinates of this block inside the grid.
	int tileRow = blockIdx.y;
	int tileCol = blockIdx.x;

	Matrix cTile = GetSubMatrix(C, tileRow, tileCol);

	// Per-thread running sum for the single output element this thread owns.
	float acc = 0.0f;

	// NOTE(review): the division means a trailing partial tile would be skipped;
	// presumably A.width is a multiple of BLOCK_SIZE -- verify against the caller.
	for (int t = 0; t < (A.width / BLOCK_SIZE); ++t) {
		// Declared inside the loop, but __shared__ storage is allocated once per
		// block: the placement only limits the names' scope. The arrays are not
		// re-created per iteration or per thread.
		__shared__ float aTile[BLOCK_SIZE][BLOCK_SIZE];
		__shared__ float bTile[BLOCK_SIZE][BLOCK_SIZE];

		Matrix aSub = GetSubMatrix(A, tileRow, t);
		Matrix bSub = GetSubMatrix(B, t, tileCol);

		// Each thread stages one element of each sub-matrix into shared memory.
		aTile[ty][tx] = GetElement(aSub, ty, tx);
		bTile[ty][tx] = GetElement(bSub, ty, tx);

		// Barrier: both tiles must be fully written before any thread reads them.
		__syncthreads();

		for (int k = 0; k < BLOCK_SIZE; ++k) {
			acc += aTile[ty][k] * bTile[k][tx];
		}

		// Barrier: nobody may overwrite the tiles in the next iteration while
		// another thread is still reading them.
		__syncthreads();
	}

	SetElement(cTile, ty, tx, acc);
}

Thanks for your help!

As you already said, shared memory is allocated per block, i.e. all threads of a block see identical values for the shared memory (if __syncthreads() is used properly to synchronize accesses).

Often shared variables are declared at the beginning of a kernel, although your usage is fine as well. Trying to declare a shared variable outside of a kernel will just give an error.

As you already said, shared memory is allocated per block, i.e. all threads of a block see identical values for the shared memory (if __syncthreads() is used properly to synchronize accesses).

Often shared variables are declared at the beginning of a kernel, although your usage is fine as well. Trying to declare a shared variable outside of a kernel will just give an error.