If we declare, for example, an array with the `__shared__` qualifier inside a `__global__` function, is it going to be defined again for each thread, or is it defined only once? I know shared memory is for the whole block, but I don’t know where I should define it.
In the code below, it’s declared inside a loop within the `__global__` function, and I’m wondering whether this is how it should be done.
// Tiled matrix product C = A * B. Every thread accumulates one value and
// stores it into the sub-matrix of C selected by its block indices.
// NOTE(review): assumes the kernel is launched with BLOCK_SIZE x BLOCK_SIZE
// threads per block and that A.width is a multiple of BLOCK_SIZE — confirm
// at the call site.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    // Position of this thread within its block.
    int ty = threadIdx.y;
    int tx = threadIdx.x;

    // Sub-matrix of C addressed by this block's indices.
    int tileY = blockIdx.y;
    int tileX = blockIdx.x;
    Matrix cTile = GetSubMatrix(C, tileY, tileX);

    // Running sum for the single output element owned by this thread.
    float acc = 0;

    // Step across the sub-matrices of A and B that feed this output tile.
    for (int t = 0; t < (A.width / BLOCK_SIZE); ++t) {
        Matrix aTile = GetSubMatrix(A, tileY, t);
        Matrix bTile = GetSubMatrix(B, t, tileX);

        // __shared__ storage exists once per thread block and is visible to
        // all of its threads; declaring it inside the loop body does not
        // create a new copy per thread or per iteration.
        __shared__ float aShared[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float bShared[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread stages one element of each sub-matrix.
        aShared[ty][tx] = GetElement(aTile, ty, tx);
        bShared[ty][tx] = GetElement(bTile, ty, tx);

        // Barrier: both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; ++k) {
            acc += aShared[ty][k] * bShared[k][tx];
        }

        // Barrier: all reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    SetElement(cTile, ty, tx, acc);
}
If we declare, for example, an array with the `__shared__` qualifier inside a `__global__` function, is it going to be defined again for each thread, or is it defined only once? I know shared memory is for the whole block, but I don’t know where I should define it.
In the code below, it’s declared inside a loop within the `__global__` function, and I’m wondering whether this is how it should be done.
// Tiled matrix product C = A * B. Every thread accumulates one value and
// stores it into the sub-matrix of C selected by its block indices.
// NOTE(review): assumes the kernel is launched with BLOCK_SIZE x BLOCK_SIZE
// threads per block and that A.width is a multiple of BLOCK_SIZE — confirm
// at the call site.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    // Position of this thread within its block.
    int ty = threadIdx.y;
    int tx = threadIdx.x;

    // Sub-matrix of C addressed by this block's indices.
    int tileY = blockIdx.y;
    int tileX = blockIdx.x;
    Matrix cTile = GetSubMatrix(C, tileY, tileX);

    // Running sum for the single output element owned by this thread.
    float acc = 0;

    // Step across the sub-matrices of A and B that feed this output tile.
    for (int t = 0; t < (A.width / BLOCK_SIZE); ++t) {
        Matrix aTile = GetSubMatrix(A, tileY, t);
        Matrix bTile = GetSubMatrix(B, t, tileX);

        // __shared__ storage exists once per thread block and is visible to
        // all of its threads; declaring it inside the loop body does not
        // create a new copy per thread or per iteration.
        __shared__ float aShared[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float bShared[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread stages one element of each sub-matrix.
        aShared[ty][tx] = GetElement(aTile, ty, tx);
        bShared[ty][tx] = GetElement(bTile, ty, tx);

        // Barrier: both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; ++k) {
            acc += aShared[ty][k] * bShared[k][tx];
        }

        // Barrier: all reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    SetElement(cTile, ty, tx, acc);
}
As you already said, shared memory is allocated per block, i.e. all threads of a block see identical values for the shared memory (if __syncthreads() is used properly to synchronize accesses).
Often shared variables are declared at the beginning of a kernel, although your usage is fine as well. Trying to declare a shared variable outside of a kernel will just give an error.
As you already said, shared memory is allocated per block, i.e. all threads of a block see identical values for the shared memory (if __syncthreads() is used properly to synchronize accesses).
Often shared variables are declared at the beginning of a kernel, although your usage is fine as well. Trying to declare a shared variable outside of a kernel will just give an error.