__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aShared[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
int myRow = blockIdx.x * blockDim.x + threadIdx.x;
int myRowInBlock = threadIdx.x, myColInBlock = threadIdx.y;
int rowSum = 0;
for (int m = 0; m < (matSize + BLOCK_SIZE - 1) / BLOCK_SIZE; m++) {
aShared[myRowInBlock][myColInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myColInBlock,matSize);
if (myColInBlock==0) {bShared[myRowInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myRowInBlock,matSize);}
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myColInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
// rowSum += getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+k,matSize) * getValFromVector(bOnGPU,m*BLOCK_SIZE+k,matSize);
rowSum += aShared[myRowInBlock][k] * bShared[k];
}
}
}
if (myColInBlock==0 && myRow<matSize) {cOnGPU[myRow] = rowSum;}
}
The above kernel gives incorrent results for matrix-vector multiplication. In debugging, I can see that some warps in the block are executing when m=0 and others when m=1. If i decomment line 17 and comment line 18, the results are correct again. Is there something wrong with how I’m using _syncthreads()?