// Kernel that executes on the CUDA device
// Intent: result[i][j] = A[i][j] + B[j] for an H x W row-major matrix.
// NOTE(review): this version is buggy in two ways — see comments below.
__global__ void compute(float *result, int H, int W, float* A,float* B)
{
// Global 2D thread coordinates: idx = column, idy = row.
int idx=threadIdx.x + blockIdx.x* blockDim.x;
int idy=threadIdx.y + blockIdx.y* blockDim.y;
// BUG: the row pitch here is the grid's total width (gridDim.x*blockDim.x),
// not the matrix width W. Whenever the grid is wider than the matrix
// (the usual case when W is not a multiple of blockDim.x), this maps
// (idx, idy) to the wrong linear element.
int index =idy*(gridDim.x*blockDim.x)+idx;
// BUG: hard-codes B's length (24) and wraps the *flattened* index, so the
// B element picked depends on the launch configuration, not on the column.
// The intended value is simply the column index (idx), with B sized >= W.
int indexB=index%24;
// Bounds guard is correct; the stores above it are what go wrong.
if ( idx < W && idy < H ) result[index]= A[index] + B[indexB];
}
With that configuration it worked well, but the % operation is time-expensive! So I decided to change the threads-per-block to dim3 threadPerBlock(24,8) in order to use threadIdx.x as the index into the B array.
However, whether I keep my program as it is or change it according to your suggestion, it does not compute the correct sum: it adds A[i][j] not to B[j] but to B[j-1], B[j-8], or some other element!
The way I'm doing it is probably not "clean", but I don't see another way of doing it…
Don't use gridDim.x*blockDim.x to index a matrix element; you should use the
actual dimensions of the matrix, H and W.
// Kernel that executes on the CUDA device.
// Computes result[row][col] = A[row][col] + B[col] for an H x W row-major
// matrix. Expects a 2D launch where x spans columns and y spans rows; the
// grid may be larger than the matrix — out-of-range threads exit early.
// B must hold at least W elements.
__global__ void compute(float *result, int H, int W, float* A,float* B)
{
// Global 2D coordinates of this thread.
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

// Guard clause: threads outside the matrix do nothing.
if (col >= W || row >= H) return;

// Row-major linear offset of element (row, col) — pitch is the matrix
// width W, never the grid width.
int offset = row * W + col;
result[offset] = A[offset] + B[col]; // result[row][col] = A[row][col] + B[col]
}