I want to reduce subsets of a matrix and end up with a smaller matrix that contains the results of that operation. Basically, I want to add patches of a matrix.

I tried to write a simple kernel that would add patches of 5x5 from a 20000x5000 matrix. In this example, and for debugging, I am adding ones. Ideally I should end up with a smaller matrix of size 4000x1000 filled with 25s. The results that I get are unexpected since the values in the accumulated matrix range from 7 to 45. I am aware I have a race condition in my kernel but I thought using atomicAdd would do it. Is there something else wrong or what am I missing?

```
#include <stdio.h>
#include <cuda.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
__global__ void
accum_patch (
int sz_M,
int sz_N,
int sz_patch_M,
int sz_patch_N,
int sz_patch_total,
int n_patch_M,
int n_patch_N,
double *d_accum)
{
//kernel 2D grid with 1D blocks
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int i_thread = blockId * blockDim.x + threadIdx.x;
if (i_thread < sz_M * sz_N){
//global indeces in array
int i_N = floorf( (float) i_thread/sz_M);
int i_M = i_thread - i_N*sz_M;
//patch indeces
int i_patch_M = i_M/sz_patch_M;
int i_patch_N = i_N/sz_patch_N;
//patch linear index (row-major order)
int i_patch = i_patch_N*n_patch_M + i_patch_M;
if((i_M < sz_M) && (i_N < sz_N)){
atomicAdd(&d_accum[i_patch], 1);
}
}
}
int main (){
//define matrix size
int M = 20000;
int N = 5000;
//define patch size
int M_patch = 5;
int N_patch = 5;
int threads_per_block = 512;
//compute number of patches that fit in matrix
int n_patch_M = floor( (float) M/M_patch);
int n_patch_N = ceil( (float) N/N_patch);
int n_patch = n_patch_N * n_patch_M;
//allocate size for accumulated array
double *d_accum, *h_accum;
h_accum = (double *) malloc (sizeof(double) * n_patch_M * n_patch_N);
//compute kernel launch parameters
//type: 2D grid with 1D blocks
float grid_total = (float) M*N/threads_per_block;
int nblocks = (int) ceil(pow(grid_total, 0.5)) + 1;
dim3 block(threads_per_block);
dim3 grid(nblocks, nblocks, 1);
//device memory allocations
checkCudaErrors(cudaMalloc((void**)&d_accum,
n_patch*sizeof(double)));
checkCudaErrors(cudaMemset(d_accum, 0, n_patch*sizeof(double)));
cudaDeviceSynchronize();
//accumulate
accum_patch <<<grid, block>>> (M, N, M_patch, N_patch, M_patch*N_patch,
n_patch_M, n_patch_N, d_accum);
cudaDeviceSynchronize();
//copy results back to host
checkCudaErrors(cudaMemcpy(h_accum, d_accum,
n_patch * sizeof(double),
cudaMemcpyDeviceToHost));
//device memory clean up
checkCudaErrors(cudaFree(d_accum));
for (int i = 0; i<n_patch; i++){
if(h_accum[i]>(N_patch*M_patch + 1)){
printf("accum[%d]: %10.10f.\n", i, h_accum[i]);
}
}
return 0;
}
```

I compile and run it by doing:

nvcc -o accum_subsets accum_subsets.cu -I /usr/local/cuda/samples/common/inc -arch=sm_60

./accum_subsets