Getting zero values from large 2D kernels

Hi,

For some reason I am getting zero “distance” values when running this code on my Jetson AGX Xavier (16 GB version, JetPack 4.4 [L4T 32.4.3]). Can someone help me figure out what is wrong with this kernel? As far as I can tell, I am within the threads-per-block and number-of-blocks limits in both dimensions. I compile this code with a simple

nvcc -o test test.cu

Thanks!

Here is the code:

#include <iostream>
#include <math.h>


// Ceiling division: returns the smallest integer that is >= a / b.
//
// Typical use is sizing a launch grid: iDivUp(numElements, threadsPerBlock).
// Precondition: b != 0 (integer division by zero is undefined behaviour).
int iDivUp(int a, int b)
{
    int quotient  = a / b;   // C++ integer division truncates toward zero
    int remainder = a % b;   // carries the sign of a (or is zero)

    // Truncation already rounds *up* when the exact quotient is negative, so
    // only bump the result when the division is inexact and the exact
    // quotient is positive, i.e. the remainder and the divisor share a sign.
    if (remainder != 0 && ((remainder < 0) == (b < 0))) ++quotient;

    return quotient;
}


// Wraps a CUDA runtime call so that a failure is reported together with the
// source file and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a CUDA error code on stderr.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default) the process exits with `code` as its
//           exit status after printing; when false execution continues
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // Nothing to report on success.
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (abort)
        exit(code);
}

// Fills a row-major NX x NY float array so that every element holds its own
// flat index (i + j*NX).
//
// Launch layout: 2D grid of 2D blocks. The x dimension maps to the index
// i in [0, NX) and the y dimension to j in [0, NY). The grid may overshoot
// the array in either dimension; out-of-range threads do nothing.
//
//   distances - output buffer with room for at least NX * NY floats
//   NX        - extent of the fastest-varying (x) dimension
//   NY        - extent of the slowest-varying (y) dimension
//
// Note: the stored value is converted to float, so indices above 2^24 are
// rounded to the nearest representable float.
__global__ void assemble_final_result(float* distances,
                                      const int NX, const int NY) {

    // Global thread coordinates. The per-block offset must be scaled by
    // blockDim (threads per block), not gridDim (blocks per grid); using
    // gridDim only works when both happen to be equal and otherwise leaves
    // most of the array unwritten.
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;

    if ((i < NX) && (j < NY)) {
        // Compute the flat index in size_t so that j * NX cannot overflow int.
        const size_t idx = static_cast<size_t>(j) * NX + i;
        distances[idx] = static_cast<float>(idx);
    }
    // No __syncthreads() needed: threads do not exchange data.
}


// Allocates a managed vec1Size x (vec2Size * iterSize) float array, launches
// assemble_final_result over it with 32x32 thread blocks, and prints the first
// 10000 elements. Every CUDA runtime call, including the final free, is
// checked through gpuErrchk.
int main()
{
    const int vec1Size = 16384;
    const int vec2Size = 680;
    const int iterSize = 32;

    // Logical 2D extent of the output array.
    const int NX = vec1Size;
    const int NY = vec2Size * iterSize;

    // sizeof(float) comes first so the whole product is evaluated in size_t.
    const size_t numElements = static_cast<size_t>(NX) * NY;

    float* distances = NULL;
    gpuErrchk( cudaMallocManaged(&distances, sizeof(float) * numElements) );

    // 32 x 32 = 1024 threads per block; the grid is rounded up so that it
    // covers the whole array.
    const int numX = 32;
    const int numY = 32;
    dim3 threadsPerBlockFinal(numX, numY);
    dim3 numBlocksFinal = dim3(iDivUp(NX, numX), iDivUp(NY, numY));

    assemble_final_result<<<numBlocksFinal, threadsPerBlockFinal>>>(distances, NX, NY);
    gpuErrchk( cudaPeekAtLastError() );     // launch-configuration errors
    gpuErrchk( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    // The synchronize above makes the managed buffer safe to read on the host.
    // Never print past the end of the allocation.
    const size_t numToPrint = (numElements < 10000) ? numElements : 10000;
    for (size_t i = 0; i < numToPrint; ++i)
    {
        std::cout << "distances[" << i << "]: " << distances[i] << "\n";
    }

    gpuErrchk( cudaFree(distances) );

    return 0;
}

Hi,

The global thread index should be calculated like this:
(blockDim is the number of threads per block in each dimension, whereas gridDim is the number of blocks in the grid)

int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;

With this change, every element of the array is assigned correctly.

Thanks.