Hi,
For some reason I am getting zero “distance” values when running this code in my jetson Xavier AGX ([16GB version] - Jetpack 4.4 [L4T 32.4.3]). Can someone help me figure out why I am having problems with this kernel? It would seem to me that I am under the correct threadPerBlock and number of block requirement for both dimensions. I compile this code with a simple
nvcc -o test test.cu
Thanks!
Here is the code:
#include <iostream>
#include <math.h>
int iDivUp(int a, int b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void assemble_final_result(float* distances,
const int NX, const int NY) {
int i = threadIdx.x + blockIdx.x * gridDim.x;
int j = threadIdx.y + blockIdx.y * gridDim.y;
if ((i < NX) && (j < NY)) distances[i + j*NX] = i + j*NX;
__syncthreads();
}
int main()
{
int vec1Size = 16384;
int vec2Size = 680;
int iterSize = 32;
float* distances = NULL;
gpuErrchk( cudaMallocManaged(&distances, sizeof(float)*vec1Size*vec2Size*iterSize) );
int numX = 32;
int numY = 32;
dim3 threadsPerBlockFinal(numX, numY);
dim3 numBlocksFinal = dim3(iDivUp(vec1Size, numX), iDivUp(vec2Size*iterSize, numY));
assemble_final_result<<<numBlocksFinal, threadsPerBlockFinal>>>(distances, vec1Size, iterSize*vec2Size);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
for(auto i = 0; i < 10000; ++i)
{
std::cout << "distances[" << i << "]: " << distances[i] << "\n";
}
cudaFree(distances);
return 0;
}