Hi,
I’m testing long long numbers before adding more code, the kernel should print the last tid based on the grid and blocks dimension, but I can not get the right results. Dim3 grid(2147483647,1,1) and dim3 blocks(1024,1,1), so I should get the last tid, based on 1D grid and 1D blocks, as (2147483647 * 1024) - 1, but it does not go further then 4294967295 (tid > 4294967294) (found out by trial and errors). What am I doing wrong?
// System includes
#include <stdio.h>
#include <stdlib.h> // For exit()
#include <string.h>
#include <math.h>
#include <ctype.h>
//#include <inttypes.h>
//#include <stdint.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
// Kernel definition
// Run on GPU
__global__ void add() {
// 1D grid of 1D blocks
unsigned long long tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid == 2199023254527)
printf("%lld \n",tid);
}
int main(void) {
dim3 grid(2147483647, 1, 1);
dim3 blocks(1024, 1, 1);
// Launch add() kernel on GPU
add<<<grid,blocks>>>();
getLastCudaError("Kernel execution failed");
checkCudaErrors( cudaPeekAtLastError() );
checkCudaErrors( cudaDeviceSynchronize() );
return 0;
}