Hi,
I have a question about the default stream. I need to execute a kernel like this:
template <typename T>
__global__ void myKernel(T *myData, int ix, int iy, int iz)
{
    // Per-dimension global thread coordinates for a 3D launch.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int z = blockIdx.z * blockDim.z + threadIdx.z;

    // Bounds guard: the grid may over-cover the data extents.
    if (x < ix && y < iy && z < iz)
    {
        // Flattened index; x varies slowest, z fastest.
        const int idx = ((x * iy) + y) * iz + z;
        myData[idx] += MY_CONSTANT;
    }
}
From the host I execute the kernel:
float *data;
// Allocate M bytes of managed (unified) memory, accessible from host and device.
cudaMallocManaged(&data, M);
for (int i = 0; i < N; i++)
{
    // Fix: a kernel launch needs TRIPLE angle brackets, <<<grid, block>>>.
    // All N launches target the default stream, so they execute in issue
    // order, one after another — not concurrently.
    myKernel<<<gridSize, blockSize>>>(data, ix, iy, iz);
}
// Launches are asynchronous: surface any launch-configuration error and
// wait for all kernels to finish before the host touches `data`.
cudaGetLastError();
cudaDeviceSynchronize();
My question is whether the kernels will execute in the order they were issued, or whether they will run concurrently.
I know that the default stream executes operations in the order they were issued, but I'm not sure whether this still holds when the launches happen inside a loop.
Thank you.