Platform : Jetson TK1
I am unable to achieve kernel concurrency with the following code:
kernel :
/*
 * add_matrix: per-byte product of three interleaved 3-channel byte images.
 *
 * Each thread handles one pixel (i, j) and writes its three consecutive
 * bytes:
 *     outm[k] = m1[k] * m2[k] * m3[k]
 * The product is evaluated in int and truncated to char on the store.
 *
 * Launch layout: 2D grid of 2D blocks; x selects the column (i), y selects
 * the row (j). Threads outside the fixed 960 x 540 region return without
 * touching memory, so the grid may be rounded up past the image size.
 *
 * m1, m2, m3    : input buffers, read at the same byte index that is written
 * outm          : output buffer
 * grayWidthStep : pointer to the row stride in bytes; dereferenced in the
 *                 kernel, so it must be device-accessible
 * offset        : pointer to a byte offset added to every index; also
 *                 dereferenced in the kernel
 */
__global__ void add_matrix(char *m1, char *m2, char* m3, char * outm, int *grayWidthStep, int * offset)
{
    const int kWidth    = 960;  // processed region, in pixels
    const int kHeight   = 540;
    const int kChannels = 3;    // interleaved bytes per pixel

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= kWidth || j >= kHeight)
        return;

    // Byte index of this pixel's first channel.
    const int gray_tid = j * (*grayWidthStep) + (i * kChannels) + (*offset);

    // Every channel uses the same m1 * m2 * m3 formula at its own index.
    // The previous hand-unrolled version used m2 twice for channels 1 and 2
    // (and a stale "+ 1" index for channel 2), so m3 was ignored there.
    #pragma unroll
    for (int c = 0; c < kChannels; ++c) {
        const int idx = gray_tid + c;
        outm[idx] = m1[idx] * m2[idx] * m3[idx];
    }
}
Two streams :
// Create one CUDA stream for each of the stream_count slots in `stream`.
for (int s = 0; s < stream_count; s++) {
    cudaStreamCreate(&stream[s]);
}
Memory allocations :
// Device allocation of input_size bytes. Every matrix is allocated the same
// way; only this one is shown.
cudaMalloc(
    (char **)&imageMain_d,
    input_size);
Memory copies :
// Queue the host-to-device upload on stream[0]; each matrix is copied with
// the same call.
// NOTE(review): imageMain.ptr() presumably points at ordinary pageable host
// memory -- confirm. cudaMemcpyAsync can only overlap with other copies or
// kernels when the host buffer is page-locked (cudaMallocHost /
// cudaHostRegister); with pageable memory the copy is staged and the
// transfers serialize.
cudaMemcpyAsync(
    imageMain_d,             // destination buffer
    imageMain.ptr(),         // source buffer
    input_size,              // byte count
    cudaMemcpyHostToDevice,
    stream[0]);
Grid Size
// 32 x 16 = 512 threads per block (z defaults to 1).
dim3 block(32, 16);
// Ceil-divide half of the image width/height by the block dimensions so the
// grid covers every pixel of the half-size region.
dim3 grid(
    ((imageMain.cols / 2) + block.x - 1) / block.x,
    ((imageMain.rows / 2) + block.y - 1) / block.y);
Kernel Launches
// Launch the kernel once per stream, each launch on its own set of buffers.
// NOTE(review): kernels in different streams only run concurrently when the
// first one leaves execution resources free. With 512-thread blocks and a
// grid sized for the 960 x 540 region the kernel processes, a single launch
// presumably fills the Jetson TK1 GPU, so the profiler would show the two
// kernels back to back -- confirm by trying a much smaller grid.
add_matrix<<<grid, block, 0, stream[0]>>>(
    imageMain_d, imageLogo_d, image3_d, imageout_d, image_step_d, offset_d);
add_matrix<<<grid, block, 0, stream[1]>>>(
    imageMain_d2, imageLogo_d2, image3_d2, imageout_d2, image_step_d2, offset_d2);
Followed by cudaDeviceSynchronize();
The Visual Profiler shows the two kernels executing one after the other on alternate streams.
The same happens for cudaMemcpyAsync: the copies run one after the other on alternate streams.