Hi,
I’m trying to use CUDA stream to build a processing pipeline that overlaps my kernel launch and memory transfer from and to GPU. Although the kernel has been distributed to various stream, all the processing was actually serialised and the memcpy is not overlapping the kernel execution at all. I wonder if someone could have a look and advice me on this? I’m using CUDA 8, gtx 980, and have added the --default-stream per thread command before compiling using visual studio.
Below is the profiling results and code to overlap the kernel execution and memory transfer.
Thanks!
Andy
https://drive.google.com/open?id=0B4VtavMHEP2LTENqQ3VwOTRXSzg
for (h = 0; h < numFrame; h++) {
for (i = 0; i < numAngle; i++) {
//cudaStreamSynchronize(stream[sID]);
printf("Recon image %d", i + h*numAngle);
offSetIn = (h*numAngle + i)*numSample*numChannel;
offSetOut = (h*numAngle + i)*imageHeight*imageWidth;
h_t0_wl = h_t0[i] * h_fs + 0.5f;
myparms.srcPtr = make_cudaPitchedPtr(d_RFIn + offSetIn, numSample * sizeof(float2), numSample, 1);
myparms.dstPos = make_cudaPos(0, 0, numChannel*(i%nLayered));
cudaMemcpy3DAsync(&myparms, stream[i]);
kernel_DAS_DynApo8 << <dimGrid, dimBlock, 0, stream[i] >> > (d_LRI + offSetOut, d_pixelMapX, d_pixelMapZ, h_angle[i], imageHeight, imageWidth, numChannel, h_t0_wl, i%nLayered);
cudaMemcpy2DAsync(h_LRI_r + offSetOut, sizeof(float), d_LRI + offSetOut, sizeof(float2), sizeof(float), imageHeight*imageWidth, cudaMemcpyDeviceToHost, stream[i]);
cudaMemcpy2DAsync(h_LRI_i + offSetOut, sizeof(float), &(d_LRI[offSetOut].y), sizeof(float2), sizeof(float), imageHeight*imageWidth, cudaMemcpyDeviceToHost, stream[i]);
}
}
cudaDeviceSynchronize();