My GPU is a Tesla K40.
It has one kernel engine and two copy engines.
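(The copy-engine count can be confirmed by querying the device properties, for example:)

cudaDeviceProp prop;
checkCudaErrors(cudaGetDeviceProperties(&prop, 0));
// asyncEngineCount reports the number of copy engines; it is 2 on the K40
cout << "asyncEngineCount is:\t" << prop.asyncEngineCount << endl;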
I ran into a strange phenomenon when testing the performance of cudaMemcpyAsync, as follows:
threads = dim3(512, 1);
blocks = dim3(n / threads.x, 1);
// time for device-to-device memcpy in one stream and kernel in the other stream
checkCudaErrors(cudaEventRecord(start_event,0));
for(int i = 0 ; i < 5 ; ++i) {
    checkCudaErrors(cudaMemcpyAsync(deviceMemoryDest_2, deviceMemorySrc_1, nbytes, cudaMemcpyDeviceToDevice, streams[0]));
}
for(int i = 0 ; i < 5 ; ++i) {
    scaleVector<<<blocks, threads, 0, streams[1]>>>(deviceComputeMemory, deviceFactorMemory, num_iterations);
}
checkCudaErrors(cudaEventRecord(stop_event,0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&time, start_event, stop_event));
cout << "time is:\t" << time << endl;
It seems that the kernel and the memcpy in the code above cannot overlap. But when I change the code as follows, they do overlap:
threads = dim3(512, 1);
blocks = dim3(n / threads.x, 1);
// time for device-to-device memcpy in one stream and kernel in the other stream
checkCudaErrors(cudaEventRecord(start_event,0));
for(int i = 0 ; i < nreps ; ++i) {
    checkCudaErrors(cudaMemcpyAsync(deviceMemoryDest_2, deviceMemorySrc_1, nbytes, cudaMemcpyDeviceToDevice, streams[0]));
    scaleVector<<<blocks, threads, 0, streams[1]>>>(deviceComputeMemory, deviceFactorMemory, num_iterations);
}
checkCudaErrors(cudaEventRecord(stop_event,0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&time, start_event, stop_event));
cout << "time is:\t" << time << endl;
I find this very strange and would like to know what happens in the kernel engine and the copy engines when I use cudaMemcpyDeviceToDevice.
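For completeness, the rest of my test harness looks roughly like this (a simplified sketch; the buffer size, the value of num_iterations, and the body of scaleVector are assumptions rather than my exact code):

// kernel: each thread repeatedly scales one element to keep the kernel busy (assumed body)
__global__ void scaleVector(float *data, const float *factor, int iters)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float v = data[idx];
    for (int i = 0; i < iters; ++i)
        v *= factor[idx];
    data[idx] = v;
}

// host-side setup (names match the snippets above; sizes are assumed)
const int n = 1 << 22;
const size_t nbytes = n * sizeof(float);
const int num_iterations = 256;
float *deviceMemorySrc_1, *deviceMemoryDest_2, *deviceComputeMemory, *deviceFactorMemory;
checkCudaErrors(cudaMalloc(&deviceMemorySrc_1, nbytes));
checkCudaErrors(cudaMalloc(&deviceMemoryDest_2, nbytes));
checkCudaErrors(cudaMalloc(&deviceComputeMemory, nbytes));
checkCudaErrors(cudaMalloc(&deviceFactorMemory, nbytes));

cudaStream_t streams[2];
for (int i = 0; i < 2; ++i)
    checkCudaErrors(cudaStreamCreate(&streams[i]));

cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
float time = 0.0f;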