Hello all,
I tried to use two streams to execute kernel and memcpy between host and device concurrently.
In my application, two CPU threads generate tasks and push each generated task into one of the two streams, alternating between them.
Following is my code.
// Enqueue one GPU task unit on a round-robin-selected stream:
//   1. H2D upload of the task data,
//   2. D2H copy of a "data sent" flag (host may then reuse the input buffer),
//   3. the kernel,
//   4. D2H copies of the results and a "task done" flag.
// Everything is stream-ordered and asynchronous: this function returns
// immediately, and the caller must not read GTU->result / GTU->stateTask
// until the stream has progressed past the corresponding copy.  All host
// buffers passed to cudaMemcpyAsync must be pinned (cudaMallocHost), as the
// surrounding application guarantees.
void CUDA_doAsync( GPU_Task_Unit* GTU ){
    // Total number of CUDAtask elements in this unit.
    unsigned int taskSize = CUDA_SEGMENT_SIZE * GTU->numSeg;
    // Round-robin stream selection so successive task units land on
    // alternating streams and their copies/kernels can overlap.
    unsigned int streamID = (GTU->id) % CUDA_NUM_STREAM;

    // NOTE(review): this assumes taskSize is a multiple of BLOCK_SIZE;
    // otherwise the integer division silently drops the tail elements —
    // confirm against the task generator.
    dim3 dimGrid( taskSize / BLOCK_SIZE, 1 );
    dim3 dimBlock( BLOCK_SIZE, 1 );

    // Upload this unit's task data.
    cutilSafeCall( cudaMemcpyAsync( dTasks[GTU->id], GTU->pStartSeg,
                                    taskSize * sizeof(CUDAtask),
                                    cudaMemcpyHostToDevice, stream[streamID] ) );

    // Signal the host that the input buffer has been consumed.  This copy is
    // stream-ordered after the upload above, so when the flag arrives the
    // host may safely refill GTU->pStartSeg.
    cutilSafeCall( cudaMemcpyAsync( GTU->stateSend, dFlag_true, sizeof(char),
                                    cudaMemcpyDeviceToHost, stream[streamID] ) );

    doKernel<<<dimGrid, dimBlock, 0, stream[streamID]>>>( dTasks[GTU->id],
                                                          dResults[GTU->id] );
    // A kernel launch returns no status; pick up launch-configuration errors
    // (e.g. a zero-sized grid when taskSize < BLOCK_SIZE) explicitly.
    cutilSafeCall( cudaGetLastError() );

    // Download the results, then the completion flag.  Stream ordering
    // guarantees the flag is written only after the results have landed.
    cutilSafeCall( cudaMemcpyAsync( GTU->result, dResults[GTU->id],
                                    taskSize * sizeof(CUDAresult),
                                    cudaMemcpyDeviceToHost, stream[streamID] ) );
    // FIX: the original used sizeof(bool) here but sizeof(char) for the
    // stateSend copy above, although both read the same one-byte dFlag_true
    // device allocation.  Use sizeof(char) consistently so the copy size
    // matches the flag's allocation regardless of the platform's sizeof(bool).
    cutilSafeCall( cudaMemcpyAsync( GTU->stateTask, dFlag_true, sizeof(char),
                                    cudaMemcpyDeviceToHost, stream[streamID] ) );
}
I expected that this would increase the performance of my application.
But it made my application slightly slower instead.
I cannot understand the reason.
I use a GTX 285, and all host memory used in cudaMemcpyAsync is allocated using cudaMallocHost().
Also, my application does not need synchronization at the end of the code.
Please, give me your advice.
Thanks.