Thanks, Txbob.
I have another question: in one process I create two streams on one GPU, and in each stream I launch the same two kernels. Can the kernels from one stream run concurrently with the kernels from the other stream?
The sample code is:
for (int i = 0; i < 2; i++)
{
    cudaStatus = cudaEventRecord(start[i], stream[i]);
    cudaStatus = cudaMemcpyAsync(pDevA[i], pHostA[i], sizeof(float)*nWidth*nHeight, cudaMemcpyHostToDevice, stream[i]);  // H2D copy
    TestKernel<<<gridSize, blockSize, 0, stream[i]>>>(pDevC[i], pDevA[i], nWidth, nHeight, 100);                         // first kernel
    cudaStatus = cudaMemcpyAsync(pHostC[i], pDevC[i], sizeof(float)*nWidth*nHeight, cudaMemcpyDeviceToHost, stream[i]);  // D2H copy of first result
    TestKernel2<<<gridSize, blockSize, 0, stream[i]>>>(pDevC[i], pDevA[i], nWidth, nHeight, 100);                        // second kernel
    cudaStatus = cudaMemcpyAsync(pHostC[i], pDevC[i], sizeof(float)*nWidth*nHeight, cudaMemcpyDeviceToHost, stream[i]);  // D2H copy of second result
    cudaStatus = cudaEventRecord(stop[i], stream[i]);
}
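For context, before this loop I create the streams, events, and buffers roughly like this (a minimal sketch of my setup; the exact allocation calls here are from memory and error checks are omitted):

cudaStream_t stream[2];
cudaEvent_t  start[2], stop[2];
float *pHostA[2], *pHostC[2], *pDevA[2], *pDevC[2];
size_t nBytes = sizeof(float) * nWidth * nHeight;
for (int i = 0; i < 2; i++)
{
    cudaStreamCreate(&stream[i]);
    cudaEventCreate(&start[i]);
    cudaEventCreate(&stop[i]);
    cudaMallocHost((void**)&pHostA[i], nBytes);  // pinned host memory, so cudaMemcpyAsync can overlap
    cudaMallocHost((void**)&pHostC[i], nBytes);
    cudaMalloc((void**)&pDevA[i], nBytes);
    cudaMalloc((void**)&pDevC[i], nBytes);
}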
Also, how should I measure each stream's execution time from the start[i]/stop[i] events recorded above?
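This is what I have in mind for reading the times after the loop (a sketch assuming the usual cudaEventSynchronize/cudaEventElapsedTime pattern):

for (int i = 0; i < 2; i++)
{
    float ms = 0.0f;
    cudaStatus = cudaEventSynchronize(stop[i]);                 // wait until stream i has reached its stop event
    cudaStatus = cudaEventElapsedTime(&ms, start[i], stop[i]);  // elapsed time between the two events, in milliseconds
    printf("stream %d: %.3f ms\n", i, ms);
}

If the kernels from the two streams do overlap on the GPU, will the elapsed time measured this way for one stream also include the time spent on the other stream's work?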