Hi,

I read the post https://developer.nvidia.com/blog/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ and tried the following example from it:
// Elements per per-stream buffer: 2^20 floats (4 MB per cudaMalloc below).
const int N = 1 << 20;
// Dummy worker kernel: writes sqrt(pow(3.14159, i)) into x[0..n).
//
// Fix: the paste had `global`, which is not a valid C++ token here; CUDA
// kernels must be declared with the `__global__` qualifier or nvcc rejects
// the translation unit.
//
// Notes:
//  - Uses a grid-stride loop (stride = blockDim.x * gridDim.x), so it is
//    correct for any <<<grid, block>>> configuration, including <<<1, 1>>>.
//  - With n == 0 the loop body never runs, so a null x is never dereferenced
//    (main relies on this for its dummy launch).
//  - sqrt()/pow() on double operands are deliberately slow here; the blog
//    example wants the kernel to run long enough that stream concurrency is
//    visible in the profiler. Do NOT "optimize" to sqrtf/powf.
__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}
// Launch one long-running worker kernel on each of 8 non-default streams,
// interleaved with a no-op launch on the legacy default stream (the blog
// post's demonstration of default-stream serialization vs. per-thread
// default stream with --default-stream per-thread).
//
// Returns 0 on success, a nonzero code identifying the failing phase
// otherwise. Error paths use return codes rather than printing so the
// snippet needs no headers beyond what nvcc provides implicitly.
int main()
{
    const int num_streams = 8;
    cudaStream_t streams[num_streams];
    float *data[num_streams];

    for (int i = 0; i < num_streams; i++) {
        // Every CUDA runtime call returns cudaError_t; unchecked errors are
        // sticky and make later calls fail mysteriously.
        if (cudaStreamCreate(&streams[i]) != cudaSuccess)
            return 1;
        // cudaMalloc synchronizes the device, which is why it belongs outside
        // timed/concurrent regions; for per-stream temporaries consider the
        // stream-ordered allocator (cudaMallocAsync, CUDA 11.2+).
        if (cudaMalloc(&data[i], N * sizeof(float)) != cudaSuccess)
            return 2;

        // launch one worker kernel per stream
        kernel<<<1, 64, 0, streams[i]>>>(data[i], N);

        // launch a dummy kernel on the default stream; n == 0 means the
        // kernel's loop never executes, so the null pointer is never read.
        kernel<<<1, 1>>>(0, 0);
    }

    // Kernel launches do not return errors directly; launch-configuration
    // failures surface via cudaGetLastError()...
    if (cudaGetLastError() != cudaSuccess)
        return 3;
    // ...and asynchronous execution failures surface at the next
    // synchronizing call. This also makes the cleanup below safe to run.
    if (cudaDeviceSynchronize() != cudaSuccess)
        return 4;

    // Explicit cleanup. cudaDeviceReset() would release these too, but
    // freeing/destroying explicitly keeps the program correct if more work
    // is ever added between here and the reset.
    for (int i = 0; i < num_streams; i++) {
        cudaFree(data[i]);
        cudaStreamDestroy(streams[i]);
    }

    cudaDeviceReset();
    return 0;
}
I have a question about cudaMalloc: the CUDA C Programming Guide states that device memory allocation is synchronous, yet in this example the cudaMalloc calls did not prevent the kernels from executing concurrently across streams. However, if I add a single cudaFree call inside the loop, the concurrency is lost and the program becomes much slower. Why does cudaFree break the overlap when cudaMalloc apparently does not?