I have some painful problems when I’m using cudaStreamAddCallback in my code. I managed some stream and use cudaStreamAddCallback to register callback which notify me when tasks assigned to the stream is completed.
My device is GTX 970 and my development environment is Visual Studio 2013 with CUDA 7.
The following code and image is my code without cudaStreamAddCallback and its timeline profiled by nsight. Everything looks good:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_call_check.h"
#define NUM_STREAM 32
#define ARRAY_SIZE 1000000
__global__ void kernel_add_one(float* point, unsigned int num)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x)
{
point[offset] += 1;
}
}
void CUDART_CB callback(cudaStream_t stream, cudaError_t status, void *arg)
{
}
int main()
{
float *dev_points[NUM_STREAM];
float *host_points[NUM_STREAM];
cudaStream_t streams[NUM_STREAM];
for (size_t i = 0; i < NUM_STREAM; ++i)
{
CUDA_SAFE_CALL(cudaMalloc(dev_points + i, ARRAY_SIZE * sizeof(float)));
CUDA_SAFE_CALL(cudaMallocHost(host_points + i, ARRAY_SIZE * sizeof(float)));
CUDA_SAFE_CALL(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking));
for (size_t j = 0; j < ARRAY_SIZE; ++j)
{
host_points[i][j] = static_cast<float>(i + j);
}
}
for (size_t i = 0; i < NUM_STREAM; ++i)
{
CUDA_SAFE_CALL(cudaMemcpyAsync(dev_points[i], host_points[i],
ARRAY_SIZE * sizeof(float),
cudaMemcpyHostToDevice, streams[i]));
kernel_add_one<<<1, 16, 0, streams[i]>>>(dev_points[i], ARRAY_SIZE);
CUDA_SAFE_CALL(cudaMemcpyAsync(host_points[i], dev_points[i],
ARRAY_SIZE * sizeof(float),
cudaMemcpyDeviceToHost, streams[i]));
}
CUDA_SAFE_CALL(cudaDeviceSynchronize());
for (size_t i = 0; i < NUM_STREAM; ++i)
{
CUDA_SAFE_CALL(cudaFree(dev_points[i]));
CUDA_SAFE_CALL(cudaFreeHost(host_points[i]));
CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
}
}
But if I using following codes, the kernel execution is serialized. The difference of the two code is about line 48.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_call_check.h"
#define NUM_STREAM 32
#define ARRAY_SIZE 1000000
__global__ void kernel_add_one(float* point, unsigned int num)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x)
{
point[offset] += 1;
}
}
void CUDART_CB callback(cudaStream_t stream, cudaError_t status, void *arg)
{
}
int main()
{
float *dev_points[NUM_STREAM];
float *host_points[NUM_STREAM];
cudaStream_t streams[NUM_STREAM];
for (size_t i = 0; i < NUM_STREAM; ++i)
{
CUDA_SAFE_CALL(cudaMalloc(dev_points + i, ARRAY_SIZE * sizeof(float)));
CUDA_SAFE_CALL(cudaMallocHost(host_points + i, ARRAY_SIZE * sizeof(float)));
CUDA_SAFE_CALL(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking));
for (size_t j = 0; j < ARRAY_SIZE; ++j)
{
host_points[i][j] = static_cast<float>(i + j);
}
}
for (size_t i = 0; i < NUM_STREAM; ++i)
{
CUDA_SAFE_CALL(cudaMemcpyAsync(dev_points[i], host_points[i],
ARRAY_SIZE * sizeof(float),
cudaMemcpyHostToDevice, streams[i]));
kernel_add_one<<<1, 16, 0, streams[i]>>>(dev_points[i], ARRAY_SIZE);
CUDA_SAFE_CALL(cudaMemcpyAsync(host_points[i], dev_points[i],
ARRAY_SIZE * sizeof(float),
cudaMemcpyDeviceToHost, streams[i]));
CUDA_SAFE_CALL(cudaStreamAddCallback(streams[i], callback, NULL, 0));
}
CUDA_SAFE_CALL(cudaDeviceSynchronize());
for (size_t i = 0; i < NUM_STREAM; ++i)
{
CUDA_SAFE_CALL(cudaFree(dev_points[i]));
CUDA_SAFE_CALL(cudaFreeHost(host_points[i]));
CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
}
}
Timeline is following:
Does some one know the reason of this problem? It has bothered me for a long time.