I thought I had streams all figured out: you use the host to schedule lots of async operations, with dependencies managed through a cudaStream_t handle. In theory this should maximize GPU utilization, at least in terms of how many of the device's completely disjoint multiprocessors are kept busy.
But it wasn't quite working the way I expected, so I wrote this test program. Each call just does an async memcpy onto the device, launches a kernel, and then does an async memcpy back to the host. I keep pools of streams and buffers and fire off as many host_square calls as possible; I don't even bother to synchronize a stream unless I'm about to run another host_square on it.
The number of preallocated streams and buffers is set by NUM_STREAMS. Any value from 1 to 48 seems to work fine; above 48 it starts acting synchronously for some reason. I'd like to know why that is, but it's not my biggest concern right now.
The problem is this: the time recorded for doTest with 48 streams is the same as the time recorded for doTest with only 1 stream. I don’t understand how this can be. My device supposedly has 16 disjoint multiprocessors; shouldn’t I be seeing a 16x speedup going from 1 stream to 16 streams? If not, how do I take advantage of all multiprocessors on the device?
I must be misunderstanding streams, or else I'm doing something wrong. I apologize for the length of this code sample, but I want to make sure anyone who cares can compile my code straight out of the box (just add a main() that calls runTest; mine is pasted right after the listing). I'm using 64-bit Ubuntu 8.04 and CUDA 2.0.
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <cuda_runtime.h>
#define SAFE_CALL(call) do { \
    cudaError_t err = (call); \
    if (cudaSuccess != err) { \
        fprintf(stderr, "[%s:%i] %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
#define TIME(X) { \
    struct timeval before, after; \
    gettimeofday(&before, NULL); \
    X; \
    gettimeofday(&after, NULL); \
    int d = (after.tv_sec - before.tv_sec) * 1000000 + after.tv_usec - before.tv_usec; \
    /* Only report calls slower than 10 us; relies on an 'int i' in scope at the call site. */ \
    if (d > 10) fprintf(stderr, "%6d us: [%d] " #X "\n", d, i % NUM_STREAMS); \
}
#define NUM_THREADS 512
#define MAX_VEC_LEN 1024
#define NUM_STREAMS 48
/* One device-side vector per stream, carved out of a single static array. */
__device__ float dev_vector[MAX_VEC_LEN * NUM_STREAMS];
int g_nextVectorIdx = 0;
cudaStream_t g_streams[NUM_STREAMS];
int g_nextStreamIdx = 0;
__global__ void glob_square(int vecnum, int len)
{
    float * v = &dev_vector[vecnum * MAX_VEC_LEN];
    /* Repeat the square/sqrt pass 10000 times just to make the kernel take a while. */
    int j = 0;
    while (++j < 10000)
    {
        for (int i = threadIdx.x; i < len; i += NUM_THREADS)
        {
            float f = v[i];
            v[i] = f * f;
        }
        for (int i = threadIdx.x; i < len; i += NUM_THREADS)
        {
            float f = v[i];
            v[i] = sqrtf(f);    /* sqrtf, not sqrt: these are floats */
        }
    }
}
float * getDevVectorPtr(int i)
{
    float *ptr;
    SAFE_CALL(cudaGetSymbolAddress((void **) &ptr, dev_vector));
    return ptr + MAX_VEC_LEN * i;
}
void host_square(float * v, int len)
{
    cudaStream_t stream = g_streams[g_nextStreamIdx++ % NUM_STREAMS];
    int i = g_nextVectorIdx++ % NUM_STREAMS;
    TIME(SAFE_CALL(cudaMemcpyAsync(getDevVectorPtr(i), v, len * sizeof(float), cudaMemcpyHostToDevice, stream));)
    dim3 grid(1,1,1);
    dim3 threads(NUM_THREADS,1,1);
    TIME((glob_square<<<grid, threads, 0, stream>>>(i, len)));
    cudaError_t r = cudaGetLastError();
    if (r != cudaSuccess) {
        fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(r));
    }
    TIME(SAFE_CALL(cudaMemcpyAsync(v, getDevVectorPtr(i), len * sizeof(float), cudaMemcpyDeviceToHost, stream));)
}
float * g_hostVector = NULL;
void initTest()
{
    int i = 0;    /* needed by the TIME macro's label */
    /* Pinned host memory, so cudaMemcpyAsync can actually be asynchronous. */
    TIME(SAFE_CALL(cudaMallocHost((void **) &g_hostVector, NUM_STREAMS * MAX_VEC_LEN * sizeof(float)));)
    for (i = 0; i < NUM_STREAMS; ++i)
    {
        SAFE_CALL(cudaStreamCreate(&g_streams[i]));
    }
    for (i = 0; i < NUM_STREAMS * MAX_VEC_LEN; ++i)
    {
        g_hostVector[i] = (float) i;
    }
}
void doTest()
{
    for (int i = 0; i < 128; ++i)
    {
        float * v = &g_hostVector[(i % NUM_STREAMS) * MAX_VEC_LEN];
        /* Only synchronize if this stream still has work pending. */
        if (cudaStreamQuery(g_streams[i % NUM_STREAMS]) != cudaSuccess)
        {
            fprintf(stderr, "stream %d busy\n", i % NUM_STREAMS);
            cudaStreamSynchronize(g_streams[i % NUM_STREAMS]);
        }
        host_square(v, MAX_VEC_LEN);
    }
    SAFE_CALL(cudaThreadSynchronize());
}
extern "C" void runTest()
{
int i=0;
TIME(initTest();)
TIME(doTest();)
}
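In case it saves anyone a minute, the main() I add is literally just this:

/* Trivial driver; the whole test lives in runTest() above. */
int main()
{
    runTest();
    return 0;
}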
Also, in the CUDA Profiler, only 1 of every 8 launches has a cta_launched of 1; the other 7 show 0. Those 7 still report valid numbers for GPU time and occupancy, but nothing else. I'm not sure whether that's related, a bug in the Profiler, or some other problem entirely.
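For what it's worth, here is roughly how I'd bracket one stream's copy/kernel/copy with events to cross-check the profiler and the TIME numbers. This is a sketch only; timeOneStream isn't part of the program above, and it assumes the same globals and SAFE_CALL macro. If the sum of these per-stream times across all streams came out much larger than the wall-clock time of doTest, the streams really would be overlapping on the GPU:

/* Sketch (hypothetical helper, not in the listing above): time one
 * stream's H2D copy + kernel + D2H copy using events recorded in
 * that stream. */
float timeOneStream(float * v, int len, cudaStream_t stream, int vecIdx)
{
    cudaEvent_t start, stop;
    SAFE_CALL(cudaEventCreate(&start));
    SAFE_CALL(cudaEventCreate(&stop));
    SAFE_CALL(cudaEventRecord(start, stream));      /* enqueued in this stream */
    SAFE_CALL(cudaMemcpyAsync(getDevVectorPtr(vecIdx), v, len * sizeof(float),
                              cudaMemcpyHostToDevice, stream));
    glob_square<<<dim3(1,1,1), dim3(NUM_THREADS,1,1), 0, stream>>>(vecIdx, len);
    SAFE_CALL(cudaMemcpyAsync(v, getDevVectorPtr(vecIdx), len * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
    SAFE_CALL(cudaEventRecord(stop, stream));
    SAFE_CALL(cudaEventSynchronize(stop));          /* waits on this stream only */
    float ms = 0.0f;
    SAFE_CALL(cudaEventElapsedTime(&ms, start, stop));
    SAFE_CALL(cudaEventDestroy(start));
    SAFE_CALL(cudaEventDestroy(stop));
    return ms;
}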