Streams and multiprocessor usage?

I thought I had streams figured out: the host schedules lots of async operations, with dependencies managed through cudaStream_t handles. In theory this should maximize GPU utilization, at least in terms of how many of the device's independent multiprocessors are kept busy.

But it wasn't working the way I expected, so I wrote this test program. It does an async memcpy onto the device, launches a kernel, and then does an async memcpy back to the host. I keep pools of streams and buffers and just fire off as many host_square calls as possible; I don't even synchronize a stream unless I'm about to run another host_square on it.

The number of preallocated streams and buffers is configurable via NUM_STREAMS. Any value from 1 to 48 seems to work fine; above 48, everything starts behaving synchronously for some reason. I'd like to know why that is, but it's not my biggest concern right now.

The problem is this: the time recorded for doTest with 48 streams is the same as the time recorded for doTest with only 1 stream. I don’t understand how this can be. My device supposedly has 16 disjoint multiprocessors; shouldn’t I be seeing a 16x speedup going from 1 stream to 16 streams? If not, how do I take advantage of all multiprocessors on the device?

I must be misunderstanding streams, or else I'm doing something wrong. I apologize for the length of this code sample, but I want anyone who cares to be able to compile it straight out of the box; just add a main() that calls runTest (a minimal driver is sketched after the listing). I'm using 64-bit Ubuntu 8.04 and CUDA 2.0.

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <cuda_runtime.h>

#define SAFE_CALL(call) {                                                \
        cudaError_t err = (call);                                        \
        if (cudaSuccess != err) {                                        \
            fprintf(stderr, "[%s:%i] %s\n",                              \
                    __FILE__, __LINE__, cudaGetErrorString(err));        \
            exit(EXIT_FAILURE);                                          \
        }                                                                \
    }

/* NOTE: TIME reports the stream index via a variable 'i' that must be
   in scope at the call site. */
#define TIME(X) { \
    struct timeval before, after; \
    gettimeofday(&before, NULL); \
    X; \
    gettimeofday(&after, NULL); \
    int d = (after.tv_sec - before.tv_sec) * 1000000 + after.tv_usec - before.tv_usec; \
    if (d > 10) fprintf(stderr, "%6d us: [%d] " #X "\n", d, i % NUM_STREAMS); \
   }

#define NUM_THREADS 512
#define MAX_VEC_LEN 1024
#define NUM_STREAMS 48

__device__ float dev_vector[MAX_VEC_LEN * NUM_STREAMS];
int g_nextVectorIdx = 0;
cudaStream_t g_streams[NUM_STREAMS];
int g_nextStreamIdx = 0;

/* Repeatedly square and square-root one vector, just to give the
   kernel something to chew on. */
__global__ void glob_square(int vecnum, int len)
{
    float * v = &dev_vector[vecnum * MAX_VEC_LEN];
    int j = 0;
    while (++j < 10000)
    {
        for (int i = threadIdx.x; i < len; i += NUM_THREADS)
        {
            float f = v[i];
            v[i] = f * f;
        }
        for (int i = threadIdx.x; i < len; i += NUM_THREADS)
        {
            float f = v[i];
            v[i] = sqrtf(f);
        }
    }
}

float * getDevVectorPtr(int i)
{
    float *ptr;
    SAFE_CALL(cudaGetSymbolAddress((void **) &ptr, dev_vector));
    return ptr + MAX_VEC_LEN * i;
}

/* Async upload, kernel launch, async download -- all in one stream. */
void host_square(float * v, int len)
{
    cudaStream_t stream = g_streams[g_nextStreamIdx++ % NUM_STREAMS];
    int i = g_nextVectorIdx++ % NUM_STREAMS;

    TIME(SAFE_CALL(cudaMemcpyAsync(getDevVectorPtr(i), v, len * sizeof(float), cudaMemcpyHostToDevice, stream));)

    dim3 grid(1,1,1);               /* a single block per launch */
    dim3 threads(NUM_THREADS,1,1);
    TIME((glob_square<<<grid, threads, 0, stream>>>(i, len)));
    cudaError_t r = cudaGetLastError();
    if (r != cudaSuccess) {
        fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(r));
    }

    TIME(SAFE_CALL(cudaMemcpyAsync(v, getDevVectorPtr(i), len * sizeof(float), cudaMemcpyDeviceToHost, stream));)
}

float * g_hostVector = NULL;

void initTest()
{
    int i = 0;  /* needed by TIME */

    /* Pinned host memory, so the async copies can actually be async. */
    TIME(SAFE_CALL(cudaMallocHost((void **) &g_hostVector, NUM_STREAMS * MAX_VEC_LEN * sizeof(float)));)

    for (i = 0; i < NUM_STREAMS; ++i)
    {
        SAFE_CALL(cudaStreamCreate(&g_streams[i]));
    }
    for (i = 0; i < NUM_STREAMS * MAX_VEC_LEN; ++i)
    {
        g_hostVector[i] = (float) i;
    }
}

void doTest()
{
    for (int i = 0; i < 128; ++i)
    {
        float * v = &g_hostVector[(i % NUM_STREAMS) * MAX_VEC_LEN];

        /* Only wait on a stream if we're about to reuse it. */
        if (cudaStreamQuery(g_streams[i % NUM_STREAMS]) != cudaSuccess)
        {
            fprintf(stderr, "stream %d busy\n", i % NUM_STREAMS);
            cudaStreamSynchronize(g_streams[i % NUM_STREAMS]);
        }
        host_square(v, MAX_VEC_LEN);
    }
    SAFE_CALL(cudaThreadSynchronize());
}

extern "C" void runTest()
{
    int i = 0;  /* needed by TIME */
    TIME(initTest();)
    TIME(doTest();)
}
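
For reference, the only missing piece is a trivial driver like this (the file layout is up to you):

extern "C" void runTest();

int main()
{
    runTest();
    return 0;
}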

Also, in the CUDA Profiler, only 1 of every 8 launches has a cta_launched of 1; the other 7 show 0. Those launches still report valid numbers for GPU time and occupancy, but nothing else. I'm not sure if that's related, a bug in the Profiler, or some other problem entirely.

Hi,
I have seen the same issue.
Stream usage does not give me any performance improvement at all.
I am also trying to do async copies, and my device supports them.

As I understand it, two kernels cannot run on the GPU at the same time, even if they belong to different streams. Instead, you keep the multiprocessors busy by giving each kernel enough blocks of threads.
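
That is exactly what the test program above is missing: the kernel is launched with grid(1,1,1), so only one block, and therefore only one multiprocessor, is ever active. A rough sketch of a launch that spreads the work across blocks instead (the kernel name and block count here are illustrative, not from the original code):

#define NUM_BLOCKS 16   /* e.g. one block per multiprocessor */

__global__ void glob_square_blocks(float *v, int len)
{
    /* Stride by the total thread count so the blocks divide the
       vector among themselves. */
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < len;
         i += gridDim.x * blockDim.x)
    {
        float f = v[i];
        v[i] = f * f;
    }
}

/* Launched as:
   glob_square_blocks<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(ptr, len); */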

What you can achieve with streams is parallelism between transfers to/from the GPU and kernel execution (not within the same stream, because the kernel associated with a stream needs its data).

Correct. Currently streams allow you to overlap a kernel execution with a CPU-GPU async memcpy, but not to run multiple kernels concurrently. See, for example, the simpleStreams SDK sample for how this can improve overall app performance.
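
A rough sketch of the issue order simpleStreams uses, written here in terms of the poster's names and assuming the same setup as the code above: queue all the uploads first, then all the kernels, then all the downloads, so the copy for stream k+1 can run while the kernel in stream k executes.

for (int k = 0; k < NUM_STREAMS; ++k)
    cudaMemcpyAsync(getDevVectorPtr(k), &g_hostVector[k * MAX_VEC_LEN],
                    MAX_VEC_LEN * sizeof(float), cudaMemcpyHostToDevice,
                    g_streams[k]);

for (int k = 0; k < NUM_STREAMS; ++k)
    glob_square<<<1, NUM_THREADS, 0, g_streams[k]>>>(k, MAX_VEC_LEN);

for (int k = 0; k < NUM_STREAMS; ++k)
    cudaMemcpyAsync(&g_hostVector[k * MAX_VEC_LEN], getDevVectorPtr(k),
                    MAX_VEC_LEN * sizeof(float), cudaMemcpyDeviceToHost,
                    g_streams[k]);

cudaThreadSynchronize();  /* wait for all streams (CUDA 2.x API) */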

Paulius