simple vector add with streams

Hello, I can’t understand why a simple vector add with streams runs slower than without streams.

Time using streams: 27.577 ms
Time not using streams: 11.511 ms

#include <cstdio>
#include <cstdlib>

#define USE_STREAMS 1

#define gpuErrchk(ans) { gpuAssert( (ans), __FILE__, __LINE__ ); }

inline void
gpuAssert( cudaError_t code, const char * file, int line, bool abort = true )
{
	if ( cudaSuccess != code )
	{
		fprintf( stderr, "\nGPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
		if ( abort )
			exit( code );
	}


	return;

} /* gpuAssert */
 

__global__ void Add( int N ,int Offset ,float * devA , float * devB , float *devC )
{

	for ( int idx = blockIdx.x * blockDim.x + threadIdx.x + Offset; idx < N; idx += blockDim.x * gridDim.x )
		
		devC[ idx ] = devA[ idx ] + devB[ idx ];

}



int main()
{
	
	int N = 4000000;
	
	int Threads = 256;
	
	const int NbStreams = 8;
	const int StreamSize = N / NbStreams;
	
	float *A , *B , *C;
	gpuErrchk( cudaHostAlloc( (void**) &A , N * sizeof(*A) ,cudaHostAllocDefault ) );
	gpuErrchk( cudaHostAlloc( (void**) &B , N * sizeof(*B) ,cudaHostAllocDefault ) );
	gpuErrchk( cudaHostAlloc( (void**) &C , N * sizeof(*C) ,cudaHostAllocDefault ) );

	for ( int i = 0; i < N; i++ )
	{
		A[ i ] = i;
		B[ i ] = i + 1;
	}
	
	float *devA , *devB , *devC;
	gpuErrchk( cudaMalloc( (void**) &devA , N * sizeof(*devA)) );
	gpuErrchk( cudaMalloc( (void**) &devB , N * sizeof(*devB)) );
	gpuErrchk( cudaMalloc( (void**) &devC , N * sizeof(*devC)) );

	cudaEvent_t EventPre,
	            EventPost;
	float PostPreTime;
	
	gpuErrchk( cudaEventCreate( &EventPre ) );
	gpuErrchk( cudaEventCreate( &EventPost ) );
	
    cudaStream_t Stream[ NbStreams ];
	for ( int i = 0; i < NbStreams; i++ )
    	gpuErrchk( cudaStreamCreate(&Stream[ i ]) );

#if ! USE_STREAMS
	
	gpuErrchk( cudaEventRecord(EventPre ) );

	gpuErrchk( cudaMemcpy(devA, A, N * sizeof(*A), cudaMemcpyHostToDevice) );
	gpuErrchk( cudaMemcpy(devB, B, N * sizeof(*B), cudaMemcpyHostToDevice) );
	gpuErrchk( cudaMemcpy(devC, C, N * sizeof(*C), cudaMemcpyHostToDevice) );
	
	Add<<< N / Threads, Threads>>>( N ,0, devA , devB , devC );
	
	gpuErrchk( cudaMemcpy(A, devA, N * sizeof(*A), cudaMemcpyDeviceToHost) );
	gpuErrchk( cudaMemcpy(B, devB, N * sizeof(*B), cudaMemcpyDeviceToHost) );
	gpuErrchk( cudaMemcpy(C, devC, N * sizeof(*C), cudaMemcpyDeviceToHost) );

	gpuErrchk( cudaEventRecord( EventPost ) );
	gpuErrchk( cudaEventSynchronize( EventPost ) );
	gpuErrchk( cudaEventElapsedTime( &PostPreTime, EventPre, EventPost ) );
	printf( "\nTime not using streams: %f ms\n", PostPreTime );
	
#else
	
	gpuErrchk( cudaEventRecord( EventPre ) );
	for ( int i = 0; i < NbStreams; i++ )
	{
		int Offset = i * StreamSize;
		
		gpuErrchk( cudaMemcpyAsync(&devA[ Offset ], &A[ Offset ], StreamSize * sizeof(*A), cudaMemcpyHostToDevice, Stream[ i ]) );
		gpuErrchk( cudaMemcpyAsync(&devB[ Offset ], &B[ Offset ], StreamSize * sizeof(*B), cudaMemcpyHostToDevice, Stream[ i ]) );
		gpuErrchk( cudaMemcpyAsync(&devC[ Offset ], &C[ Offset ], StreamSize * sizeof(*C), cudaMemcpyHostToDevice, Stream[ i ]) );
		
		Add<<< StreamSize / Threads, Threads, 0, Stream[ i ]>>>( N ,Offset, devA , devB , devC );
	
		gpuErrchk( cudaMemcpyAsync(&A[ Offset ], &devA[ Offset ], StreamSize * sizeof(*devA), cudaMemcpyDeviceToHost, Stream[ i ]) );
		gpuErrchk( cudaMemcpyAsync(&B[ Offset ], &devB[ Offset ], StreamSize * sizeof(*devB), cudaMemcpyDeviceToHost, Stream[ i ]) );
		gpuErrchk( cudaMemcpyAsync(&C[ Offset ], &devC[ Offset ], StreamSize * sizeof(*devC), cudaMemcpyDeviceToHost, Stream[ i ]) );

	}
	
	gpuErrchk( cudaEventRecord( EventPost ) );
	gpuErrchk( cudaEventSynchronize( EventPost ) );
	gpuErrchk( cudaEventElapsedTime( &PostPreTime, EventPre,EventPost ) );
	printf( "\nTime using streams: %f ms\n", PostPreTime );

#endif /* ! USE_STREAMS */
	
	//for ( int i = 0; i < N; i++ ) 
		//printf( "\n C[%d] = %f",i,C[ i ] );
	
	for ( int i = 0; i < NbStreams; i++ )
		gpuErrchk( cudaStreamDestroy(Stream[ i ]) );
	
	gpuErrchk( cudaFree(devA) );
	gpuErrchk( cudaFree(devB) );
	gpuErrchk( cudaFree(devC) );

	gpuErrchk( cudaFreeHost(A) );
	gpuErrchk( cudaFreeHost(B) );
	gpuErrchk( cudaFreeHost(C) );

	gpuErrchk( cudaEventDestroy(EventPre) );
    gpuErrchk( cudaEventDestroy(EventPost) );
  
	printf("\n");
	
	return 0;
	
}

Thanks!

Your streamed kernel isn’t parceling up the work correctly:

Add<<< StreamSize / Threads, Threads, 0, Stream[ i ]>>>( N ,Offset,
                                                         ^^^^^^^^^

This is telling each kernel launch to start at Offset and go to the end of the data set (N). Since the earlier kernels do not yet have all of their data copied down, they are potentially adding bogus numbers in some cases. You probably don’t observe this because later kernels come along and “fix” it.

You can probably fix the above error just by changing N in the above kernel launch to Offset+StreamSize.
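
In other words, the per-chunk launch would look something like this (the kernel body itself, with the + Offset in the grid-stride index, stays exactly as it is):

int Offset = i * StreamSize;
// the kernel for this chunk now stops at the end of its own slice instead of at N
Add<<< StreamSize / Threads, Threads, 0, Stream[ i ]>>>( Offset + StreamSize, Offset, devA, devB, devC );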

Having said that, I don’t think this is a particularly good test case. If you want to analyze what is going on in your particular case, I suggest using one of the profilers. When I run a fixed version of the code with proper results validation, I get ~12 ms for streams and ~16 ms for no streams on Linux/CUDA7 with a Quadro5000 GPU. On the other hand, if I run it on a GT640 GPU, I get approximately equal times (~30 ms) for both.
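
Whether copies and kernels can overlap at all also depends on the GPU’s copy engines. As a quick check (a standalone sketch, assuming device 0), you can query the relevant device properties:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
        cudaDeviceProp prop;
        cudaGetDeviceProperties( &prop, 0 );   // query device 0

        // asyncEngineCount: 0 = no copy/compute overlap,
        // 1 = one copy direction can overlap kernel execution,
        // 2 = both copy directions can overlap kernel execution
        printf( "%s: asyncEngineCount = %d, concurrentKernels = %d\n",
                prop.name, prop.asyncEngineCount, prop.concurrentKernels );

        return 0;
}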

Thank you for your help txbob.

So, I changed only the N and replaced it with Offset + StreamSize.
Do I leave the second argument (Offset) as it is, and hence keep using:

int idx = blockIdx.x * blockDim.x + threadIdx.x + Offset

or do I completely remove the Offset from the thread index?

Is there a way to choose the number of streams?

Thanks!

I’m puzzled. You seem to be completely unfamiliar with the code. This chooses the number of streams:

const int NbStreams = 8;

Yes, OK. I mean, I chose the number of streams in this example.
Is there a general rule?

Also, do I have to completely remove the Offset from

int idx = blockIdx.x * blockDim.x + threadIdx.x + Offset

as I wrote above?

Thanks!

What I said:

“You can probably fix the above error just by changing N in the above kernel launch to Offset+StreamSize”

Here is the code I used. When compiling, just add the command-line switch -DNOSTREAMS to use the non-streams version.

#include <cstdio>
#include <cstdlib>

#define gpuErrchk(ans) { gpuAssert( (ans), __FILE__, __LINE__ ); }

inline void
gpuAssert( cudaError_t code, const char * file, int line, bool abort = true )
{
        if ( cudaSuccess != code )
        {
                fprintf( stderr, "\nGPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
                if ( abort )
                        exit( code );
        }

return;

} /* gpuAssert */

__global__ void Add( int N ,int Offset ,float * devA , float * devB , float *devC )
{

        for ( int idx = blockIdx.x * blockDim.x + threadIdx.x + Offset; idx < N; idx += blockDim.x * gridDim.x )

                devC[ idx ] = devA[ idx ] + devB[ idx ];

}

int main()
{

        int N = 4000000;

        int Threads = 256;

        const int NbStreams = 8;

        float *A , *B , *C;
        gpuErrchk( cudaHostAlloc( (void**) &A , N * sizeof(*A) ,cudaHostAllocDefault ) );
        gpuErrchk( cudaHostAlloc( (void**) &B , N * sizeof(*B) ,cudaHostAllocDefault ) );
        gpuErrchk( cudaHostAlloc( (void**) &C , N * sizeof(*C) ,cudaHostAllocDefault ) );

        for ( int i = 0; i < N; i++ )
        {
                A[ i ] = i;
                B[ i ] = i + 1;
        }

        float *devA , *devB , *devC;
        gpuErrchk( cudaMalloc( (void**) &devA , N * sizeof(*devA)) );
        gpuErrchk( cudaMalloc( (void**) &devB , N * sizeof(*devB)) );
        gpuErrchk( cudaMalloc( (void**) &devC , N * sizeof(*devC)) );

        cudaEvent_t EventPre,
                    EventPost;
        float PostPreTime;

        gpuErrchk( cudaEventCreate( &EventPre ) );
        gpuErrchk( cudaEventCreate( &EventPost ) );

        cudaStream_t Stream[ NbStreams ];
        for ( int i = 0; i < NbStreams; i++ )
                gpuErrchk( cudaStreamCreate(&Stream[ i ]) );

#ifdef NOSTREAMS

        gpuErrchk( cudaEventRecord(EventPre ) );

        gpuErrchk( cudaMemcpy(devA, A, N * sizeof(*A), cudaMemcpyHostToDevice) );
        gpuErrchk( cudaMemcpy(devB, B, N * sizeof(*B), cudaMemcpyHostToDevice) );
//        gpuErrchk( cudaMemcpy(devC, C, N * sizeof(*C), cudaMemcpyHostToDevice) );

        Add<<< N / Threads, Threads>>>( N ,0, devA , devB , devC );

//        gpuErrchk( cudaMemcpy(A, devA, N * sizeof(*A), cudaMemcpyDeviceToHost) );
//        gpuErrchk( cudaMemcpy(B, devB, N * sizeof(*B), cudaMemcpyDeviceToHost) );
        gpuErrchk( cudaMemcpy(C, devC, N * sizeof(*C), cudaMemcpyDeviceToHost) );

        gpuErrchk( cudaEventRecord( EventPost ) );
        gpuErrchk( cudaEventSynchronize( EventPost ) );
        gpuErrchk( cudaEventElapsedTime( &PostPreTime, EventPre, EventPost ) );
        printf( "\nTime not using streams: %f ms\n", PostPreTime );

#else

        const int StreamSize = N / NbStreams;
        gpuErrchk( cudaEventRecord( EventPre ) );
        for ( int i = 0; i < NbStreams; i++ )
        {
                int Offset = i * StreamSize;

                gpuErrchk( cudaMemcpyAsync(&devA[ Offset ], &A[ Offset ], StreamSize * sizeof(*A), cudaMemcpyHostToDevice, Stream[ i ]) );
                gpuErrchk( cudaMemcpyAsync(&devB[ Offset ], &B[ Offset ], StreamSize * sizeof(*B), cudaMemcpyHostToDevice, Stream[ i ]) );
//                gpuErrchk( cudaMemcpyAsync(&devC[ Offset ], &C[ Offset ], StreamSize * sizeof(*C), cudaMemcpyHostToDevice, Stream[ i ]) );

                // first argument is now Offset+StreamSize, so each chunk only covers its own slice
                Add<<< StreamSize / Threads, Threads, 0, Stream[ i ]>>>( Offset+StreamSize ,Offset, devA , devB , devC );

//                gpuErrchk( cudaMemcpyAsync(&A[ Offset ], &devA[ Offset ], StreamSize * sizeof(*devA), cudaMemcpyDeviceToHost, Stream[ i ]) );
//                gpuErrchk( cudaMemcpyAsync(&B[ Offset ], &devB[ Offset ], StreamSize * sizeof(*devB), cudaMemcpyDeviceToHost, Stream[ i ]) );
                gpuErrchk( cudaMemcpyAsync(&C[ Offset ], &devC[ Offset ], StreamSize * sizeof(*devC), cudaMemcpyDeviceToHost, Stream[ i ]) );

        }

        gpuErrchk( cudaEventRecord( EventPost ) );
        gpuErrchk( cudaEventSynchronize( EventPost ) );
        gpuErrchk( cudaEventElapsedTime( &PostPreTime, EventPre,EventPost ) );
        printf( "\nTime using streams: %f ms\n", PostPreTime );

#endif /* NOSTREAMS */

        for ( int i = 0; i < N; i++ )
                if (C[i] != (A[i]+B[i])) {printf("mismatch at %d, was: %f, should be: %f\n", i, C[i], (A[i]+B[i])); return 1;}

        for ( int i = 0; i < NbStreams; i++ )
                gpuErrchk( cudaStreamDestroy(Stream[ i ]) );

        gpuErrchk( cudaFree(devA) );
        gpuErrchk( cudaFree(devB) );
        gpuErrchk( cudaFree(devC) );

        gpuErrchk( cudaFreeHost(A) );
        gpuErrchk( cudaFreeHost(B) );
        gpuErrchk( cudaFreeHost(C) );

        gpuErrchk( cudaEventDestroy(EventPre) );
        gpuErrchk( cudaEventDestroy(EventPost) );

        printf("\n");

        return 0;

}
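
For example, assuming the file is saved as vecadd.cu (the filename is just a placeholder), the two variants could be built and run roughly like this:

nvcc -O2 -o vecadd_streams vecadd.cu
./vecadd_streams

nvcc -O2 -DNOSTREAMS -o vecadd_nostreams vecadd.cu
./vecadd_nostreams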

OK, thank you.