Problems with streams

chhenning_air · September 30, 2010, 3:11pm

Hi there,

could someone tell what is wrong with my simple stream application. Here is the complete source code:

[codebox]#include <stdio.h>

#include <tchar.h>

#include

#include <cuda_runtime.h>

typedef float float_t;

using namespace std;

global void add( const float_t* a

               , const float_t* b

               , float_t*       c

               , const int      n

               )

{

int tid = threadIdx.x + blockIdx.x * blockDim.x;

while( tid < n )

{

    c[tid] = a[tid] + b[tid];

tid += blockDim.x;

}

inline void check_for_error( cudaError_t err )

{

if( err != cudaSuccess )

{

    const char* error_msg = cudaGetErrorString( err );

    std::cout << error_msg << std::endl;

exit( 1 );

}

void run_test( float_t* a

         , float_t* b

         , float_t* c

         , float_t* dev_a

         , float_t* dev_b

         , float_t* dev_c

         , int      n

         , cudaStream_t streams[2]

         )

{

int stride = n / 2;

cudaEvent_t start_event, stop_event;

check_for_error( cudaEventCreateWithFlags( &start_event, cudaEventDefault ));

check_for_error( cudaEventCreateWithFlags( &stop_event , cudaEventDefault ));

cudaEventRecord(start_event, 0);

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride, cudaMemcpyHostToDevice, streams[i] ));

    check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride, cudaMemcpyHostToDevice, streams[i] ));

add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );

    cudaThreadSynchronize();

check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride, cudaMemcpyDeviceToHost, streams[i] ));

cudaEventRecord(stop_event, 0);

cudaEventSynchronize(stop_event);

cudaEventDestroy(start_event);

cudaEventDestroy(stop_event);

}

float_t get()

{

return float_t( rand() ) / RAND_MAX;

}

int main(int argc, char* argv)

{

int n = 10;

float_t *a, *b, *c;

check_for_error( cudaHostAlloc( &a, n * sizeof( float_t ), cudaHostAllocDefault ));

check_for_error( cudaHostAlloc( &b, n * sizeof( float_t ), cudaHostAllocDefault ));

check_for_error( cudaHostAlloc( &c, n * sizeof( float_t ), cudaHostAllocDefault ));

generate_n( a, n, get );

generate_n( b, n, get );

float_t *dev_a, *dev_b, *dev_c;

check_for_error( cudaMalloc( &dev_a, n * sizeof( float_t )));

check_for_error( cudaMalloc( &dev_b, n * sizeof( float_t )));

check_for_error( cudaMalloc( &dev_c, n * sizeof( float_t )));

cudaStream_t streams[2];

check_for_error( cudaStreamCreate( &streams[0] ));

check_for_error( cudaStreamCreate( &streams[1] ));

run_test( a, b, c, dev_a, dev_b, dev_c, n, streams );

cudaStreamDestroy( streams[0] );

cudaStreamDestroy( streams[1] );

check_for_error( cudaFree( dev_a ));

check_for_error( cudaFree( dev_b ));

check_for_error( cudaFree( dev_c ));

check_for_error( cudaFreeHost( a ));

check_for_error( cudaFreeHost( b ));

check_for_error( cudaFreeHost( c ));

}

[/codebox]

I have two streams but for some reason only the first element of each chunk is computed.

Anyone can spot what’s wrong?

Thanks,

Christian

chhenning_air · September 30, 2010, 3:28pm

Here is small correction to my run_test():

[codebox]void run_test( float_t* a

         , float_t* b

         , float_t* c

         , float_t* dev_a

         , float_t* dev_b

         , float_t* dev_c

         , int      n

         , cudaStream_t streams[2]

         )

{

int stride = n / 2;

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

    check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

}

for( int i = 0 ; i < 2; ++i )

{

    add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );

}

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride * sizeof(float_t), cudaMemcpyDeviceToHost, streams[i] ));

}

cudaThreadSynchronize();

}[/codebox]

chhenning_air · September 30, 2010, 3:28pm

Here is small correction to my run_test():

[codebox]void run_test( float_t* a

         , float_t* b

         , float_t* c

         , float_t* dev_a

         , float_t* dev_b

         , float_t* dev_c

         , int      n

         , cudaStream_t streams[2]

         )

{

int stride = n / 2;

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

    check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

}

for( int i = 0 ; i < 2; ++i )

{

    add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );

}

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride * sizeof(float_t), cudaMemcpyDeviceToHost, streams[i] ));

}

cudaThreadSynchronize();

}[/codebox]

chhenning_air · September 30, 2010, 3:32pm

The app doesn’t work for n=10 but it works fine for n=1024. Anyone any ideas why?

Thanks,
Christian

chhenning_air · September 30, 2010, 3:32pm

The app doesn’t work for n=10 but it works fine for n=1024. Anyone any ideas why?

Thanks,
Christian

Topic		Replies	Views
simple vector add with streams CUDA Programming and Performance	6	2463	May 1, 2015
Streams Problem CUDA Programming and Performance	2	4710	December 7, 2008
Problem using streams Can't get more than one stream to work CUDA Programming and Performance	3	4717	October 8, 2008
Cuda streams CUDA Programming and Performance	1	6632	February 11, 2011
Questions on Stream Management CUDA Programming and Performance	14	19593	September 28, 2008
Problem using cuda streams CUDA Programming and Performance	13	2110	March 4, 2015
Questions on Streams CUDA Programming and Performance	5	2206	July 16, 2008
cuda samples related to cuda::stream does not work CUDA Programming and Performance	0	373	October 17, 2019
a problem about stream (concurrent copy and execute) CUDA Programming and Performance	0	3418	August 10, 2010
Why does my streaming vector add fails? CUDA Programming and Performance	2	2780	August 26, 2011

Problems with streams

Related topics