Problems with streams

Hi there,

could someone tell what is wrong with my simple stream application. Here is the complete source code:

[codebox]#include <stdio.h>

#include <tchar.h>

#include

#include

#include

#include

#include

#include

#include <cuda_runtime.h>

typedef float float_t;

using namespace std;

global void add( const float_t* a

               , const float_t* b

               , float_t*       c

               , const int      n

               )

{

int tid = threadIdx.x + blockIdx.x * blockDim.x;

while( tid < n )

{

    c[tid] = a[tid] + b[tid];

tid += blockDim.x;

}

}

inline void check_for_error( cudaError_t err )

{

if( err != cudaSuccess )

{

    const char* error_msg = cudaGetErrorString( err );

    std::cout << error_msg << std::endl;

exit( 1 );

}

}

void run_test( float_t* a

         , float_t* b

         , float_t* c

         , float_t* dev_a

         , float_t* dev_b

         , float_t* dev_c

         , int      n

         , cudaStream_t streams[2]

         )

{

int stride = n / 2;

cudaEvent_t start_event, stop_event;

check_for_error( cudaEventCreateWithFlags( &start_event, cudaEventDefault ));

check_for_error( cudaEventCreateWithFlags( &stop_event , cudaEventDefault ));

cudaEventRecord(start_event, 0);

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride, cudaMemcpyHostToDevice, streams[i] ));

    check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride, cudaMemcpyHostToDevice, streams[i] ));

add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );

    cudaThreadSynchronize();

check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride, cudaMemcpyDeviceToHost, streams[i] ));

}

cudaEventRecord(stop_event, 0);

cudaEventSynchronize(stop_event);

cudaEventDestroy(start_event);

cudaEventDestroy(stop_event);

}

float_t get()

{

return float_t( rand() ) / RAND_MAX;

}

int main(int argc, char* argv)

{

int n = 10;

float_t *a, *b, *c;

check_for_error( cudaHostAlloc( &a, n * sizeof( float_t ), cudaHostAllocDefault ));

check_for_error( cudaHostAlloc( &b, n * sizeof( float_t ), cudaHostAllocDefault ));

check_for_error( cudaHostAlloc( &c, n * sizeof( float_t ), cudaHostAllocDefault ));

generate_n( a, n, get );

generate_n( b, n, get );

float_t *dev_a, *dev_b, *dev_c;

check_for_error( cudaMalloc( &dev_a, n * sizeof( float_t )));

check_for_error( cudaMalloc( &dev_b, n * sizeof( float_t )));

check_for_error( cudaMalloc( &dev_c, n * sizeof( float_t )));

cudaStream_t streams[2];

check_for_error( cudaStreamCreate( &streams[0] ));

check_for_error( cudaStreamCreate( &streams[1] ));

run_test( a, b, c, dev_a, dev_b, dev_c, n, streams );

cudaStreamDestroy( streams[0] );

cudaStreamDestroy( streams[1] );

check_for_error( cudaFree( dev_a ));

check_for_error( cudaFree( dev_b ));

check_for_error( cudaFree( dev_c ));

check_for_error( cudaFreeHost( a ));

check_for_error( cudaFreeHost( b ));

check_for_error( cudaFreeHost( c ));

}

[/codebox]

I have two streams but for some reason only the first element of each chunk is computed.

Anyone can spot what’s wrong?

Thanks,

Christian

Here is small correction to my run_test():

[codebox]void run_test( float_t* a

         , float_t* b

         , float_t* c

         , float_t* dev_a

         , float_t* dev_b

         , float_t* dev_c

         , int      n

         , cudaStream_t streams[2]

         )

{

int stride = n / 2;

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

    check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

}

for( int i = 0 ; i < 2; ++i )

{

    add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );

}

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride * sizeof(float_t), cudaMemcpyDeviceToHost, streams[i] ));

}

cudaThreadSynchronize();

}[/codebox]

Here is small correction to my run_test():

[codebox]void run_test( float_t* a

         , float_t* b

         , float_t* c

         , float_t* dev_a

         , float_t* dev_b

         , float_t* dev_c

         , int      n

         , cudaStream_t streams[2]

         )

{

int stride = n / 2;

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

    check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride * sizeof(float_t), cudaMemcpyHostToDevice, streams[i] ));

}

for( int i = 0 ; i < 2; ++i )

{

    add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );

}

for( int i = 0 ; i < 2; ++i )

{

    check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride * sizeof(float_t), cudaMemcpyDeviceToHost, streams[i] ));

}

cudaThreadSynchronize();

}[/codebox]

The app doesn’t work for n=10 but it works fine for n=1024. Anyone any ideas why?

Thanks,
Christian

The app doesn’t work for n=10 but it works fine for n=1024. Anyone any ideas why?

Thanks,
Christian