Hi there,
could someone tell what is wrong with my simple stream application. Here is the complete source code:
[codebox]#include <stdio.h>
#include <tchar.h>
#include <cuda_runtime.h>
typedef float float_t;
using namespace std;
global void add( const float_t* a
, const float_t* b
, float_t* c
, const int n
)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while( tid < n )
{
c[tid] = a[tid] + b[tid];
tid += blockDim.x;
}
}
inline void check_for_error( cudaError_t err )
{
if( err != cudaSuccess )
{
const char* error_msg = cudaGetErrorString( err );
std::cout << error_msg << std::endl;
exit( 1 );
}
}
void run_test( float_t* a
, float_t* b
, float_t* c
, float_t* dev_a
, float_t* dev_b
, float_t* dev_c
, int n
, cudaStream_t streams[2]
)
{
int stride = n / 2;
cudaEvent_t start_event, stop_event;
check_for_error( cudaEventCreateWithFlags( &start_event, cudaEventDefault ));
check_for_error( cudaEventCreateWithFlags( &stop_event , cudaEventDefault ));
cudaEventRecord(start_event, 0);
for( int i = 0 ; i < 2; ++i )
{
check_for_error( cudaMemcpyAsync( dev_a + i * stride, a + i * stride, stride, cudaMemcpyHostToDevice, streams[i] ));
check_for_error( cudaMemcpyAsync( dev_b + i * stride, b + i * stride, stride, cudaMemcpyHostToDevice, streams[i] ));
add<<< 1, stride, 0, streams[ i ] >>>( dev_a + i * stride, dev_b + i * stride, dev_c + i * stride, stride );
cudaThreadSynchronize();
check_for_error( cudaMemcpyAsync( c + i * stride, dev_c + i * stride, stride, cudaMemcpyDeviceToHost, streams[i] ));
}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
}
float_t get()
{
return float_t( rand() ) / RAND_MAX;
}
int main(int argc, char* argv)
{
int n = 10;
float_t *a, *b, *c;
check_for_error( cudaHostAlloc( &a, n * sizeof( float_t ), cudaHostAllocDefault ));
check_for_error( cudaHostAlloc( &b, n * sizeof( float_t ), cudaHostAllocDefault ));
check_for_error( cudaHostAlloc( &c, n * sizeof( float_t ), cudaHostAllocDefault ));
generate_n( a, n, get );
generate_n( b, n, get );
float_t *dev_a, *dev_b, *dev_c;
check_for_error( cudaMalloc( &dev_a, n * sizeof( float_t )));
check_for_error( cudaMalloc( &dev_b, n * sizeof( float_t )));
check_for_error( cudaMalloc( &dev_c, n * sizeof( float_t )));
cudaStream_t streams[2];
check_for_error( cudaStreamCreate( &streams[0] ));
check_for_error( cudaStreamCreate( &streams[1] ));
run_test( a, b, c, dev_a, dev_b, dev_c, n, streams );
cudaStreamDestroy( streams[0] );
cudaStreamDestroy( streams[1] );
check_for_error( cudaFree( dev_a ));
check_for_error( cudaFree( dev_b ));
check_for_error( cudaFree( dev_c ));
check_for_error( cudaFreeHost( a ));
check_for_error( cudaFreeHost( b ));
check_for_error( cudaFreeHost( c ));
}
[/codebox]
I have two streams but for some reason only the first element of each chunk is computed.
Anyone can spot what’s wrong?
Thanks,
Christian