Streams Problem

I am trying to use streams with the cufft_c2c_radix2 source to build a pipeline and improve performance for a very intensive application that requires 512-point FFTs to be computed on the nanosecond scale.

For some reason streams aren't working, and in fact they increase the computation time. Can anyone help with this?

Cheers

#include <stdio.h>
#include <cutil.h>
#include <cufft_c2c_radix2.cu>

#define VALUES

#define Check( condition ) {if( (condition) != 0 ) { printf( "\n FAILURE in %s, line %d\n", __FILE__, __LINE__ );}}

__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    for(int i = 0; i < num_iterations; i++)
        g_data[idx] += *factor;    // non-coalesced on purpose, to burn time
}

int correct_data(int *a, const int n, const int c)
{
    for(int i = 0; i < n; i++)
        if(a[i] != c)
        {
            printf("%d: %d %d\n", i, a[i], c);
            return 0;
        }
    return 1;
}

int main(int argc, char *argv[])
{
    int cuda_device = 0;
    const int nstreams = 2;         // number of streams for CUDA calls
    int nreps = 10;
    int Runtimes = 1;               // number of times each experiment is repeated
    int n = 512;                    // number of float2s in the data set
    int batch = 32 * 1024;
    dim3 threads, blocks;           // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel;   // timing variables
    int WorkingMemoryBlock = (n * batch / nstreams);
    int TotalMemoryBlock = (n * batch);
    int niterations;                // number of iterations for the loop inside the kernel
    int MemoryBlocks = 2;

    if( argc > 1 )
        cuda_device = atoi( argv[1] );

    // check the compute capability of the device
    int num_devices = 0;
    CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
    if(0 == num_devices)
    {
        printf("your system does not have a CUDA capable device\n");
        return 1;
    }

    // check if the command-line chosen device ID is within range, exit if not
    if( cuda_device >= num_devices )
    {
        printf("choose device ID between 0 and %d\n", num_devices-1);
        return 1;
    }

    cudaSetDevice( cuda_device );

    cudaDeviceProp device_properties;
    CUDA_SAFE_CALL( cudaGetDeviceProperties(&device_properties, cuda_device) );
    if( (1 == device_properties.major) && (device_properties.minor < 1))
        printf("%s does not have compute capability 1.1 or later\n\n", device_properties.name);

    if(device_properties.minor > 1)
        niterations = 5;
    else
        niterations = 1;    // reduced workload for compute capability 1.0 and 1.1

    printf("running on: %s\n\n", device_properties.name );

    // allocate host memory (pinned is required to achieve asynchronicity)
    cData *out;
    CUDA_SAFE_CALL( cudaMallocHost((void**)&out, TotalMemoryBlock*sizeof(float2)) );
    cData *in = 0;
    CUDA_SAFE_CALL( cudaMallocHost((void**)&in, TotalMemoryBlock*sizeof(float2)) );

    // allocate device memory
    cData *d_in = 0;             // pointer to the data in device memory
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_in, TotalMemoryBlock*sizeof(float2)) );
    //CUDA_SAFE_CALL( cudaMalloc((void**)&d_out, TotalMemoryBlock*sizeof(float2) ));

    for(int i = 0; i < n; i++)
    {
        in[i].x = 0;
        in[i].y = 0;
        out[i].x = 6;
        out[i].y = 0;
    }
    in[0].x = 8;
    in[513].y = 8;

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
    for(int i = 0; i <= nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );

    // create CUDA event handles
    cudaEvent_t start_event, stop_event;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );

    // stride information
    cufftStride_st Strides;
    Strides.ibStride = n;
    Strides.ieStride = 1;
    Strides.obStride = n;
    Strides.oeStride = 1;

    threads = dim3(n/2, 1);
    blocks  = dim3((batch/nstreams), 1);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads = dim3(n/2, 1);
    blocks  = dim3(batch, 1);
    cudaEventRecord(start_event, 0);
    for(int k = 0; k < nreps; k++)
    {
        Check( cudaMemcpy(d_in, in, TotalMemoryBlock * sizeof(float2), cudaMemcpyHostToDevice) );
        cufft_c2c_radix2 <<< blocks, threads, n*sizeof(float2) >>> (n, (float)TP/n, 9, d_in, d_in, -1, Strides);
        Check( cudaMemcpy(out, d_in, TotalMemoryBlock * sizeof(float2), cudaMemcpyDeviceToHost) );
        //cudaStreamSynchronize(Stream[0]);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
    printf("Per FFT Full non-streamed:\t%.2f \n", elapsed_time / batch / nreps * 1000);

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads = dim3(n/2, 1);
    blocks  = dim3((batch/nstreams), 1);

    for(int i = 0; i < n; i++)
    {
        in[i].x = 0;
        in[i].y = 0;
        out[i].x = 6;
        out[i].y = 0;
    }
    in[0].x = 8;
    in[513].y = 8;

    //Check( cudaMemcpy(d_out, in, WorkingMemoryBlock*nstreams*sizeof(float2), cudaMemcpyHostToDevice) );
    //cudaStreamSynchronize( streams[0] );
    cudaEventRecord(start_event, 0);
    //Check( cudaMemcpy(d_in, in, WorkingMemoryBlock*nstreams*sizeof(float2), cudaMemcpyHostToDevice) );
    for(int k = 0; k < nreps; k++)
    {
        for(int i = 0; i < nstreams; i++)
            Check( cudaMemcpyAsync(d_in + (i * WorkingMemoryBlock), in + (i * WorkingMemoryBlock), WorkingMemoryBlock*sizeof(float2), cudaMemcpyHostToDevice, i) );
        //cudaStreamSynchronize( streams[i] );

        for(int i = 0; i < nstreams; i++)
            cufft_c2c_radix2 <<< blocks, threads, n*sizeof(float2), i >>> (n, TP/n, 9, d_in + (i * WorkingMemoryBlock), d_in + (i * WorkingMemoryBlock), -1, Strides);
        //cudaStreamSynchronize( streams[i] );

        for(int i = 0; i < nstreams; i++)
            Check( cudaMemcpyAsync(out + (i * WorkingMemoryBlock), d_in + (i * WorkingMemoryBlock), WorkingMemoryBlock*sizeof(float2), cudaMemcpyDeviceToHost, i) );

        cudaStreamSynchronize( streams[0] );
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
    printf("Per FFT %d streams:\t%.2f \n", nstreams, elapsed_time / batch / nreps * 1000);

    // check whether the output is correct
#ifdef VALUES
    printf("-------------------------------\n");
    printf("[%5.0d]: %2.1f \n", 511, out[511].x);
    printf("[%5.0d]: %2.1f \n", 1023, out[1023].y);
    printf("-------------------------------\n");
#endif

    // release resources
    for(int i = 0; i < nstreams; i++)
        cudaStreamDestroy(streams[i]);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(out);
    cudaFreeHost(in);
    //cudaFree(d_out);
    cudaFree(d_in);

    CUT_EXIT(argc, argv);
    return 0;
}

107 views, and no replies?

It has been suggested that I check whether the application is bandwidth limited. How do I do this?

Is the problem in fact a bug?

I would really appreciate some input.

Looks like you’re using “i” instead of “streams[i]” in the kernel launches and async memcpy calls. You might also consider looping over all the calls for streams[i], instead of looping over all streams for the first op, then another loop for the next op, and so on. Also double-check all your kernel calls: in some of my code I had inadvertently dropped a zero placeholder for the shared-memory argument, so all the streams were defaulting to stream 0.
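For example, the streamed section of your listing would look something like the sketch below. This is just a rough outline reusing your own names (d_in, in, out, WorkingMemoryBlock, Strides, blocks, threads) and assuming TP and the cufft_c2c_radix2 signature come from cufft_c2c_radix2.cu as in your post; I haven’t compiled it:

    for(int k = 0; k < nreps; k++)
    {
        for(int i = 0; i < nstreams; i++)
        {
            // pass the stream handle, not the loop index
            cudaMemcpyAsync(d_in + i * WorkingMemoryBlock, in + i * WorkingMemoryBlock,
                            WorkingMemoryBlock * sizeof(float2), cudaMemcpyHostToDevice, streams[i]);

            // 3rd launch parameter is the shared-mem size, 4th is the stream
            cufft_c2c_radix2 <<< blocks, threads, n * sizeof(float2), streams[i] >>>
                (n, (float)TP / n, 9, d_in + i * WorkingMemoryBlock,
                 d_in + i * WorkingMemoryBlock, -1, Strides);

            cudaMemcpyAsync(out + i * WorkingMemoryBlock, d_in + i * WorkingMemoryBlock,
                            WorkingMemoryBlock * sizeof(float2), cudaMemcpyDeviceToHost, streams[i]);
        }

        // wait on every stream, not just streams[0]
        for(int i = 0; i < nstreams; i++)
            cudaStreamSynchronize(streams[i]);
    }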

One thing that seems strange in the streams model is that by default a “0” or default stream is created, independent of the other streams created. It almost seems that the proper thing to do is:

streams[0] = 0;
for( int i = 1; i < nstreams; i++ )
    cudaStreamCreate(&(streams[i]));

I’ve just started using streams myself. The CUDA 2.1 beta Visual Profiler can also report the stream each kernel is executed on. You can ask for the info, but the profiler GUI won’t display it; it is in the project_run.csv file, which readily imports into oocalc. Very handy.