While implementing some algorithms that use both the cuFFT library and CUDA streams, I encountered very strange behaviour.
I found out that, for particular transform sizes, the cuFFT routines were executed in the default stream even though they had been set to execute in other streams.
Below is an example of code that reproduces it.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cufft.h"
#include <helper_cuda.h>
#include <stdio.h>
#include <cstdlib>
// Number of complex points in the 1D transform (also the element count of the
// host and device buffers). 2049 is not a power of two; try changing it to
// 2048 (a power of two) and compare which stream the cuFFT work runs in.
#define CUFFT_TRANSFORM_SIZE 2049
/*
 * Minimal reproduction program: performs a single in-place forward
 * complex-to-complex 1D cuFFT transform of CUFFT_TRANSFORM_SIZE points,
 * with the plan bound to a user-created (non-default) stream.
 *
 * Every CUDA runtime / cuFFT call is wrapped in checkCudaErrors
 * (presumably the CUDA samples helper from helper_cuda.h, which reports
 * the failure and terminates the program).
 *
 * Returns 0 on success, EXIT_FAILURE if the host buffer cannot be allocated.
 */
int main()
{
    const int dev = 0;
    checkCudaErrors(cudaSetDevice(dev));

    // Size in bytes of one buffer holding CUFFT_TRANSFORM_SIZE complex samples.
    const size_t kernelSize = sizeof(cuComplex) * CUFFT_TRANSFORM_SIZE;

    cuComplex* X = (cuComplex*) malloc(kernelSize);
    if (X == NULL) {
        // Without this check the fill loop below would dereference NULL.
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long) kernelSize);
        return EXIT_FAILURE;
    }

    // Fill the host array with a deterministic, non-trivial pattern.
    for (int i = 0; i < CUFFT_TRANSFORM_SIZE; i++) {
        X[i].x = (float) (i % 2413);
        X[i].y = (float) (i % 1234);
    }

    // Copy the input to the device (blocking copy).
    cuComplex* X_dev;
    checkCudaErrors(cudaMalloc(&X_dev, kernelSize));
    checkCudaErrors(cudaMemcpy(X_dev, X, kernelSize, cudaMemcpyHostToDevice));

    // Create the stream the transform is supposed to run in.
    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreate(&stream));

    // Single (batch = 1) C2C plan, associated with the stream before execution.
    cufftHandle plan;
    checkCudaErrors(cufftPlan1d(&plan, CUFFT_TRANSFORM_SIZE, CUFFT_C2C, 1));
    checkCudaErrors(cufftSetStream(plan, stream));

    // In-place forward transform: X_dev is both input and output.
    checkCudaErrors(cufftExecC2C(plan, X_dev, X_dev, CUFFT_FORWARD));

    // Wait for all outstanding device work before tearing anything down.
    checkCudaErrors(cudaDeviceSynchronize());

    // Release resources: plan first, then the stream, then the buffers.
    checkCudaErrors(cufftDestroy(plan));
    checkCudaErrors(cudaStreamDestroy(stream));
    checkCudaErrors(cudaFree(X_dev));
    free(X);
    return 0;
}
The problem is that when CUFFT_TRANSFORM_SIZE is not a power of 2 (e.g. 2049), all cuFFT functions are executed in the default stream, which destroys concurrent execution in the other streams. But when CUFFT_TRANSFORM_SIZE is a power of 2, the cuFFT functions are executed in the right stream (other than the default one).
Below are screenshots from Nsight :)
Please help me, as I have run out of ideas about what I am doing wrong :)
BTW: I am using CUDA 5.5 on Windows 7 with a GTX 260. I also tested on a Tesla K20 with Windows Server 2008 and CUDA 5.5.