Jetson Nano cuFFT and streams

Dear all,
I’m having a hard time time to compute an FFT with cuFFT in separated CPU threads. I’ve searched all over the internet but most of the examples do not cover the Nano architecture.
What I’ve tried was to use separate streams and associate the fft plan to the corresponding stream. Each CPU thread uses the is own FFT plan to do is own calculations
I think I’m almost there because some times it works and sometimes not… It is quite strange… Sometimes the calculations are done as I expect and on other times I have the following error “Segmentation fault (core dumped)”.

Here it is “simple code example” showing how I am doing things.
Thank you In advance !
Note: I’m relatively a Cuda rookie 😊


#include <stdio.h>
#include <stdlib.h>
#include
#include <time.h>
#include
#include
#include
#include
#include
#include <unistd.h>

#include <cuda.h>
#include <cufft.h>
#include <cufftw.h>
#include <cuda_runtime_api.h>

#define DATASIZE 8192
#define BATCH 8

#define gpuErrchk(ans) { gpuAssert((ans), FILE, LINE); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
//fprintf(stderr,“GPUassert: %s %s %dn”, cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

using namespace std;

void ProcessCh0();
void ProcessCh1();

int main(int argc, char **argv)
{
//cudaDeviceSynchronize();

thread thProcessCh0(ProcessCh0);
thread thProcessCh1(ProcessCh1);

thProcessCh0.join();
thProcessCh1.join();

cout << "Cuda FFT tests: End.\n";
return 0;

}

void ProcessCh0()
{
cufftReal *deviceInputData0;
cufftComplex *deviceOutputData0;
cufftHandle handle0;
cudaStream_t stream0;

cufftReal *ch0;
float dt0;
int f0;
int size = DATASIZE;
ch0 = (cufftReal*)malloc(size*sizeof(cufftReal));
f0 = 100;
dt0 = 1/(cufftReal) f0;

for(int i = 0; i < size; i++)
{
	ch0[i] = sin(dt0 * i);
}

gpuErrchk(cudaMallocManaged((void**)&deviceInputData0, size * BATCH * sizeof(cufftReal)));
gpuErrchk(cudaMallocManaged((void**)&deviceOutputData0, (size / 2 + 1) * BATCH * sizeof(cufftComplex)));
gpuErrchk(cudaStreamCreate(&stream0));
cufftPlan1d(&handle0, size, CUFFT_R2C, BATCH);
cufftSetStream(handle0, stream0);
cudaStreamAttachMemAsync(stream0, deviceOutputData0);

while(true)
{
	gpuErrchk(cudaMemcpyAsync(deviceInputData0, ch0, size * BATCH * sizeof(cufftReal), cudaMemcpyHostToDevice, stream0));
	cufftExecR2C(handle0, deviceInputData0, deviceOutputData0);
	cudaDeviceSynchronize();
	cout << "ProcessCh0-> deviceOutputData0[10].x:  " << deviceOutputData0[10].x << endl;
	cout << "ProcessCh0-> deviceOutputData0[25].y:  " << deviceOutputData0[25].y << endl;
	usleep(500000);
}

free(ch0);
cudaStreamDestroy(stream0);
cufftDestroy(handle0);
gpuErrchk(cudaFree(deviceOutputData0));
gpuErrchk(cudaFree(deviceInputData0));

}

void ProcessCh1()
{
cufftReal *deviceInputData1;
cufftComplex *deviceOutputData1;
cufftHandle handle1;
cudaStream_t stream1;

cufftReal *ch1;
float dt1;
int f1;
int size = DATASIZE;
ch1 = (cufftReal*)malloc(size*sizeof(cufftReal));
f1 = 200;
dt1 = 1/(cufftReal) f1;

for(int i = 0; i < size; i++)
{
	ch1[i] = sin(dt1 * i);
}

gpuErrchk(cudaMallocManaged((void**)&deviceInputData1, size * BATCH * sizeof(cufftReal)));
gpuErrchk(cudaMallocManaged((void**)&deviceOutputData1, (size / 2 +	1) * BATCH * sizeof(cufftComplex)));
gpuErrchk(cudaStreamCreate(&stream1));
cufftPlan1d(&handle1, size, CUFFT_R2C, BATCH);
cufftSetStream(handle1, stream1);
cudaStreamAttachMemAsync(stream1, deviceOutputData1);

while(true)
{
	gpuErrchk(cudaMemcpyAsync(deviceInputData1, ch1, size * BATCH * sizeof(cufftReal), cudaMemcpyHostToDevice, stream1));
	cufftExecR2C(handle1, deviceInputData1, deviceOutputData1);
	cudaDeviceSynchronize();
	cout << "ProcessCh1-> deviceOutputData1[10].x: " << deviceOutputData1[10].x << endl;
	cout << "ProcessCh1-> deviceOutputData1[25].y: " << deviceOutputData1[25].y << endl;
	usleep(500000);
}

free(ch1);
cudaStreamDestroy(stream1);
cufftDestroy(handle1);
gpuErrchk(cudaFree(deviceOutputData1));
gpuErrchk(cudaFree(deviceInputData1));

}


Hi,

Instead of cudaDeviceSynchronize, would you mind to use stream synchronize first?
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g82b5784f674c17c6df64affe618bf45e

Thanks.

1 Like

Hello AastaLLL,

Thank you, it was simple as that !

Best regards.