CUFFT_INVALID_TYPE no longer in use?

Hi all,

I was running a test of multi-GPU cuFFT, and something I cannot understand is this: a single 1D C2C transform works fine, but if I then execute another transform (i.e., call cufftXtExecDescriptorC2C(plan, d_idata, d_idata, CUFFT_FORWARD) a second time), the return value is 3, which corresponds to CUFFT_INVALID_TYPE. Yet the cuFFT reference says that this status code is no longer used. Is this a bug, or am I missing something?

Code for reference is below. I'm running CUDA 12.0 with 4x RTX 3060. Everything works if cufftXtExecDescriptorC2C(plan, d_idata, d_idata, CUFFT_FORWARD) is called just once; calling it a second time produces the error.

#include "cuda_runtime.h"
#include "cufft.h"
#include "cufftXt.h"
#include "curand_kernel.h"

#include <iostream>
#include <string>

const int Nx = 1024;

// For cuFFT error checking: CHECKFFT wraps a cuFFT call and reports the
// numeric status code plus the call site, then aborts.
// NOTE: the predefined macros must be spelled __FILE__ / __LINE__ (double
// underscores); the bare identifiers FILE and LINE do not exist and fail
// to compile (the underscores were stripped by forum markdown).
#define CHECKFFT(x) check((x), __FILE__, __LINE__)

// Prints the failing cuFFT status, line and file, and exits with code 3.
inline void check(cufftResult err, std::string const file, int const line) {
    if (err != CUFFT_SUCCESS) {
        std::cerr << "Error: " << err << ", line: " << line << " in " << file << std::endl;
        exit(3);
    }
}

// Kernel: fills this device's slice of the distributed array with one period
// of a sine wave across the global array. Each device owns a contiguous run
// of local_Nx elements; devIdx/devDim identify which slice `A` points at.
// Grid-stride-style loop, so any launch configuration covers the slice.
// NOTE: `__global__` restored (the underscores were stripped by forum
// markdown), and the broken `/ ... /` comment delimiters restored to /* ... */.
__global__ void initialize(cufftComplex* A, int devIdx, int devDim)
{
    int t_tot = blockDim.x * gridDim.x;       // total threads launched
    int local_Nx = (Nx - 1) / devDim + 1;     // ceil(Nx / devDim) elements per device
    int offset = devIdx * local_Nx;           // global index of this slice's first element
    float factor = 1.0f / Nx;                 // float literal: keep the kernel in single precision
    for (int k = 0; k < (local_Nx - 1) / t_tot + 1; k++)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x + k * t_tot;
        if (index < local_Nx)
        {
            // Random initialization (disabled):
            /*
            int seed = threadIdx.x;
            curandState s;
            curand_init(seed, 0, 0, &s);
            A[index].x = curand_uniform(&s);
            A[index].y = 0.0f;
            */

            // Sine initialization: one full period over the global Nx samples.
            // (Pi literal given the `f` suffix so the math stays in float.)
            A[index].x = sinf(2.0f * 3.14159265f * (index + offset) * factor);
            A[index].y = 0.0f;
        }
    }
}

// Kernel: scales this device's slice by 1/Nx so a forward+inverse transform
// round-trips back to the original data (cuFFT transforms are unnormalized).
// devIdx is unused but kept so the launch signature matches `initialize`.
// NOTE: `__global__` restored (underscores stripped by forum markdown).
__global__ void normalize(cufftComplex* A, int devIdx, int devDim)
{
    int t_tot = blockDim.x * gridDim.x;
    int local_Nx = (Nx - 1) / devDim + 1;   // ceil(Nx / devDim), same as initialize
    float factor = 1.0f / Nx;               // float, not double: avoid per-element double math on a float payload
    for (int k = 0; k < (local_Nx - 1) / t_tot + 1; k++)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x + k * t_tot;
        if (index < local_Nx)
        {
            // BUG FIX: scale BOTH components. The original scaled only .x,
            // leaving the imaginary part un-normalized after the round trip.
            A[index].x *= factor;
            A[index].y *= factor;
        }
    }
}

// Runs a forward + inverse multi-GPU 1D C2C FFT over h_idata (length Nx).
// h_odata receives the forward-transform result in natural order; h_idata is
// overwritten with the round-tripped (still Nx-scaled) data on return.
// devDim: number of GPUs to spread the plan over. cufftXtSetGPUs needs at
// least two entries, so a single-GPU system is handled by listing device 0
// twice.
void multi_gpu_cufft(cufftComplex * h_idata, cufftComplex * h_odata, int devDim)
{
    int* deviceList;
    if (devDim == 1)
    {
        // Multi-GPU plans require >= 2 devices in the list; reuse device 0.
        devDim = 2;
        deviceList = new int[2];
        deviceList[0] = 0;
        deviceList[1] = 0;
    }
    else
    {
        // Use devices 0..devDim-1 and print their properties.
        deviceList = new int[devDim];
        for (int i = 0; i < devDim; i++)
        {
            deviceList[i] = i;

            cudaSetDevice(deviceList[i]);

            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, deviceList[i]);
            printf("  Device name: %s\n", prop.name);
            printf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
            printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
            printf("  Peak Memory Bandwidth (GB/s): %f\n\n", 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
        }
    }

    // Create an empty extensible plan (required for multi-GPU transforms).
    cufftHandle plan;
    CHECKFFT(cufftCreate(&plan));

    // Tell cuFFT which GPUs to use.
    CHECKFFT(cufftXtSetGPUs(plan, devDim, deviceList));

    // Per-GPU workspace sizes reported by plan creation.
    size_t* worksize = new size_t[devDim];

    // 1D multi-GPU transforms support C2C only.
    CHECKFFT(cufftMakePlan1d(plan, Nx, CUFFT_C2C, 1, worksize));

    // Descriptor for the distributed data; multi-GPU supports in-place only.
    cudaLibXtDesc* d_idata;
    CHECKFFT(cufftXtMalloc(plan, &d_idata, CUFFT_XT_FORMAT_INPLACE));

    // Scatter the host data across the GPUs.
    CHECKFFT(cufftXtMemcpy(plan, d_idata, h_idata, CUFFT_COPY_HOST_TO_DEVICE));

    // Forward transform (in place). After this call the device-resident data
    // is in the library's permuted ("shuffled") layout.
    CHECKFFT(cufftXtExecDescriptorC2C(plan, d_idata, d_idata, CUFFT_FORWARD));

    // Gather the forward result to the host; cuFFT reorders the shuffled
    // device data back to natural order during this copy.
    CHECKFFT(cufftXtMemcpy(plan, h_odata, d_idata, CUFFT_COPY_DEVICE_TO_HOST));

    // Inverse transform (in place). NOTE(review): a second exec on the same
    // descriptor has been reported to return status 3 on some cuFFT releases;
    // per the documentation the inverse pass should accept the shuffled data
    // produced by the forward pass -- verify against your cuFFT version.
    CHECKFFT(cufftXtExecDescriptorC2C(plan, d_idata, d_idata, CUFFT_INVERSE));

    // Gather the round-tripped data back to the host.
    CHECKFFT(cufftXtMemcpy(plan, h_idata, d_idata, CUFFT_COPY_DEVICE_TO_HOST));

    // Release device descriptor, plan, and host-side bookkeeping.
    // BUG FIX: the original leaked `worksize` and ignored cufftXtFree's status.
    CHECKFFT(cufftXtFree(d_idata));
    cufftDestroy(plan);
    delete[] worksize;
    delete[] deviceList;
}

// Driver: initialize a sine wave across all GPUs, run the multi-GPU FFT
// round trip, then normalize by 1/Nx on each device.
int main()
{
    int n;
    cudaGetDeviceCount(&n);
    std::cout << "Number of device: " << n << std::endl;

    // Managed allocations are addressable from the host and every device.
    cufftComplex * h_idata, * h_odata;
    cudaMallocManaged(&h_idata, Nx * sizeof(cufftComplex));
    cudaMallocManaged(&h_odata, Nx * sizeof(cufftComplex));

    // Each device initializes its own contiguous slice of the input.
    // NOTE(review): the host offset i*Nx/n (floor division) only matches the
    // kernels' ceil-based local_Nx when n divides Nx evenly -- true here
    // (Nx = 1024, power-of-two GPU counts), but verify before changing Nx.
    for (int i = 0; i < n; i++)
    {
        cudaSetDevice(i);
        initialize << < 16, 256 >> > (&h_idata[i * Nx / n], i, n);
    }

    // Wait for every device before cuFFT touches the data.
    for (int i = 0; i < n; i++)
    {
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }

    // Multi-GPU forward + inverse FFT round trip.
    multi_gpu_cufft(h_idata, h_odata, n);

    // Undo the Nx scaling introduced by forward + inverse.
    for (int i = 0; i < n; i++)
    {
        cudaSetDevice(i);
        normalize << < 16, 256 >> > (&h_idata[i * Nx / n], i, n);
    }

    // BUG FIX: the original freed the managed buffers right after launching
    // the (asynchronous) normalize kernels; synchronize every device first so
    // no kernel is still writing when the memory is released.
    for (int i = 0; i < n; i++)
    {
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }

    cudaFree(h_idata);
    cudaFree(h_odata);

    // Surface any error accumulated by the asynchronous calls above.
    cudaError err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    return 0;
}