Cannot get FFTShift working for cuFFT

I’ve been trying to create a version of a 2D FFT-Shift algorithm in for cuFFT but have so far been unsuccesful. I’m well aware of this thread and this library, but none of the code (and I’ve tried all of it for both of those links!) has worked for me so far. Even more interesting, when I replicate some of the code in Python, it works like a charm. Am I doing something wrong device synchronization? I’ve tried in place and out of place versions of the fftShift kernl, nothings worked thus far. I’m relatively new to CUDA but it’s a bit frustrating how none of the stuff I find online seems to work in my case. This is the code I have, with results at the bottom:

template <typename T>
__global__
void cufftShift_2D_kernel(T* data, int N)
{

	int sEq1 = (N * N + N) / 2;
	int sEq2 = (N * N - N) / 2;

	int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
	int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

	int index = (yIndex * N) + xIndex;

	T regTemp;

	if (xIndex < N / 2)
	{
		if (yIndex < N / 2)
		{
			regTemp = data[index];
			data[index] = data[index + sEq1];
			data[index + sEq1] = regTemp;
		}
	}
	else
	{
		if (yIndex < N / 2)
		{
			regTemp = data[index];
			data[index] = data[index + sEq2];
			data[index + sEq2] = regTemp;
		}
	}
}

__global__ void multiplyBuffers(cufftComplex *kernel, cufftComplex *field, int size) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idx < size) {
		cufftComplex x = kernel[idx];
		cufftComplex y = field[idx];

		field[idx].x = x.x * y.x - x.y * y.y;
		field[idx].y = x.x * y.y + x.y * y.x; 
	}
}


template <class T>
struct GPUBuffer {
	thrust::device_vector<T> buffer;
	T* p;
	std::size_t mem_size;

	GPUBuffer() = delete;

	GPUBuffer(std::size_t size) : 
		buffer(thrust::device_vector<T>(size)),
		p(thrust::raw_pointer_cast(buffer.data())),
		mem_size(buffer.size() * sizeof(T)) {}
};

int main()
{
	const int field_size = 32;

	std::vector<float> field = std::vector<float>(field_size * field_size, 0.f);

	const std::vector<float> kernel = getKernelShell();
	dump_array_to_file(kernel, scaled_r * 2, scaled_r * 2, "kernel.txt");

	std::vector<float> cells = getCells();
	dump_array_to_file(cells, m_info.m_w, m_info.m_h, "cells.txt");

	cells = upscale_array(cells, m_info.m_w, m_info.m_h, scale);
	dump_array_to_file(cells, m_info.m_w * scale, m_info.m_h * scale, "upscaled.txt");

	place_cells(field, cells, 0, 0, field_size, m_info.m_w * scale, m_info.m_h * scale);
	dump_array_to_file(field, field_size, field_size, "field.txt");

	std::vector<float> padded = pad_kernel(kernel, scaled_r * 2, field_size);
	dump_array_to_file(padded, field_size, field_size, "padded.txt");

	const int threadsPerBlock = 256;
	const int blocksPerGrid = (field.size() + threadsPerBlock - 1) / threadsPerBlock;

	cufftHandle normal;

	cufftPlan2d(&normal, field_size, field_size, CUFFT_C2C);

	GPUBuffer<cufftComplex> kernel_gpu(field.size());
	GPUBuffer<cufftComplex> field_gpu(field.size());
	GPUBuffer<cufftComplex> inv_gpu(field.size());
	GPUBuffer<cufftComplex> shift_gpu(field.size());

	std::vector<cufftComplex> host_output(field.size());

	std::vector<float> real_output(field.size());

	for (size_t i = 0; i < field.size(); i++)
	{
		field_gpu.buffer[i] = { field[i], 0.f};
		kernel_gpu.buffer[i] = { padded[i], 0.f};
	}

	cufftExecC2C(normal, kernel_gpu.p, kernel_gpu.p, CUFFT_FORWARD);

	cudaMemcpy(host_output.data(), kernel_gpu.p, kernel_gpu.mem_size, cudaMemcpyDeviceToHost);

	for (size_t i = 0; i < field.size(); ++i) {
		real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
	}

	dump_array_to_file(real_output, field_size, field_size, "fft_kernel_cuda.txt");

	for (int i = 0; i < 1; ++i) {
		cufftExecC2C(normal, field_gpu.p, field_gpu.p, CUFFT_FORWARD);
		multiplyBuffers<<<threadsPerBlock, blocksPerGrid>>>(kernel_gpu.p, field_gpu.p, field.size());
		cufftExecC2C(normal, field_gpu.p, inv_gpu.p, CUFFT_INVERSE);
		cudaMemcpy(shift_gpu.p, inv_gpu.p, inv_gpu.mem_size, cudaMemcpyDeviceToDevice);
		cufftShift_2D_kernel<<<threadsPerBlock, blocksPerGrid>>>(shift_gpu.p, field_size);
	}
	cudaMemcpy(host_output.data(), field_gpu.p, field_gpu.mem_size, cudaMemcpyDeviceToHost);

	for (size_t i = 0; i < field.size(); ++i) {
		real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
	}
	dump_array_to_file(real_output, field_size, field_size, "cuda_fft.txt");

	cudaMemcpy(host_output.data(), inv_gpu.p, inv_gpu.mem_size, cudaMemcpyDeviceToHost);

	for (size_t i = 0; i < field.size(); ++i) {
		real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
	}

	dump_array_to_file(real_output, field_size, field_size, "cuda_inv.txt");

	cudaMemcpy(host_output.data(), shift_gpu.p, shift_gpu.mem_size, cudaMemcpyDeviceToHost);

	for (size_t i = 0; i < field.size(); ++i) {
		real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
	}

	dump_array_to_file(real_output, field_size, field_size, "cuda_shifted.txt");

	cufftDestroy(normal);
	return 0;
}

And here’s the Python version of the same kernel, which works as expected:

def shift(array: NDArray):
    N = np.int32(np.sqrt(array.size))
    offsetA = (N * N + N) // 2
    offsetB = (N * N - N) // 2
    for (y, x), _ in np.ndenumerate(array.reshape((N, N))):
        idx = y * N + x
        if x < N // 2:
            if y < N // 2:
                array[idx], array[idx + offsetA] = array[idx + offsetA], array[idx]
        else:
            if y < N // 2:
                array[idx], array[idx + offsetB] = array[idx + offsetB], array[idx]
    return array.reshape((N, N))

Here’s the results:

This should probably be cufftShift_2D_kernel<<<blocksPerGrid, threadsPerBlock>>>(shift_gpu.p, field_size);

Additionally, your kernel expects a two-dimensional thread layout, but you only use 1-D block dimensions and / or grid dimensions.

Hi, I’ve swapped the order for both kernel function calls. It’s unfortunately still not quite correct:

What do you mean by “two-dimensional thread layout”? How does it make a difference if I convert the thread ID to one dimension anyway?

Also why wouldn’t that type of indexing work in the fftshift kernel, when it works in the other kernels?

If this code is launched with a 1D thread layout like in your case, yIndex will be 0 for all threads which does not seem to be intended.

There are multiple ways to approach the correct indexing.

A 2D kernel launch which uses 1 thread per pixel in the image could look like this:

dim3 threadsPerBlock(32,4); //128 threads, 32 in x-dimension, 4 in y-dimension
dim3 blocksInGrid(
   (image_dimension_x + threadsPerBlock.x - 1) / threadsPerBlock.x),
   (image_dimension_y + threadsPerBlock.y - 1) / threadsPerBlock.y)
):
kernel<<<blocksInGrid, threadsPerBlock>>>(...)

You could perform an index transformation in the kernel to get a 2d index from a 1d index

//in kernel
const int linear_index = threadIdx.x + blockIdx.x * blockDim.x;
const int xIndex = linear_index % N;
const int yIndex = linear_index / N;

You could write your kernel with a strided for-loop that allows the kernel to work with fewer that N threads per dimension

//in kernel
for(int yIndex = blockIdx.y * blockDim.y + threadIdx.y; yIndex < N; yIndex += blockDim.y * gridDim.y){
    for(int xIndex = blockIdx.x * blockDim.x + threadIdx.x; xIndex < N; xIndex += blockDim.x * gridDim.x){
             ...
    }
}

Alright, I’m now using your block/sizes for the fftShift kernel and it seems to work. However, when I try to use the same dimensions for the multiplyBuffers kernel, that one breaks. I can sorta use some manual numbers to get it to work, but I’d like to understand why I need to use different values for data that has the same size/dimension.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.