I’ve been trying to create a version of a 2D FFT-Shift algorithm in for cuFFT but have so far been unsuccesful. I’m well aware of this thread and this library, but none of the code (and I’ve tried all of it for both of those links!) has worked for me so far. Even more interesting, when I replicate some of the code in Python, it works like a charm. Am I doing something wrong device synchronization? I’ve tried in place and out of place versions of the fftShift kernl, nothings worked thus far. I’m relatively new to CUDA but it’s a bit frustrating how none of the stuff I find online seems to work in my case. This is the code I have, with results at the bottom:
template <typename T>
__global__
void cufftShift_2D_kernel(T* data, int N)
{
int sEq1 = (N * N + N) / 2;
int sEq2 = (N * N - N) / 2;
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
int index = (yIndex * N) + xIndex;
T regTemp;
if (xIndex < N / 2)
{
if (yIndex < N / 2)
{
regTemp = data[index];
data[index] = data[index + sEq1];
data[index + sEq1] = regTemp;
}
}
else
{
if (yIndex < N / 2)
{
regTemp = data[index];
data[index] = data[index + sEq2];
data[index + sEq2] = regTemp;
}
}
}
__global__ void multiplyBuffers(cufftComplex *kernel, cufftComplex *field, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
cufftComplex x = kernel[idx];
cufftComplex y = field[idx];
field[idx].x = x.x * y.x - x.y * y.y;
field[idx].y = x.x * y.y + x.y * y.x;
}
}
template <class T>
struct GPUBuffer {
thrust::device_vector<T> buffer;
T* p;
std::size_t mem_size;
GPUBuffer() = delete;
GPUBuffer(std::size_t size) :
buffer(thrust::device_vector<T>(size)),
p(thrust::raw_pointer_cast(buffer.data())),
mem_size(buffer.size() * sizeof(T)) {}
};
int main()
{
const int field_size = 32;
std::vector<float> field = std::vector<float>(field_size * field_size, 0.f);
const std::vector<float> kernel = getKernelShell();
dump_array_to_file(kernel, scaled_r * 2, scaled_r * 2, "kernel.txt");
std::vector<float> cells = getCells();
dump_array_to_file(cells, m_info.m_w, m_info.m_h, "cells.txt");
cells = upscale_array(cells, m_info.m_w, m_info.m_h, scale);
dump_array_to_file(cells, m_info.m_w * scale, m_info.m_h * scale, "upscaled.txt");
place_cells(field, cells, 0, 0, field_size, m_info.m_w * scale, m_info.m_h * scale);
dump_array_to_file(field, field_size, field_size, "field.txt");
std::vector<float> padded = pad_kernel(kernel, scaled_r * 2, field_size);
dump_array_to_file(padded, field_size, field_size, "padded.txt");
const int threadsPerBlock = 256;
const int blocksPerGrid = (field.size() + threadsPerBlock - 1) / threadsPerBlock;
cufftHandle normal;
cufftPlan2d(&normal, field_size, field_size, CUFFT_C2C);
GPUBuffer<cufftComplex> kernel_gpu(field.size());
GPUBuffer<cufftComplex> field_gpu(field.size());
GPUBuffer<cufftComplex> inv_gpu(field.size());
GPUBuffer<cufftComplex> shift_gpu(field.size());
std::vector<cufftComplex> host_output(field.size());
std::vector<float> real_output(field.size());
for (size_t i = 0; i < field.size(); i++)
{
field_gpu.buffer[i] = { field[i], 0.f};
kernel_gpu.buffer[i] = { padded[i], 0.f};
}
cufftExecC2C(normal, kernel_gpu.p, kernel_gpu.p, CUFFT_FORWARD);
cudaMemcpy(host_output.data(), kernel_gpu.p, kernel_gpu.mem_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < field.size(); ++i) {
real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
}
dump_array_to_file(real_output, field_size, field_size, "fft_kernel_cuda.txt");
for (int i = 0; i < 1; ++i) {
cufftExecC2C(normal, field_gpu.p, field_gpu.p, CUFFT_FORWARD);
multiplyBuffers<<<threadsPerBlock, blocksPerGrid>>>(kernel_gpu.p, field_gpu.p, field.size());
cufftExecC2C(normal, field_gpu.p, inv_gpu.p, CUFFT_INVERSE);
cudaMemcpy(shift_gpu.p, inv_gpu.p, inv_gpu.mem_size, cudaMemcpyDeviceToDevice);
cufftShift_2D_kernel<<<threadsPerBlock, blocksPerGrid>>>(shift_gpu.p, field_size);
}
cudaMemcpy(host_output.data(), field_gpu.p, field_gpu.mem_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < field.size(); ++i) {
real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
}
dump_array_to_file(real_output, field_size, field_size, "cuda_fft.txt");
cudaMemcpy(host_output.data(), inv_gpu.p, inv_gpu.mem_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < field.size(); ++i) {
real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
}
dump_array_to_file(real_output, field_size, field_size, "cuda_inv.txt");
cudaMemcpy(host_output.data(), shift_gpu.p, shift_gpu.mem_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < field.size(); ++i) {
real_output[i] = sqrt(host_output[i].x * host_output[i].x + host_output[i].y * host_output[i].y);
}
dump_array_to_file(real_output, field_size, field_size, "cuda_shifted.txt");
cufftDestroy(normal);
return 0;
}
And here’s the Python version of the same kernel, which works as expected:
def shift(array: NDArray):
N = np.int32(np.sqrt(array.size))
offsetA = (N * N + N) // 2
offsetB = (N * N - N) // 2
for (y, x), _ in np.ndenumerate(array.reshape((N, N))):
idx = y * N + x
if x < N // 2:
if y < N // 2:
array[idx], array[idx + offsetA] = array[idx + offsetA], array[idx]
else:
if y < N // 2:
array[idx], array[idx + offsetB] = array[idx + offsetB], array[idx]
return array.reshape((N, N))
Here’s the results: