#include <cuComplex.h>
#include <cufft.h>
#include <iostream>
#include <vector>
#include "cufftXt.h"
#include "helper_cuda.h"
#include "multigpu.cuh"
void print3Dfloat2(std::vector<float2> hOut, int nb, int nr, int nc) {
for (int k = 0; k < nb; k++) {
for (int i = 0; i < nr; i++) {
for (int j = 0; j < nc; j++) {
auto zzyyxx = (k * (nr * nc)) + (i * nc) + j;
printf("%.2f + %.2fi ", hOut[zzyyxx].x, hOut[zzyyxx].y);
}
printf("\n");
}
printf("\nbatch %d *************\n", k);
}
}
int runMultiGPU() {
static const int numGPUs = 2;
int gpus[numGPUs] = {0, 1};
// The real-to-complex transform is implicitly a forward transform. For an in-place
// real-to-complex transform where FFTW compatible output is desired, the input size must be
// padded to ⌊N/2⌋ + 1 complex elements.
// wtf?
int nb = 8;
int nr = 8;
int nc = 8;
// Fill with junk data
std::vector<float> hIn(nb * nr * nc);
for (int k = 0; k < nb; k++) {
for (int i = 0; i < nr; i++) {
for (int j = 0; j < nc; j++) {
hIn[k * (nr * nc) + i * nc + j] = i * nc + j;
}
}
}
cufftHandle plan;
checkCudaErrors(cufftCreate(&plan));
checkCudaErrors(cufftXtSetGPUs(plan, numGPUs, gpus));
// dimension of fft
int rank = 2;
int n[2] = {nr, nc};
// input/output sizes with pitches ("unpitched")
int inEmbed[] = {nr, nc};
int onEmbed[] = {nr, nc / 2 + 1};
// dist between batches
int iDist = nr * nc;
int oDist = nr * (nc / 2 + 1);
// stride between adjacent entries in row
int iStride = 1;
int oStride = 1;
size_t workSize[2];
cufftMakePlanMany(
plan, rank, n, inEmbed, iStride, iDist, onEmbed, oStride, oDist, CUFFT_R2C, nb, workSize);
cudaLibXtDesc* dX;
checkCudaErrors(cufftXtMalloc(plan, &dX, CUFFT_XT_FORMAT_INPLACE));
checkCudaErrors(cufftXtMemcpy(plan, dX, (void*)hIn.data(), CUFFT_COPY_HOST_TO_DEVICE));
checkCudaErrors(cufftXtExecDescriptorR2C(plan, dX, dX));
checkCudaErrors(cudaGetLastError());
std::vector<float2> hOut(nb * nr * (nc / 2 + 1));
checkCudaErrors(cufftXtMemcpy(plan, (void*)hOut.data(), dX, CUFFT_COPY_DEVICE_TO_HOST));
checkCudaErrors(cudaDeviceSynchronize());
print3Dfloat2(hOut, nb, nr, (nc / 2 + 1));
checkCudaErrors(cufftXtFree(dX));
checkCudaErrors(cufftDestroy(plan));
checkCudaErrors(cudaDeviceReset());
return 0;
}
and
/tmp/tmp.uxOI2XUOe3/cmake-build-debugremote/cuda_blob
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 0 *************
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 1 *************
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 2 *************
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 3 *************
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 4 *************
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 5 *************
2016.00 + 0.00i -32.00 + 77.25i -32.00 + 32.00i -32.00 + 13.25i -32.00 + 0.00i
-256.00 + 618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + -0.00i
-256.00 + -106.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -256.00i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
-256.00 + -618.04i 0.00 + 0.00i 0.00 + 0.00i 0.00 + -0.00i 0.00 + 0.00i
batch 6 *************
4624943986770180707023128100864.00 + 0.00i -4624943986770180707023128100864.00 + 144115205255725056.00i 4624943986770180707023128100864.00 + 288230719749095424.00i -4624943986770180707023128100864.00 + -144115205255725056.00i 4624943986770180707023128100864.00 + 0.00i
-3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i
144115205255725056.00 + 4624943986770180707023128100864.00i 101904850786189312.00 + -4624943986770180707023128100864.00i -288230685389357056.00 + 4624943986770180707023128100864.00i -101904850786189312.00 + -4624943986770180707023128100864.00i 144115205255725056.00 + 4624943986770180707023128100864.00i
3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i
-4624943986770180707023128100864.00 + 0.00i 4624943986770180707023128100864.00 + 144115205255725056.00i -4624943986770180707023128100864.00 + -257698037760.00i 4624943986770180707023128100864.00 + -144115205255725056.00i -4624943986770180707023128100864.00 + -0.00i
3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i
144115205255725056.00 + -4624943986770180707023128100864.00i 101904850786189312.00 + 4624943986770180707023128100864.00i 274877906944.00 + -4624943986770180707023128100864.00i -101904850786189312.00 + 4624943986770180707023128100864.00i 144115205255725056.00 + -4624943986770180707023128100864.00i
-3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i 3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i
batch 7 *************
the first seven batches are correct but the last one isn’t. i’ve experimented with various batch sizes and fft sizes. i haven’t figured it out exactly but sometimes that last two batches are wrong and sometimes it’s just the last one.