cufftXtExecDescriptorR2C losing ~2 batches

#include <cuComplex.h>
#include <cufft.h>
#include <iostream>
#include <vector>

#include "cufftXt.h"
#include "helper_cuda.h"
#include "multigpu.cuh"

void print3Dfloat2(std::vector<float2> hOut, int nb, int nr, int nc) {
    for (int k = 0; k < nb; k++) {
        for (int i = 0; i < nr; i++) {
            for (int j = 0; j < nc; j++) {
                auto zzyyxx = (k * (nr * nc)) + (i * nc) + j;
                printf("%.2f + %.2fi    ", hOut[zzyyxx].x, hOut[zzyyxx].y);
            }
            printf("\n");
        }
        printf("\nbatch %d *************\n", k);
    }
}

int runMultiGPU() {
    static const int numGPUs = 2;
    int gpus[numGPUs] = {0, 1};

    // The real-to-complex transform is implicitly a forward transform. For an in-place
    // real-to-complex transform where FFTW compatible output is desired, the input size must be
    // padded to ⌊N/2⌋ + 1 complex elements.
    // wtf?
    int nb = 8;
    int nr = 8;
    int nc = 8;

    // Fill with junk data
    std::vector<float> hIn(nb * nr * nc);
    for (int k = 0; k < nb; k++) {
        for (int i = 0; i < nr; i++) {
            for (int j = 0; j < nc; j++) {
                hIn[k * (nr * nc) + i * nc + j] = i * nc + j;
            }
        }
    }

    cufftHandle plan;
    checkCudaErrors(cufftCreate(&plan));
    checkCudaErrors(cufftXtSetGPUs(plan, numGPUs, gpus));

    // dimension of fft
    int rank = 2;
    int n[2] = {nr, nc};
    // input/output sizes with pitches ("unpitched")
    int inEmbed[] = {nr, nc};
    int onEmbed[] = {nr, nc / 2 + 1};
    // dist between batches
    int iDist = nr * nc;
    int oDist = nr * (nc / 2 + 1);
    // stride between adjacent entries in row
    int iStride = 1;
    int oStride = 1;

    size_t workSize[2];
    cufftMakePlanMany(
        plan, rank, n, inEmbed, iStride, iDist, onEmbed, oStride, oDist, CUFFT_R2C, nb, workSize);

    cudaLibXtDesc* dX;
    checkCudaErrors(cufftXtMalloc(plan, &dX, CUFFT_XT_FORMAT_INPLACE));
    checkCudaErrors(cufftXtMemcpy(plan, dX, (void*)hIn.data(), CUFFT_COPY_HOST_TO_DEVICE));
    checkCudaErrors(cufftXtExecDescriptorR2C(plan, dX, dX));
    checkCudaErrors(cudaGetLastError());

    std::vector<float2> hOut(nb * nr * (nc / 2 + 1));
    checkCudaErrors(cufftXtMemcpy(plan, (void*)hOut.data(), dX, CUFFT_COPY_DEVICE_TO_HOST));
    checkCudaErrors(cudaDeviceSynchronize());

    print3Dfloat2(hOut, nb, nr, (nc / 2 + 1));

    checkCudaErrors(cufftXtFree(dX));
    checkCudaErrors(cufftDestroy(plan));
    checkCudaErrors(cudaDeviceReset());

    return 0;
}

and

/tmp/tmp.uxOI2XUOe3/cmake-build-debugremote/cuda_blob
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 0 *************
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 1 *************
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 2 *************
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 3 *************
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 4 *************
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 5 *************
2016.00 + 0.00i    -32.00 + 77.25i    -32.00 + 32.00i    -32.00 + 13.25i    -32.00 + 0.00i    
-256.00 + 618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + 0.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + -0.00i    
-256.00 + -106.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -256.00i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    
-256.00 + -618.04i    0.00 + 0.00i    0.00 + 0.00i    0.00 + -0.00i    0.00 + 0.00i    

batch 6 *************
4624943986770180707023128100864.00 + 0.00i    -4624943986770180707023128100864.00 + 144115205255725056.00i    4624943986770180707023128100864.00 + 288230719749095424.00i    -4624943986770180707023128100864.00 + -144115205255725056.00i    4624943986770180707023128100864.00 + 0.00i    
-3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    
144115205255725056.00 + 4624943986770180707023128100864.00i    101904850786189312.00 + -4624943986770180707023128100864.00i    -288230685389357056.00 + 4624943986770180707023128100864.00i    -101904850786189312.00 + -4624943986770180707023128100864.00i    144115205255725056.00 + 4624943986770180707023128100864.00i    
3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    
-4624943986770180707023128100864.00 + 0.00i    4624943986770180707023128100864.00 + 144115205255725056.00i    -4624943986770180707023128100864.00 + -257698037760.00i    4624943986770180707023128100864.00 + -144115205255725056.00i    -4624943986770180707023128100864.00 + -0.00i    
3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    
144115205255725056.00 + -4624943986770180707023128100864.00i    101904850786189312.00 + 4624943986770180707023128100864.00i    274877906944.00 + -4624943986770180707023128100864.00i    -101904850786189312.00 + 4624943986770180707023128100864.00i    144115205255725056.00 + -4624943986770180707023128100864.00i    
-3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    3270329307707972955843936124928.00 + -3270329307707972955843936124928.00i    -3270329307707972955843936124928.00 + 3270329307707972955843936124928.00i    

batch 7 *************

the first seven batches are correct but the last one isn’t. i’ve experimented with various batch sizes and fft sizes. i haven’t figured it out exactly but sometimes that last two batches are wrong and sometimes it’s just the last one.