Ankits, as I tried to explain, my data size exceeds what fits in GPU memory. So what I'm doing instead is multiple "runs" of 1D batched FFTs. Below is an implementation:
#include <cuda.h>
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#define DATASIZE 8192 // samples per FFT (placeholder value; set to the real transform length)
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/********/
/* MAIN */
/********/
int main(){
    FILE *fid_in, *fid_out;
    cudaDeviceProp prop;
    int ndev;
    uint16_t *raw_data;
    cufftReal *hostInputData;
    cufftComplex *hostOutputData;
    // Input/output paths (placeholders)
    const char *in_filename = "input.bin";
    const char *out_filename = "output.bin";
    // Query device properties
    gpuErrchk(cudaGetDeviceCount(&ndev));
    gpuErrchk(cudaGetDeviceProperties(&prop, 0));
    // Open input and output files
    fid_in = fopen(in_filename, "rb");
    fid_out = fopen(out_filename, "wb");
    if (fid_in == NULL || fid_out == NULL){
        fprintf(stderr, "Could not open input/output file\n");
        return 1;
    }
    // Calculate the file size
    fseek(fid_in, 0L, SEEK_END); // move to the end of the file
    long sz = ftell(fid_in);     // file size in bytes
    fseek(fid_in, 0L, SEEK_SET); // rewind for subsequent reads
    // Calculate the max batch size
    int raw_precision = 2; // bytes per raw sample (uint16_t)
    int fft_precision = 4; // input is cufftReal (32-bit float); output is cufftComplex (2 x 32-bit float) but with half the samples
    size_t max_elem = prop.totalGlobalMem / 2 / fft_precision; // divide by 2 to account for input and output arrays, then by the precision to get the max number of elements
    int BATCH_MAX = (int)floor(((double)max_elem / DATASIZE) * 0.47); // if I set anything above 50%, I get a memory allocation error and the FFT output is all zeros
    int nruns = (int)ceil((double)sz / raw_precision / ((double)BATCH_MAX * DATASIZE)); // number of max-size batches needed to cover the entire input
    // Raw data memory allocation
    raw_data = (uint16_t *)malloc(sizeof(uint16_t) * BATCH_MAX * DATASIZE);
    // --- Host-side input data allocation (pinned); filled by converting from unsigned 16-bit integer to 32-bit float
    gpuErrchk(cudaMallocHost((void**)&hostInputData, (size_t)DATASIZE * BATCH_MAX * sizeof(cufftReal)));
    // --- Device-side input data allocation
    cufftReal *deviceInputData;
    gpuErrchk(cudaMalloc((void**)&deviceInputData, (size_t)DATASIZE * BATCH_MAX * sizeof(cufftReal)));
    // --- Host-side output data allocation (pinned)
    gpuErrchk(cudaMallocHost((void**)&hostOutputData, (size_t)(DATASIZE / 2 + 1) * BATCH_MAX * sizeof(cufftComplex)));
    // --- Device-side output data allocation
    cufftComplex *deviceOutputData;
    gpuErrchk(cudaMalloc((void**)&deviceOutputData, (size_t)(DATASIZE / 2 + 1) * BATCH_MAX * sizeof(cufftComplex)));
    // --- Batched 1D FFT plan
    cufftHandle handle;
    int rank = 1;                 // --- 1D FFTs
    int n[] = { DATASIZE };       // --- Size of the Fourier transform
    int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = DATASIZE / 2 + 1; // --- Distance between consecutive batches
    int inembed[] = { 0 };        // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };        // --- Output size with pitch (ignored for 1D transforms)
    cufftPlanMany(&handle, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist, CUFFT_R2C, BATCH_MAX); // --- BATCH_MAX batched executions
    // Loop over runs of 1D batched FFTs
    int BATCH = BATCH_MAX;
    for (int irun = 0; irun < nruns; irun++){
        if (irun == nruns - 1){
            // Adjust the batch size for the last run, which may be smaller
            long sz_current = ftell(fid_in);
            BATCH = (sz - sz_current) / raw_precision / DATASIZE;
            cufftDestroy(handle); // release the full-size plan before creating the smaller one
            cufftPlanMany(&handle, rank, n,
                          inembed, istride, idist,
                          onembed, ostride, odist, CUFFT_R2C, BATCH);
        }
        // Read the raw data
        fread(raw_data, sizeof(uint16_t), (size_t)DATASIZE * BATCH, fid_in);
        // Convert from uint16 to single-precision float
        for (int i = 0; i < BATCH; i++)
            for (int j = 0; j < DATASIZE; j++)
                hostInputData[(size_t)i * DATASIZE + j] = (cufftReal)raw_data[(size_t)i * DATASIZE + j];
        // --- Host->Device copy of the input
        gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, (size_t)DATASIZE * BATCH * sizeof(cufftReal), cudaMemcpyHostToDevice));
        // --- Calculate the FFT
        cufftExecR2C(handle, deviceInputData, deviceOutputData);
        // --- Device->Host copy of the results
        gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, (size_t)(DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
        // Write results to file
        fwrite(hostOutputData, sizeof(cufftComplex), (size_t)(DATASIZE / 2 + 1) * BATCH, fid_out);
    }
    // Cleanup
    cufftDestroy(handle);
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));
    gpuErrchk(cudaFreeHost(hostOutputData));
    gpuErrchk(cudaFreeHost(hostInputData));
    free(raw_data);
    fclose(fid_in);
    fclose(fid_out);
    return 0;
}
I have currently hard-coded the batch size to use 47% of the GPU memory; if I go above 50%, I get a GPU memory allocation error even though I am not using all of the GPU memory, and cuFFT returns all zeros. The problem is that I can't use all or even most of the memory, and I'm not sure why. My guess is that BATCH_MAX*DATASIZE exceeds the maximum grid size along x? If that is the case, what alternative approach would let me fully utilize the available resources?
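For reference, here is a minimal sketch of how I could check the memory budget before planning, under the assumption that the extra consumer is cuFFT's own work area rather than a grid limit (cufftEstimateMany reports an estimate of the scratch space a plan of a given shape needs on top of the input/output buffers; the check_budget function and the datasize/batch parameters are just hypothetical names for this illustration):

#include <cuda_runtime.h>
#include <cufft.h>
#include <stdio.h>

// Sketch: compare free device memory against input + output + estimated
// cuFFT work area for a candidate batch size. datasize and batch are
// placeholders; substitute the real transform length and batch count.
int check_budget(int datasize, int batch){
    size_t free_mem, total_mem, work_size;
    cudaMemGetInfo(&free_mem, &total_mem); // memory actually available right now

    int n[] = { datasize };
    // Estimated scratch space a CUFFT_R2C plan of this shape would allocate
    cufftEstimateMany(1, n, NULL, 1, datasize,
                      NULL, 1, datasize / 2 + 1,
                      CUFFT_R2C, batch, &work_size);

    size_t in_bytes  = (size_t)batch * datasize * sizeof(cufftReal);
    size_t out_bytes = (size_t)batch * (datasize / 2 + 1) * sizeof(cufftComplex);
    printf("free %zu, input %zu, output %zu, cuFFT work area %zu\n",
           free_mem, in_bytes, out_bytes, work_size);
    return in_bytes + out_bytes + work_size < free_mem; // 1 if it should fit
}

If work_size comes back on the order of the input buffer itself, that would account for the gap between what I allocate and what actually fits, independent of any grid-size limit.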