I am trying to do multi-GPU 3D R2C FFT but now seeing a segment fault when I added the last line (cuffXtMemcpy). The segment fault will go away if I commented out that line. Can anyone take a quick look? The code is very short (please jump to main() and ignore those headers and error-checking functions upfront).
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <typeinfo>
#include <string>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <vector>
#include <numeric>
#include <cfloat>
#include <algorithm>
#include <fstream>
#include <iostream>
using std::vector;
using std::string;
//=== my abort and CUDA error-checking function
inline void abort(string msg, const char *file, int line){
fprintf(stderr, "Error in file '%s' at line '%i': %s\n",
file, line, msg.c_str());
exit(EXIT_FAILURE);
}
__host__ __device__ const char* cudaGetErrorString(int errCode){
const char* errStrings[17] = {
"The cuFFT operation was successful (CUFFT_SUCCESS)",
"cuFFT was passed an invalid plan handle (CUFFT_INVALID_PLAN)",
"cuFFT failed to allocate GPU or CPU memory (CUFFT_ALLOC_FAILED)",
"No longer used (CUFFT_INVALID_TYPE)",
"User specified an invalid pointer or parameter (CUFFT_INVALID_VALUE)",
"Driver or internal cuFFT library error (CUFFT_INTERNAL_ERROR)",
"Failed to execute an FFT on the GPU (CUFFT_EXEC_FAILED)",
"The cuFFT library failed to initialize (CUFFT_SETUP_FAILED)",
"User specified an invalid transform size (CUFFT_INVALID_SIZE)",
"No longer used (CUFFT_UNALIGNED_DATA)",
"Missing parameters in call (CUFFT_INCOMPLETE_PARAMETER_LIST)",
"Execution of a plan was on different GPU than plan creation (CUFFT_INVALID_DEVICE)",
"Internal plan database error (CUFFT_PARSE_ERROR)",
"No workspace has been provided prior to plan execution (CUFFT_NO_WORKSPACE)",
"Function does not implement functionality for parameters given (CUFFT_NOT_IMPLEMENTED)",
"Used in previous versions (CUFFT_LICENSE_ERROR)",
"Operation is not supported for parameters given (CUFFT_NOT_SUPPORTED)"
};
return errStrings[errCode];
}
template<typename T>
inline void checkCudaErrors(T errCode, const char *func, const char *file, int line){
if(errCode){
cudaError_t type_cuda;
cufftResult type_cufft;
string errType;
if(typeid(errCode) == typeid(type_cuda)) errType = "General CUDA error";
else if(typeid(errCode) == typeid(type_cufft)) errType = "CUFFT error";
else {
fprintf(stderr, "Unknown error type\n");
fflush(stderr);
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s in file '%s' at line '%i' when calling '%s': %d (%s)\n",
errType.c_str(), file, line, func, errCode, cudaGetErrorString(errCode));
fflush(stderr);
exit(EXIT_FAILURE);
}
}
#define ABORT(msg) abort( msg, __FILE__, __LINE__ )
#define CUCHK(val) checkCudaErrors( (val), #val, __FILE__, __LINE__ )
int main(){
int n[3]{1024, 512, 768};
size_t size = (size_t)n[0]*(size_t)n[1]*(size_t)n[2];
size_t size2 = (size_t)(n[0]+2)*(size_t)n[1]*(size_t)n[2];
float *ha = new float [size]();
#pragma omp parallel for
for(size_t i = 0; i < size; i++){
int iy = i/((size_t)n[1]*(size_t)n[0]);
int ix = i%((size_t)n[1]*(size_t)n[0]);
int iz = ix%n[0];
ix = ix/n[0];
ha[i] = cos(iz) + sin(ix + iy);
}
//2. multi-GPU C2R and R2C FFT
cufftHandle planR2CmGPU, planC2RmGPU;
CUCHK(cufftCreate(&planR2CmGPU));
CUCHK(cufftCreate(&planC2RmGPU));
int nGPUs = 2;
int whichGPUs[2] = {0, 1};
size_t workSize2[2];
CUCHK(cufftXtSetGPUs(planR2CmGPU, nGPUs, whichGPUs));
CUCHK(cufftMakePlan3d(planR2CmGPU, n[2], n[1], n[0], CUFFT_R2C, workSize2));
CUCHK(cufftXtSetGPUs(planC2RmGPU, nGPUs, whichGPUs));
CUCHK(cufftMakePlan3d(planC2RmGPU, n[2], n[1], n[0], CUFFT_C2R, workSize2));
cudaLibXtDesc *da2;
CUCHK(cufftXtMalloc(planR2CmGPU, &da2, CUFFT_XT_FORMAT_INPLACE));
//CUCHK(cufftXtMemcpy(planR2CmGPU, da2, ha, CUFFT_COPY_HOST_TO_DEVICE)); //problematic line
//4. clean up
CUCHK(cufftXtFree(da2));
CUCHK(cufftDestroy(planR2CmGPU));
CUCHK(cufftDestroy(planC2RmGPU));
delete [] ha;
return 0;
}