Hey guys,
i have some problems with executing my mex code including some cufft transforms.
Everything is working fine when i let matlab execute the mex function one time. But when i try to execute it a second time (sometimes also one or two times more…), matlab crashes and gives me a segmentation fault.
The stack trace shows me that the crash is always in the cufftPlan2d() function. I also tried the cufftPlanMany() but whith this it is the same problem.
Funny thing is, when im building a large for() loop around the whole cufft planning and execution functions and it does not give me any mistakes at the first matlab execution. The second one again crashes.
Im not able to figure out where the mistake is.
My Configuration:
Geforce GTX 465
GPU Computing Toolkit 4.2
Visual Studio 2010 Professional (compiled direct in the ide not out of matlab)
Matlab R2011 x64
bit shortened
/* The gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
// Get scalars
double p_scale = mxGetScalar(prhs[3]);
double roundTolerance = mxGetScalar(prhs[2]);
size_t dimX1 = mxGetM(prhs[0]);
size_t dimY1 = mxGetN(prhs[0]);
// Create output matrix
plhs[0] = mxCreateDoubleMatrix(dimX1, dimY1, mxCOMPLEX);
// Get matricies
double* Ar = mxGetPr(prhs[0]);
double* Ai = mxGetPi(prhs[0]);
double* Or = mxGetPr(plhs[0]);
double* Oi = mxGetPi(plhs[0]);
// Define device pointers
double* d_Ar;
double* d_Ai;
cufftDoubleComplex* d_In1;
cufftDoubleComplex* d_Out;
// Allocate memory and copy data to device
size_t numEl = mxGetNumberOfElements(prhs[0]);
size_t sizeEl = numEl * sizeof(double);
cudaErrors( cudaMalloc(&d_Ar, sizeEl ) , __LINE__, __FILE__);
cudaErrors( cudaMemcpy(d_Ar, Ar, sizeEl, cudaMemcpyHostToDevice) , __LINE__, __FILE__);
if( Ai != NULL ) {
cudaErrors( cudaMalloc(&d_Ai, sizeEl ) , __LINE__, __FILE__);
cudaErrors( cudaMemcpy(d_Ai, Ai, sizeEl, cudaMemcpyHostToDevice) , __LINE__, __FILE__);
}
sizeEl = numEl * sizeof(cufftDoubleComplex);
cudaErrors( cudaMalloc(&d_In1, sizeEl ) , __LINE__, __FILE__);
cudaErrors( cudaMalloc(&d_Out, sizeEl ) , __LINE__, __FILE__);
// Call kernel to copy data in d_InX and transform into cufft format
dim3 dimThreads(THREADS_PER_2D_BLOCK, THREADS_PER_2D_BLOCK);
dim3 dimBlocks = calcDimBlocks(dimX1, dimY1);
if( Ai != NULL) {
copyMatToCufft<<<dimBlocks, dimThreads>>>(d_In1, d_Ar, d_Ai, dimX1, dimY1);
} else {
copyMatToCufft_noIm<<<dimBlocks, dimThreads>>>(d_In1, d_Ar, dimX1, dimY1);
}
cudaThreadSynchronize();
///////////////////////// TRANSFORM //////////////////////////////////
cufftHandle plan;
cufftPlan2d(&plan, dimX1, dimY1, CUFFT_Z2Z); // CRASH at second call
cufftExecZ2Z(plan, d_In1, d_Out, CUFFT_INVERSE);
cudaThreadSynchronize() ;
cufftDestroy(plan);
///////////////////////// TRANSFORM //////////////////////////////////
// Call kernel to copy result from d_Out into correct output format
copyCufftToMat<<<dimBlocks, dimThreads>>>(d_Out, d_Ar, d_Ai, dimX1, dimY1);
cudaErrors( cudaThreadSynchronize() , __LINE__, __FILE__);
sizeEl = numEl * sizeof(double);
cudaErrors( cudaMemcpy(Or, d_Ar, sizeEl, cudaMemcpyDeviceToHost) , __LINE__, __FILE__);
cudaErrors( cudaMemcpy(Oi, d_Ai, sizeEl, cudaMemcpyDeviceToHost) , __LINE__, __FILE__);
cudaErrors( cudaFree(d_Ar) , __LINE__, __FILE__);
cudaErrors( cudaFree(d_Ai) , __LINE__, __FILE__);
cudaErrors( cudaFree(d_In1) , __LINE__, __FILE__);
cudaErrors( cudaFree(d_Out) , __LINE__, __FILE__);
return;
}
// Calculate grid size
__host__ dim3 calcDimBlocks(int dimX, int dimY)
{
// dimX for a matrix is Y dimension for cuda blocks and vice versa
dim3 dimBlocks(1,1,1);
double remainder = (((double) dimX) / THREADS_PER_2D_BLOCK) - dimX / THREADS_PER_2D_BLOCK;
if(remainder == 0) {
dimBlocks.y = dimX / THREADS_PER_2D_BLOCK;
} else{
dimBlocks.y = dimX / THREADS_PER_2D_BLOCK +1;
}
remainder = (((double) dimY) / THREADS_PER_2D_BLOCK) - dimY / THREADS_PER_2D_BLOCK;
if(remainder == 0) {
dimBlocks.x = dimY / THREADS_PER_2D_BLOCK;
} else{
dimBlocks.x = dimY / THREADS_PER_2D_BLOCK +1;
}
return dimBlocks;
}
// Copy a real matlab matrix in cufft data format
__global__ void copyMatToCufft_noIm(cufftDoubleComplex* d_Cufft, double* d_MatR, int dimX, int dimY)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if( row < dimX && col < dimY ){
d_Cufft[row * dimY + col].x = d_MatR[col * dimX + row];
d_Cufft[row * dimY + col].y = 0;
}
}
// Copy a complex matlab matrix in cufft data format
__global__ void copyMatToCufft(cufftDoubleComplex* d_Cufft, double* d_MatR, double* d_MatI, int dimX, int dimY)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if( row < dimX && col < dimY ){
d_Cufft[row * dimY + col].x = d_MatR[col * dimX + row];
d_Cufft[row * dimY + col].y = d_MatI[col * dimX + row];
}
}
// Copy complex cufft matrix into matlab format
__global__ void copyCufftToMat(cufftDoubleComplex* d_Cufft, double* d_MatR, double* d_MatI, int dimX, int dimY)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if( row < dimX && col < dimY ){
d_MatR[col * dimX + row] = d_Cufft[row * dimY + col].x;
d_MatI[col * dimX + row] = d_Cufft[row * dimY + col].y;
}
}
Hope someone here does see the error.
Thanks!