Segmentation fault (core dumped) after running mex cuda code

I have a mex function that calls a cu function to evaluate phi = 1/2*x’Ax - b’*x, where x and b are both m by 1 vectors and A is an m by m matrix. The code can be compiled and executed and it also gives me the correct answer.

However, when I exited MATLAB, I kept getting the error Segmentation fault (core dumped). What I did is that, generate A, b and x in MATLAB, use a mex function to pass them to cuda. Then I evaluate phi = 1/2*x’Ax - b’*x on GPU (using the cuda linear algebra library cublas) and use mex to
transfer phi back to MATLAB.

Can anyone help me to see where the problem is? Thanks in advance.

Btw, here is how I compile it:

nvcc -arch=sm_20 -c test.cu -Xcompiler -fPIC -I/site/local/matlab-r2012a/extern/include/
mex -L/usr/local/cuda/lib64 -lcudart -lcublas test.o

To open the MATLAB, you need to link the libstdc++ library:

LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 matlab

In MATLAB, I did the following to test the code:

N = 500; x = randn(N,1);B = randn(N);A = B'*B; b = randn(N,1);    
tic, 1/2*x'*A*x-b'*x, toc    
tic, phi = cublas_mex_test(x,A,b),toc

Here is my test.cu:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cuda.h"
#include "mex.h"
#include <time.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

// this is the actual function that evaluates phi = 1/2*x'*A*x - b'*x
int PhiEval(double *x, double *A, double *b, double &phi, size_t m)
{ 
	cudaError_t cudaStat; 
	cublasStatus_t stat;
	cublasHandle_t handle;
	clock_t start, end;

	start = clock();
	//host data
	double *bx, *xAx;
	bx = (double*)malloc(1*sizeof(double)); 
	xAx = (double*)malloc(1*sizeof(double)); 

	//device data
	double* A_d, * x_d, *b_d, *Ax_d, *bx_d, *xAx_d;
	cudaStat = cudaMalloc ((void**)&A_d, m*m*sizeof(double));
	cudaStat = cudaMalloc ((void**)&x_d, m*sizeof(double));
	cudaStat = cudaMalloc ((void**)&b_d, m*sizeof(double));
	cudaStat = cudaMalloc ((void**)&Ax_d, m*sizeof(double));
    cudaStat = cudaMalloc ((void**)&bx_d, 1*sizeof(double));
	cudaStat = cudaMalloc ((void**)&xAx_d, 1*sizeof(double));
	if (cudaStat != cudaSuccess) {
		printf ("device memory allocation failed");
		return EXIT_FAILURE;
	}
	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to allocate the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	start = clock();
	stat = cublasCreate(&handle);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("CUBLAS initialization failed\n");
		return EXIT_FAILURE;
	}
	
	// copy host data to device
	stat = cublasSetMatrix (m,m, sizeof(double), A, m, A_d, m);
	stat = cublasSetVector (m, sizeof(double), x, 1, x_d, 1);
	stat = cublasSetVector (m, sizeof(double), b, 1, b_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("data download failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}
	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to copy the data to GPU.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	start = clock();
	//calculate A*x and store the result in Ax_d
	double alpha = 1;
	double beta = 0;
	stat = cublasDgemv(handle, CUBLAS_OP_N, m,m, &alpha, A_d, m, x_d, 1, &beta, Ax_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("data download failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}

	//calculate x'*A*x and store the result in xAx_d
	stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, x_d, m, Ax_d, 1, &beta, xAx_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("inner product failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}
	stat = cublasGetVector (1, sizeof(double),xAx_d, 1, xAx, 1); // copy the result x'*A*x to host

	//calculate b'*x and store the result in bx_d
	stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, b_d, m, x_d, 1, &beta, bx_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("inner product failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}
	stat = cublasGetVector (1, sizeof(double),bx_d, 1, bx, 1);
	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to call functions in cublas.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	//calculate phi = 1/2*x'*A*x - b'*x
	phi = .5*xAx[0]-bx[0];

	start = clock();
	//free the memory
	cudaFree (A_d);
	cudaFree (x_d);
	cudaFree (b_d);
	cudaFree (Ax_d);
	cudaFree(xAx_d);
	cudaFree(bx_d);
	cublasDestroy(handle);
	free(bx); free(xAx);

	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to free the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	return EXIT_SUCCESS;
}

/* the gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[])
{
	double phi;
	double *A, *b, *x;

	size_t mrows,ncols;

	/*  check for proper number of arguments */
	if(nrhs!=3) 
		mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumInputs",
            "Three inputs required.");
	if(nlhs!=1) 
		mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumOutputs",
            "One output required.");

	/*  create a pointer to the input vector x */
	x = mxGetPr(prhs[0]);
  
	/*  create a pointer to the input matrix A */
	A = mxGetPr(prhs[1]);

	/*  create a pointer to the input vector b */
	b = mxGetPr(prhs[2]);
  
	/*  get the dimensions of the matrix input A */
	mrows = mxGetM(prhs[1]);
	ncols = mxGetN(prhs[1]);

	if(mrows!=ncols) 
		mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidMatrixInput",
            "A has to be a square matrix");

	/*  call the cpp subroutine */
	PhiEval(x,A,b,phi,mrows);   
  
	plhs[0] = mxCreateDoubleScalar(phi);
}

Based on a similar posting on SO, it seems that the method described here will help:

http://www.mathworks.de/matlabcentral/answers/45307

I wanted to chime in that I also had this issue a while back with some CUDA MEX code and following the advice of the above post I was able to fix the issue with a mexAtExit “cleanup” function that calls cudaThreadExit()

Thanks, I did the same thing. Now this error is gone! Thank you everybody!