Segmentation fault (core dumped) after running mex cuda code

codeCcode · February 20, 2014, 9:40pm

I have a mex function that calls a cu function to evaluate phi = 1/2*x’Ax - b’*x, where x and b are both m by 1 vectors and A is an m by m matrix. The code can be compiled and executed and it also gives me the correct answer.

However, when I exited MATLAB, I kept getting the error Segmentation fault (core dumped). What I did is that, generate A, b and x in MATLAB, use a mex function to pass them to cuda. Then I evaluate phi = 1/2*x’Ax - b’*x on GPU (using the cuda linear algebra library cublas) and use mex to
transfer phi back to MATLAB.

Can anyone help me to see where the problem is? Thanks in advance.

Btw, here is how I compile it:

nvcc -arch=sm_20 -c test.cu -Xcompiler -fPIC -I/site/local/matlab-r2012a/extern/include/
mex -L/usr/local/cuda/lib64 -lcudart -lcublas test.o

To open the MATLAB, you need to link the libstdc++ library:

LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 matlab

In MATLAB, I did the following to test the code:

N = 500; x = randn(N,1);B = randn(N);A = B'*B; b = randn(N,1);    
tic, 1/2*x'*A*x-b'*x, toc    
tic, phi = cublas_mex_test(x,A,b),toc

Here is my test.cu:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cuda.h"
#include "mex.h"
#include <time.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

// this is the actual function that evaluates phi = 1/2*x'*A*x - b'*x
int PhiEval(double *x, double *A, double *b, double &phi, size_t m)
{ 
	cudaError_t cudaStat; 
	cublasStatus_t stat;
	cublasHandle_t handle;
	clock_t start, end;

	start = clock();
	//host data
	double *bx, *xAx;
	bx = (double*)malloc(1*sizeof(double)); 
	xAx = (double*)malloc(1*sizeof(double)); 

	//device data
	double* A_d, * x_d, *b_d, *Ax_d, *bx_d, *xAx_d;
	cudaStat = cudaMalloc ((void**)&A_d, m*m*sizeof(double));
	cudaStat = cudaMalloc ((void**)&x_d, m*sizeof(double));
	cudaStat = cudaMalloc ((void**)&b_d, m*sizeof(double));
	cudaStat = cudaMalloc ((void**)&Ax_d, m*sizeof(double));
    cudaStat = cudaMalloc ((void**)&bx_d, 1*sizeof(double));
	cudaStat = cudaMalloc ((void**)&xAx_d, 1*sizeof(double));
	if (cudaStat != cudaSuccess) {
		printf ("device memory allocation failed");
		return EXIT_FAILURE;
	}
	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to allocate the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	start = clock();
	stat = cublasCreate(&handle);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("CUBLAS initialization failed\n");
		return EXIT_FAILURE;
	}
	
	// copy host data to device
	stat = cublasSetMatrix (m,m, sizeof(double), A, m, A_d, m);
	stat = cublasSetVector (m, sizeof(double), x, 1, x_d, 1);
	stat = cublasSetVector (m, sizeof(double), b, 1, b_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("data download failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}
	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to copy the data to GPU.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	start = clock();
	//calculate A*x and store the result in Ax_d
	double alpha = 1;
	double beta = 0;
	stat = cublasDgemv(handle, CUBLAS_OP_N, m,m, &alpha, A_d, m, x_d, 1, &beta, Ax_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("data download failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}

	//calculate x'*A*x and store the result in xAx_d
	stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, x_d, m, Ax_d, 1, &beta, xAx_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("inner product failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}
	stat = cublasGetVector (1, sizeof(double),xAx_d, 1, xAx, 1); // copy the result x'*A*x to host

	//calculate b'*x and store the result in bx_d
	stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, b_d, m, x_d, 1, &beta, bx_d, 1);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		printf ("inner product failed");
		cudaFree (A_d);
		cudaFree (x_d);
		cudaFree (b_d);
		cudaFree (Ax_d);
		cudaFree(xAx_d);
		cudaFree(bx_d);
		cublasDestroy(handle);
		free(bx); free(xAx);
		return EXIT_FAILURE;
	}
	stat = cublasGetVector (1, sizeof(double),bx_d, 1, bx, 1);
	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to call functions in cublas.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	//calculate phi = 1/2*x'*A*x - b'*x
	phi = .5*xAx[0]-bx[0];

	start = clock();
	//free the memory
	cudaFree (A_d);
	cudaFree (x_d);
	cudaFree (b_d);
	cudaFree (Ax_d);
	cudaFree(xAx_d);
	cudaFree(bx_d);
	cublasDestroy(handle);
	free(bx); free(xAx);

	end = clock();
	printf ("It takes: %d clicks (%.7f seconds) to free the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

	return EXIT_SUCCESS;
}

/* the gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[])
{
	double phi;
	double *A, *b, *x;

	size_t mrows,ncols;

	/*  check for proper number of arguments */
	if(nrhs!=3) 
		mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumInputs",
            "Three inputs required.");
	if(nlhs!=1) 
		mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumOutputs",
            "One output required.");

	/*  create a pointer to the input vector x */
	x = mxGetPr(prhs[0]);
  
	/*  create a pointer to the input matrix A */
	A = mxGetPr(prhs[1]);

	/*  create a pointer to the input vector b */
	b = mxGetPr(prhs[2]);
  
	/*  get the dimensions of the matrix input A */
	mrows = mxGetM(prhs[1]);
	ncols = mxGetN(prhs[1]);

	if(mrows!=ncols) 
		mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidMatrixInput",
            "A has to be a square matrix");

	/*  call the cpp subroutine */
	PhiEval(x,A,b,phi,mrows);   
  
	plhs[0] = mxCreateDoubleScalar(phi);
}

Robert_Crovella · February 21, 2014, 3:33pm

Based on a similar posting on SO, it seems that the method described here will help:

vacaloca · February 23, 2014, 6:07pm

I wanted to chime in that I also had this issue a while back with some CUDA MEX code and following the advice of the above post I was able to fix the issue with a mexAtExit “cleanup” function that calls cudaThreadExit()

codeCcode · February 25, 2014, 4:02am

Thanks, I did the same thing. Now this error is gone! Thank you everybody!

Topic		Replies	Views
CUFFT in MEX file crashes at second execution Segmentation Fault in cufftPlan2d() or cufftPlanMany() CUDA Programming and Performance	7	3221	June 28, 2012
Help me with Matlab & Cuda! CUDA Programming and Performance	2	1112	May 29, 2009
Segmentation fault (core dumped) CUDA Programming and Performance	5	8256	September 14, 2021
matlab, mex files and loops CUDA Programming and Performance	12	11644	March 9, 2012
Making a Mex file from CUDA code CUDA Programming and Performance	0	336	March 17, 2021
mex function not Cuda kernel CUDA Programming and Performance	3	1233	April 16, 2012
CUDA MEX library crash in MATLAB CUDA Programming and Performance	2	5923	October 27, 2008
How to make a proper cleanup after a matlab mex file that uses cuda? CUDA Programming and Performance	2	5382	June 25, 2012
Matlab CUDA Mex-file CUDA Programming and Performance	1	3619	May 26, 2009
using CUDA in Matlab mex-Files (timing problem) CUDA Programming and Performance	9	13034	October 29, 2010

Segmentation fault (core dumped) after running mex cuda code

Related topics