I have a mex function that calls a cu function to evaluate phi = 1/2*x’Ax - b’*x, where x and b are both m by 1 vectors and A is an m by m matrix. The code can be compiled and executed and it also gives me the correct answer.
However, when I exited MATLAB, I kept getting the error Segmentation fault (core dumped). What I did is that, generate A, b and x in MATLAB, use a mex function to pass them to cuda. Then I evaluate phi = 1/2*x’Ax - b’*x on GPU (using the cuda linear algebra library cublas) and use mex to
transfer phi back to MATLAB.
Can anyone help me to see where the problem is? Thanks in advance.
Btw, here is how I compile it:
nvcc -arch=sm_20 -c test.cu -Xcompiler -fPIC -I/site/local/matlab-r2012a/extern/include/
mex -L/usr/local/cuda/lib64 -lcudart -lcublas test.o
To open the MATLAB, you need to link the libstdc++ library:
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 matlab
In MATLAB, I did the following to test the code:
N = 500; x = randn(N,1);B = randn(N);A = B'*B; b = randn(N,1);
tic, 1/2*x'*A*x-b'*x, toc
tic, phi = cublas_mex_test(x,A,b),toc
Here is my test.cu:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cuda.h"
#include "mex.h"
#include <time.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
// this is the actual function that evaluates phi = 1/2*x'*A*x - b'*x
int PhiEval(double *x, double *A, double *b, double &phi, size_t m)
{
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
clock_t start, end;
start = clock();
//host data
double *bx, *xAx;
bx = (double*)malloc(1*sizeof(double));
xAx = (double*)malloc(1*sizeof(double));
//device data
double* A_d, * x_d, *b_d, *Ax_d, *bx_d, *xAx_d;
cudaStat = cudaMalloc ((void**)&A_d, m*m*sizeof(double));
cudaStat = cudaMalloc ((void**)&x_d, m*sizeof(double));
cudaStat = cudaMalloc ((void**)&b_d, m*sizeof(double));
cudaStat = cudaMalloc ((void**)&Ax_d, m*sizeof(double));
cudaStat = cudaMalloc ((void**)&bx_d, 1*sizeof(double));
cudaStat = cudaMalloc ((void**)&xAx_d, 1*sizeof(double));
if (cudaStat != cudaSuccess) {
printf ("device memory allocation failed");
return EXIT_FAILURE;
}
end = clock();
printf ("It takes: %d clicks (%.7f seconds) to allocate the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);
start = clock();
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
// copy host data to device
stat = cublasSetMatrix (m,m, sizeof(double), A, m, A_d, m);
stat = cublasSetVector (m, sizeof(double), x, 1, x_d, 1);
stat = cublasSetVector (m, sizeof(double), b, 1, b_d, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("data download failed");
cudaFree (A_d);
cudaFree (x_d);
cudaFree (b_d);
cudaFree (Ax_d);
cudaFree(xAx_d);
cudaFree(bx_d);
cublasDestroy(handle);
free(bx); free(xAx);
return EXIT_FAILURE;
}
end = clock();
printf ("It takes: %d clicks (%.7f seconds) to copy the data to GPU.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);
start = clock();
//calculate A*x and store the result in Ax_d
double alpha = 1;
double beta = 0;
stat = cublasDgemv(handle, CUBLAS_OP_N, m,m, &alpha, A_d, m, x_d, 1, &beta, Ax_d, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("data download failed");
cudaFree (A_d);
cudaFree (x_d);
cudaFree (b_d);
cudaFree (Ax_d);
cudaFree(xAx_d);
cudaFree(bx_d);
cublasDestroy(handle);
free(bx); free(xAx);
return EXIT_FAILURE;
}
//calculate x'*A*x and store the result in xAx_d
stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, x_d, m, Ax_d, 1, &beta, xAx_d, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("inner product failed");
cudaFree (A_d);
cudaFree (x_d);
cudaFree (b_d);
cudaFree (Ax_d);
cudaFree(xAx_d);
cudaFree(bx_d);
cublasDestroy(handle);
free(bx); free(xAx);
return EXIT_FAILURE;
}
stat = cublasGetVector (1, sizeof(double),xAx_d, 1, xAx, 1); // copy the result x'*A*x to host
//calculate b'*x and store the result in bx_d
stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, b_d, m, x_d, 1, &beta, bx_d, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("inner product failed");
cudaFree (A_d);
cudaFree (x_d);
cudaFree (b_d);
cudaFree (Ax_d);
cudaFree(xAx_d);
cudaFree(bx_d);
cublasDestroy(handle);
free(bx); free(xAx);
return EXIT_FAILURE;
}
stat = cublasGetVector (1, sizeof(double),bx_d, 1, bx, 1);
end = clock();
printf ("It takes: %d clicks (%.7f seconds) to call functions in cublas.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);
//calculate phi = 1/2*x'*A*x - b'*x
phi = .5*xAx[0]-bx[0];
start = clock();
//free the memory
cudaFree (A_d);
cudaFree (x_d);
cudaFree (b_d);
cudaFree (Ax_d);
cudaFree(xAx_d);
cudaFree(bx_d);
cublasDestroy(handle);
free(bx); free(xAx);
end = clock();
printf ("It takes: %d clicks (%.7f seconds) to free the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}
/* the gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
double phi;
double *A, *b, *x;
size_t mrows,ncols;
/* check for proper number of arguments */
if(nrhs!=3)
mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumInputs",
"Three inputs required.");
if(nlhs!=1)
mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumOutputs",
"One output required.");
/* create a pointer to the input vector x */
x = mxGetPr(prhs[0]);
/* create a pointer to the input matrix A */
A = mxGetPr(prhs[1]);
/* create a pointer to the input vector b */
b = mxGetPr(prhs[2]);
/* get the dimensions of the matrix input A */
mrows = mxGetM(prhs[1]);
ncols = mxGetN(prhs[1]);
if(mrows!=ncols)
mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidMatrixInput",
"A has to be a square matrix");
/* call the cpp subroutine */
PhiEval(x,A,b,phi,mrows);
plhs[0] = mxCreateDoubleScalar(phi);
}