I need to perform LU decomposition on several small matrices at the same time. However, I get a segmentation fault. My source code is attached below; my GPU is a C2075 with CUDA 5.5 and driver version 319.37. I would appreciate any suggestions.
//matrix_b include batchsize number of (dim*dim) matrices
int matrix_gpu_lu_batch(double *matrix_b, int dim, int batchsize )
{
int m = dim;
cublasStatus_t status_lu;
cublasStatus_t status;
cudaError_t cudaStatus;
cudaError_t err1,err2,err3;
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
{
fprintf(stdout, "CUBLAS initialization failed!\n");
cudaDeviceReset();
exit(1);
}
cudaStream_t streamArray;
cublasSetStream(handle, streamArray);
//matrix buffers for LU decomposition, I need to keep the original matrices
double *d_lubuffer;
err1 = cudaMalloc((void **)&d_lubuffer, dim*dim*batchsize*sizeof(double));
//pointer array hold all matrix pointers
double **h_ptrlubuffer = new double *[batchsize];
for(int i=0;i<batchsize;i++){
err1 = cudaMemcpy(&d_lubuffer[i*m*m],&matrix_b[i*m*m],dim*dim*sizeof(double),cudaMemcpyDeviceToDevice); //matrix_b locates in device memory
h_ptrlubuffer[i] = &d_lubuffer[i*m*m];
}
//pointer array in device memory hold all matrix pointers
double **d_ptrlubuffer;
err1 = cudaMalloc((void **)&d_ptrlubuffer,batchsize*sizeof(double *));
err2 = cudaMemcpy(d_ptrlubuffer,h_ptrlubuffer,batchsize*sizeof(double *),cudaMemcpyHostToDevice);
int *d_ipiv;
err1 = cudaMalloc((void **)&d_ipiv, dim*batchsize*sizeof(int));
int *d_info;
err1 = cudaMalloc((void **)&d_info, batchsize*sizeof(int));
status_lu = cublasDgetrfBatched(handle,dim,d_ptrlubuffer,dim,d_ipiv,d_info,batchsize);
cublasDestroy(handle);
cudaDeviceReset();
}