cublasXgetrf/cublasXgetri error with pointer to Array of pointers

I am having an issue understanding implementing the cublasXgetrf/cublasXgetri fun in Cuda 6.5 on a 3.0 device. In my programming I am attempting to keep all memory allocations to a minimum and memory movement to a min too. The problem I am having is understanding and implementing a method for casting a device pointer to memory to an pointer to an array of device memory required for the 3rd argument of cublasXgetrf/i. I found an example on the stackoverflow on how to do this in cuda on a 3.5 device, but not one for 3.0. My attempt to do this is below:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#include <math.h>
#define CUDA(call) do {     \
    cudaError_t err = call; \
    if (err != cudaSuccess)                     \
    {                                           \
        printf("CUDA ERROR at line : %d, file : %s, %s\n", __LINE__, __FILE__, cudaGetErrorString(err)); \
        exit(-1);                          \
    }                                      \
    } while(0);


#define cublascall(call) \                                                                                         
    do                   \
    {  \
     cublasStatus_t status = (call); \
     if(CUBLAS_STATUS_SUCCESS != status) { \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);     \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
     } \
     while(0);
void invertMatrixGPU(float* a_i, float* c_o, int n, int ldda,    cublasHandle_t hdl)
{
    int *p = (int *)malloc(n*sizeof(int));
    int *info = (int *)malloc(sizeof(int));
    int batch;
    int INFOh = 0;
    batch = 1;

    float **a =NULL;

    cudaMalloc(a,sizeof(float**));
    cudaMemcpy(*a,a_i,sizeof(float*),cudaMemcpyHostToDevice);
    //float **c =NULL;//= (float **)malloc(sizeof(float *));
    //*c = c_o;
    //cudaMalloc(&c,sizeof(float*));

    // See
    //http://docs.nvidia.com/cuda/pdf/CUDA_Dynamic_Parallelism_Programming_Guide.pdf
    //http://stackoverflow.com/questions/27094612/cublas-matrix-inversion-from-device
    cublascall(cublasSgetrfBatched(hdl, n, a, ldda, p, info, batch));
    cudaMemcpy(&INFOh,info,sizeof(int),cudaMemcpyDeviceToHost);

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
    cublascall(cublasSgetriBatched(hdl, n, (const float **)a, ldda, p, c, ldda, info, batch));
    cudaMemcpy(&INFOh,info,sizeof(int),cudaMemcpyDeviceToHost);

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}


int main() {
    // Initialize GPU for CUDA
    CUDA(cudaSetDevice(0));

    cublasHandle_t handle;
    cublasCreate(&handle);

    float *matrix = (float*)malloc(sizeof(float)*4*4);
    for (int i=0;i<16;i++)
    {
        matrix[i] = i;
    }

    float *matrix_d = NULL;
    CUDA(cudaMalloc(&matrix_d,sizeof(float)*4*4));
   CUDA(cudaMemcpy(matrix_d,matrix,sizeof(float)*4*4,cudaMemcpyHostToDevice));
    float *matrix_di = NULL;
    CUDA(cudaMalloc(&matrix_di,sizeof(float)*4*4));

    for (int i = 0;i<10;i++){
        invertMatrixGPU(matrix_d, matrix_di,4,4, handle);
    }
    free(matrix);
    cudaFree(matrix_d);
    cudaFree(matrix_di);
    cublasDestroy(handle);

}

running this code under nvprof results in segmentation fault at the cublasXsegtrf/i line. Can anyone help me out? I am trying to not copy the code back from the device

The following looks suspicious:

cudaMalloc(a,sizeof(float**));

This allocates memory for a single pointer. That is probably not what you want/need.

Thanks for the feedback. I was able to fix this by playing around with a few items. The cublasXgetrf/i calls take a pointer to an array of pointers. I was getting stuck in my head the need for a device pointer to an array of device pointer. I was able to solve this use by doing the following:

int *P;
    int *INFO;

    cudaMalloc<int>(&P,n * batch * sizeof(int));
    cudaMalloc<int>(&INFO,batch * sizeof(int));
    //host pointer to array of device pointers
    float *A[] = { a_i };
    //device pointer to array of device pointers
    float** A_d;
    cudaMalloc<float*>(&A_d,sizeof(A));
    cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice);
    cublasSgetrf(hdl, n, A_d, ldda, P, INFO, batch);