I am having an issue understanding implementing the cublasXgetrf/cublasXgetri fun in Cuda 6.5 on a 3.0 device. In my programming I am attempting to keep all memory allocations to a minimum and memory movement to a min too. The problem I am having is understanding and implementing a method for casting a device pointer to memory to an pointer to an array of device memory required for the 3rd argument of cublasXgetrf/i. I found an example on the stackoverflow on how to do this in cuda on a 3.5 device, but not one for 3.0. My attempt to do this is below:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#include <math.h>
#define CUDA(call) do { \
cudaError_t err = call; \
if (err != cudaSuccess) \
{ \
printf("CUDA ERROR at line : %d, file : %s, %s\n", __LINE__, __FILE__, cudaGetErrorString(err)); \
exit(-1); \
} \
} while(0);
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) { \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while(0);
void invertMatrixGPU(float* a_i, float* c_o, int n, int ldda, cublasHandle_t hdl)
{
int *p = (int *)malloc(n*sizeof(int));
int *info = (int *)malloc(sizeof(int));
int batch;
int INFOh = 0;
batch = 1;
float **a =NULL;
cudaMalloc(a,sizeof(float**));
cudaMemcpy(*a,a_i,sizeof(float*),cudaMemcpyHostToDevice);
//float **c =NULL;//= (float **)malloc(sizeof(float *));
//*c = c_o;
//cudaMalloc(&c,sizeof(float*));
// See
//http://docs.nvidia.com/cuda/pdf/CUDA_Dynamic_Parallelism_Programming_Guide.pdf
//http://stackoverflow.com/questions/27094612/cublas-matrix-inversion-from-device
cublascall(cublasSgetrfBatched(hdl, n, a, ldda, p, info, batch));
cudaMemcpy(&INFOh,info,sizeof(int),cudaMemcpyDeviceToHost);
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cublascall(cublasSgetriBatched(hdl, n, (const float **)a, ldda, p, c, ldda, info, batch));
cudaMemcpy(&INFOh,info,sizeof(int),cudaMemcpyDeviceToHost);
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
int main() {
// Initialize GPU for CUDA
CUDA(cudaSetDevice(0));
cublasHandle_t handle;
cublasCreate(&handle);
float *matrix = (float*)malloc(sizeof(float)*4*4);
for (int i=0;i<16;i++)
{
matrix[i] = i;
}
float *matrix_d = NULL;
CUDA(cudaMalloc(&matrix_d,sizeof(float)*4*4));
CUDA(cudaMemcpy(matrix_d,matrix,sizeof(float)*4*4,cudaMemcpyHostToDevice));
float *matrix_di = NULL;
CUDA(cudaMalloc(&matrix_di,sizeof(float)*4*4));
for (int i = 0;i<10;i++){
invertMatrixGPU(matrix_d, matrix_di,4,4, handle);
}
free(matrix);
cudaFree(matrix_d);
cudaFree(matrix_di);
cublasDestroy(handle);
}
running this code under nvprof results in segmentation fault at the cublasXsegtrf/i line. Can anyone help me out? I am trying to not copy the code back from the device