NVBLAS on forked process not working (error code 14)

Dear NVIDIA support,

I tried to run NVBLAS routines in a forked process, but that does not seem to work.

Below I attach a small test program that reproduces the issue.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

// gcc blas_multiply.c -o blas_mul  -I/usr/include/x86_64-linux-gnu -L"/lib/x86_64-linux-gnu/" -lopenblas -lpthread

// gcc blas_multiply.c -o nvblas_mul  -I/usr/include/x86_64-linux-gnu -L"/usr/local/cuda/targets/x86_64-linux/lib/" -lnvblas -L"/lib/x86_64-linux-gnu/" -lopenblas

// Fortran-style BLAS DGEMM entry point. It is not defined in this file; the
// BLAS library chosen on the link line (see the build commands above) provides it.
// Every argument is passed by address. In the order used by the calls in this
// file the arguments are:
//   transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc
// (these names follow the reference BLAS DGEMM documentation; the declaration
// itself leaves the parameters unnamed).
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);

// Fills the n x n matrix M with pseudo-random values in [0, 1].
//
// M    must point to at least n*n doubles stored contiguously.
// n    number of rows and columns; n == 0 writes nothing.
//
// Values come from rand(), so the caller controls the sequence through srand().
void initialize_random_matrix(double* M, unsigned int n){
  // Index with size_t: i*n+j computed in 32-bit unsigned wraps once n > 65535.
  const size_t dim = n;

  for(size_t row = 0; row < dim; row++){
    for(size_t col = 0; col < dim; col++){
      M[row*dim + col] = (double) rand()/RAND_MAX;
    }
  }
}

// Runs `times` rounds of three chained DGEMM calls on square n x n matrices.
//
// A      input matrix, never written by this function.
// B, C   both read and overwritten on every round; each must hold n*n
//        initialised doubles because beta is non-zero and DGEMM therefore
//        reads the previous contents of its output matrix.
// n      matrix dimension; it is handed to dgemm_ as an int, so it must not
//        exceed INT_MAX.
// times  number of rounds.
//
// Both transpose flags are 'N' (no transpose), alpha = 1.2 and beta = 0.001.
void do_calculation(double* A, double* B, double* C, unsigned int n, unsigned int times){
  char ta = 'N';
  char tb = 'N';
  double alpha = 1.2;
  double beta = 0.001;

  // dgemm_ takes its dimensions as int*. Passing the address of an unsigned
  // parameter is an incompatible pointer type, so use a proper int copy.
  int dim = (int) n;

  // do some calculation N times
  for(unsigned int round = 0; round < times; round++){
    // C = alpha * A * B + beta * C
    dgemm_(&ta, &tb, &dim, &dim, &dim, &alpha, A, &dim, B, &dim, &beta, C, &dim);
    // B = alpha * A * C + beta * B
    dgemm_(&ta, &tb, &dim, &dim, &dim, &alpha, A, &dim, C, &dim, &beta, B, &dim);
    // C = alpha * A * B + beta * C
    dgemm_(&ta, &tb, &dim, &dim, &dim, &alpha, A, &dim, B, &dim, &beta, C, &dim);
  }
}

// Prints the first and the last element of the n x n matrix M to stdout.
//
// M  matrix of n*n contiguous doubles.
// n  matrix dimension.
//
// A NULL or empty matrix has no first/last element, so a placeholder line is
// printed instead of indexing out of bounds.
void print_result(double *M, unsigned int n){
  if(M == NULL || n == 0){
    printf("Results: (empty matrix)\n");
    return;
  }

  // Compute the index in size_t: n*n in 32-bit unsigned wraps once n > 65535.
  const size_t last = (size_t) n * n - 1;
  printf("Results: %lf %lf\n", M[0], M[last]);
}

// Parses `text` as a strictly positive decimal integer that fits in an int.
// Returns 0 and stores the value in *out on success; returns -1 and leaves
// *out untouched when the text is empty, has trailing characters, or is out of
// range.
static int parse_dimension(const char *text, int *out){
  char *end = NULL;

  errno = 0;
  long value = strtol(text, &end, 10);
  if(end == text || *end != '\0' || errno == ERANGE || value <= 0 || value > INT_MAX){
    return -1;
  }

  *out = (int) value;
  return 0;
}

// Test driver: multiplies random n x n matrices through dgemm_, either in this
// process or in a forked child.
//
// Usage: <program> <n> <do_fork>
//   n        matrix dimension (positive integer)
//   do_fork  a value > 0 runs the calculation in a forked child, otherwise it
//            runs in the main process
// With any other argument count the defaults n = 100, do_fork = 0 are used.
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE on invalid input, allocation failure,
// fork/wait failure, or an unclean exit of the child.
int main ( int argc, char* argv[] ) {

  srand ( (unsigned int) time ( NULL));
  int n = 100;
  int do_fork = 0;

  if (argc != 3){
    printf("Using defaults\n");
  }
  else {
    if(parse_dimension(argv[1], &n) != 0){
      fprintf(stderr, "Invalid matrix size '%s': expected a positive integer\n", argv[1]);
      return EXIT_FAILURE;
    }
    if(strtol(argv[2], NULL, 10) > 0){
      do_fork = 1;
    }
  }

  // Compute the element count in size_t and reject sizes whose square wraps.
  const size_t dim = (size_t) n;
  if(dim > ((size_t) -1) / dim){
    fprintf(stderr, "Matrix size %d is too large\n", n);
    return EXIT_FAILURE;
  }
  const size_t count = dim * dim;

  // Create arrays that represent the matrices A,B,C.
  // calloc zero-fills them: C is read by the first dgemm_ call (beta != 0),
  // so it must not contain indeterminate values. calloc also checks the
  // count * sizeof multiplication for overflow.
  double*  A = calloc(count, sizeof *A);
  double*  B = calloc(count, sizeof *B);
  double*  C = calloc(count, sizeof *C);
  if(A == NULL || B == NULL || C == NULL){
    fprintf(stderr, "Failed to allocate three %dx%d matrices\n", n, n);
    free(A);
    free(B);
    free(C);
    return EXIT_FAILURE;
  }

  // Fill A and B with random numbers
  initialize_random_matrix(A, (unsigned int) n);
  initialize_random_matrix(B, (unsigned int) n);

  // Write something on A
  A[n/2] = 99.0;

  int exit_code = EXIT_SUCCESS;

  if(do_fork == 1){
    // Flush first so buffered output is not emitted twice (parent and child).
    fflush(stdout);

    // NOTE(review): the BLAS library may already have initialised GPU state in
    // this process before fork(); presumably that state is not usable in the
    // child -- confirm against the NVBLAS/CUDA documentation.
    pid_t pID = fork();
    if (pID < 0){
      perror("fork");
      exit_code = EXIT_FAILURE;
    }
    else if (pID == 0){
        // child process because return value zero.
        printf("Start calculation on forked process\n");

        printf("A[%d] = %lf\n", n/2, A[n/2]);

        do_calculation(A, B, C, (unsigned int) n, 100);

        print_result(C, (unsigned int) n);
    }
    // parent process because return value non-zero.
    else  {
      printf("Main process waiting for fork process completion\n");

      int status = 0;
      if(waitpid(pID, &status, 0) < 0){
        perror("waitpid");
        exit_code = EXIT_FAILURE;
      }
      else if(!WIFEXITED(status) || WEXITSTATUS(status) != 0){
        fprintf(stderr, "Forked process did not exit cleanly\n");
        exit_code = EXIT_FAILURE;
      }
    }
  }
  else {
    printf("Start calculation on main process\n");

    printf("A[%d] = %lf\n", n/2, A[n/2]);

    do_calculation(A, B, C, (unsigned int) n, 100);

    print_result(C, (unsigned int) n);
  }

  // Clean up
  free(A);
  free(B);
  free(C);

  return exit_code;
}

If I run the program:

  • ./nvblas_mul 10000 0: without forking (argv[2] == 0) the program runs properly. The GPUs are used (checked with nvidia-smi).
  • ./nvblas_mul 10000 1: with forking (the child process is in charge of executing the calculation) the program fails. In the NVBLAS log I get:
    [NVBLAS] dgemm[gpu]: ta=N, tb=N, m=10000, n=10000, k=10000 | GPU GI CI PID Type Process name GPU Memory |
    [NVBLAS] cublasXtDgemm failed with error=14

If I build the same program against OpenBLAS only (so the calculation is obviously not offloaded to the GPUs), everything works, including in the forked child process.

Thanks for helping