Dear NVIDIA support,
I tried to run NVBLAS routines in a forked process, but that does not seem to work.
Below I attach a small test program that reproduces the issue.
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
// Build linking only OpenBLAS:
// gcc blas_multiply.c -o blas_mul -I/usr/include/x86_64-linux-gnu -L"/lib/x86_64-linux-gnu/" -lopenblas -lpthread
// Build linking NVBLAS first, with OpenBLAS also on the link line:
// gcc blas_multiply.c -o nvblas_mul -I/usr/include/x86_64-linux-gnu -L"/usr/local/cuda/targets/x86_64-linux/lib/" -lnvblas -L"/lib/x86_64-linux-gnu/" -lopenblas
//
// Fortran-style BLAS DGEMM entry point; the definition comes from whichever
// library the build commands above link in. Every argument is passed by
// pointer. Using the standard BLAS parameter names, the order is:
//   transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
// Fill the n-by-n matrix M with pseudo-random values in [0, 1] taken from rand().
//
// M: buffer of at least n*n contiguous doubles; every one of them is overwritten.
// n: matrix dimension; nothing is written when n is 0.
//
// The parameter is spelled `unsigned int` rather than the non-standard `uint`
// typedef so the function also builds in strict ISO C mode.
void initialize_random_matrix(double *M, unsigned int n)
{
    // Compute the element count in size_t so that n*n does not wrap around
    // in 32-bit unsigned arithmetic where size_t is wider than unsigned int.
    const size_t count = (size_t)n * n;

    // Elements are filled in storage order, one rand() call per element.
    for (size_t k = 0; k < count; k++) {
        M[k] = (double)rand() / RAND_MAX;
    }
}
// Run `times` rounds of three chained DGEMM calls on n-by-n matrices.
//
// Each call computes  dst = alpha * A * src + beta * dst  with alpha = 1.2,
// beta = 0.001 and no transposition ('N', 'N'). Because beta is non-zero the
// previous contents of the destination are read, so B and C must both hold
// defined values on entry.
//
// A:     left-hand operand of every product; passed as an input operand only.
// B, C:  alternate between source and destination; both are overwritten.
// n:     matrix dimension; must fit in an int because the BLAS interface
//        takes int dimensions. Nothing is done when n is 0.
// times: number of rounds (three dgemm_ calls per round).
void do_calculation(double *A, double *B, double *C, unsigned int n, unsigned int times)
{
    char trans_a = 'N';
    char trans_b = 'N';
    double alpha = 1.2;
    double beta = 0.001;
    // dgemm_ takes int* for the dimensions; pass a real int instead of
    // reinterpreting the address of an unsigned variable.
    int dim = (int)n;

    if (n == 0) {
        // A leading dimension of 0 is not a valid DGEMM argument.
        return;
    }

    for (unsigned int round = 0; round < times; round++) {
        // C = alpha * A * B + beta * C
        dgemm_(&trans_a, &trans_b, &dim, &dim, &dim, &alpha, A, &dim, B, &dim, &beta, C, &dim);
        // B = alpha * A * C + beta * B
        dgemm_(&trans_a, &trans_b, &dim, &dim, &dim, &alpha, A, &dim, C, &dim, &beta, B, &dim);
        // C = alpha * A * B + beta * C
        dgemm_(&trans_a, &trans_b, &dim, &dim, &dim, &alpha, A, &dim, B, &dim, &beta, C, &dim);
    }
}
// Print the first and the last element of the n-by-n matrix M to stdout.
//
// M: buffer of at least n*n contiguous doubles; it is only read.
// n: matrix dimension. For n == 0 there is no element to show, so a
//    placeholder line is printed and M is not dereferenced.
void print_result(const double *M, unsigned int n)
{
    if (n == 0) {
        // With n == 0 an index of n*n - 1 would wrap around and read far
        // outside the buffer.
        printf("Results: (empty matrix)\n");
        return;
    }

    // size_t arithmetic keeps n*n from wrapping in 32-bit unsigned math
    // where size_t is wider than unsigned int.
    const size_t last = (size_t)n * n - 1;
    printf("Results: %lf %lf\n", M[0], M[last]);
}
// Parse a complete base-10 integer from a command-line argument.
// Returns 0 and stores the value in *out on success; returns -1 when the text
// is empty, has trailing characters, or does not fit in an int.
static int parse_int_arg(const char *text, int *out)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value < INT_MIN || value > INT_MAX) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

// Test driver: multiplies random n-by-n matrices through dgemm_, either in
// this process or in a forked child.
//
// Usage: <program> <n> <do_fork>
//   n        matrix dimension, must be > 0
//   do_fork  a value > 0 runs the calculation in a forked child
// With any other argument count the defaults n = 100, no fork are used.
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE on invalid arguments, allocation
// failure, fork failure, or when the child does not exit successfully.
int main(int argc, char *argv[])
{
    int n = 100;
    int do_fork = 0;
    int exit_code = EXIT_SUCCESS;

    srand((unsigned int)time(NULL));

    if (argc != 3) {
        printf("Using defaults\n");
    } else {
        int fork_arg = 0;
        if (parse_int_arg(argv[1], &n) != 0 || n <= 0 ||
            parse_int_arg(argv[2], &fork_arg) != 0) {
            fprintf(stderr, "Usage: %s <n > 0> <do_fork: 0 or 1>\n", argv[0]);
            return EXIT_FAILURE;
        }
        do_fork = fork_arg > 0;
    }

    // Work out the element count in size_t and reject sizes whose byte count
    // would overflow, instead of multiplying in (signed) int.
    const size_t dim = (size_t)n;
    const size_t size_max = (size_t)-1;
    if (dim > size_max / sizeof(double) / dim) {
        fprintf(stderr, "Matrix dimension %d is too large\n", n);
        return EXIT_FAILURE;
    }
    const size_t count = dim * dim;

    // A and B are fully overwritten by initialize_random_matrix below.
    // C is zero-initialized because dgemm_ is called with beta != 0 and
    // therefore reads the destination before writing it.
    double *A = malloc(count * sizeof *A);
    double *B = malloc(count * sizeof *B);
    double *C = calloc(count, sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Failed to allocate three %dx%d matrices\n", n, n);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // Fill A and B with random numbers
    initialize_random_matrix(A, n);
    initialize_random_matrix(B, n);
    // Write a marker value into A so the child can show it sees the parent's data
    A[n / 2] = 99.0;

    if (do_fork) {
        // Flush pending output so the child does not inherit and repeat it.
        fflush(NULL);

        // NOTE(review): the reported NVBLAS failure (cublasXtDgemm error=14)
        // shows up only on this path. Presumably the GPU runtime gets
        // initialized in the parent and is not usable in a fork()ed child;
        // confirm against NVIDIA's documentation. Starting a fresh process
        // (fork + exec) would avoid inheriting that state.
        pid_t pid = fork();
        if (pid < 0) {
            perror("fork");
            exit_code = EXIT_FAILURE;
        } else if (pid == 0) {
            // Child: run the calculation on the inherited copies of A, B, C.
            printf("Start calculation on forked process\n");
            printf("A[%d] = %lf\n", n / 2, A[n / 2]);
            do_calculation(A, B, C, n, 100);
            print_result(C, n);
        } else {
            // Parent: wait for the child and report an abnormal end.
            int status = 0;
            printf("Main process waiting for fork process completion\n");
            if (wait(&status) < 0) {
                perror("wait");
                exit_code = EXIT_FAILURE;
            } else if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
                fprintf(stderr, "Forked process did not finish successfully\n");
                exit_code = EXIT_FAILURE;
            }
        }
    } else {
        printf("Start calculation on main process\n");
        printf("A[%d] = %lf\n", n / 2, A[n / 2]);
        do_calculation(A, B, C, n, 100);
        print_result(C, n);
    }

    // Clean up (runs in both parent and child)
    free(A);
    free(B);
    free(C);
    return exit_code;
}
If I run the program:
- ./nvblas_mul 10000 0: without forking (argv[2] == 0), the program runs properly. The GPUs are used (I checked with nvidia-smi).
- ./nvblas_mul 10000 1: with forking (the child process is in charge of executing the calculation), the program fails. In the NVBLAS log I get:
[NVBLAS] dgemm[gpu]: ta=N, tb=N, m=10000, n=10000, k=10000 | GPU GI CI PID Type Process name GPU Memory |
[NVBLAS] cublasXtDgemm failed with error=14
If I build the same program with OpenBLAS only (so the calculation is obviously not offloaded to the GPUs), everything works, including in the forked child process.
Thanks for your help.