use gpu and cpu with c language

I want to use the GPU and the CPU from C, with MPI handling the inter-node communication. I have run into a problem — please help if you can. Thank you.

MM.h

#include <stdio.h>

#include <stdlib.h>

#include <sys/time.h>

#include <dlfcn.h>

#include <omp.h>

#include <cuda.h>

#include <cuda_runtime_api.h>

#include "mpi.h"

/* Column-major 2-D indexing: linear offset of element (i, j) in a matrix
   stored with leading dimension ld (BLAS/cuBLAS layout). */
#define IDX2C(i, j, ld) (((j)*(ld))+(i))

main.c

#include "MM.h"

int main(int argc, char **argv)

{

	int rank;

	MPI_Status status;

	MPI_Init(&argc,&argv);

	MPI_Comm_rank(MPI_COMM_WORLD,&rank);

	assignDeviceToProcess();

	MPI_Finalize();

}

cuda.c

#define _GNU_SOURCE

#include <stdlib.h>

#include <stdio.h>

#include <stddef.h>

#include <string.h>

#include <dlfcn.h>

#include <ctype.h>

#include <mpi.h>

#include "cuda_runtime.h"

#include "cublas.h"

/* Min/max of two values.  NOTE: each macro evaluates one argument twice,
   so do not pass expressions with side effects (e.g. imin(i++, n)). */
#define imin(a,B) (((a)<(B))?(a):(B))

#define imax(a,B) (((a)<(B))?(B):(a))

#include <time.h>

#include <sys/types.h>

#include <sys/times.h>

#include <sys/time.h>

/* Device-side scratch buffer, allocated once by assignDeviceToProcess(). */
double *dev_scratch;

	/* Guard so the device assignment below runs only on the first call. */
	static int first_time=1;

	/* Rank of this process within its node-local communicator; it doubles
	   as the CUDA device index passed to cudaSetDevice(). */
	static int myrank=0;

/*
 * qsort() comparator for the host-name table: both arguments point at
 * NUL-terminated strings, compared lexicographically via strcmp().
 */
int stringCmp( const void *a, const void *B)
{
	const char *lhs = a;
	const char *rhs = B;

	return strcmp(lhs, rhs);
}

/*
 * Map this MPI process to a CUDA device on its node.
 *
 * Every rank broadcasts its host name; ranks are grouped into per-node
 * communicators (MPI_Comm_split keyed by a host-name "color"), and each
 * process selects the CUDA device whose index equals its rank within the
 * node-local communicator.  Runs only once; later calls are no-ops.
 */
void  assignDeviceToProcess()
{
	char host_name[MPI_MAX_PROCESSOR_NAME];
	char (*host_names)[MPI_MAX_PROCESSOR_NAME];
	int n, namelen, color, rank, nprocs;
	size_t bytes;
	MPI_Comm nodeComm;
	int err1;

	/* Check if the device has already been assigned */
	if (first_time)
	{
		first_time = 0;

		MPI_Comm_rank(MPI_COMM_WORLD, &rank);
		MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
		MPI_Get_processor_name(host_name, &namelen);

		/* Gather every rank's host name: one broadcast per rank. */
		bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
		host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
		if (host_names == NULL)
		{
			printf("assignDeviceToProcess: out of memory on rank %d\n", rank);
			return;
		}

		strcpy(host_names[rank], host_name);
		for (n = 0; n < nprocs; n++)
		{
			MPI_Bcast(host_names[n], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
		}

		/* Sort so equal host names are adjacent, then derive this host's
		   "color": its index among the distinct sorted names.
		   BUG FIX: the match test must also run for n == 0.  The old loop
		   started at n = 1 and incremented color before testing for a
		   match, so the alphabetically first host never broke out early
		   and ended up with the SAME color as the other hosts.  All
		   processes then fell into one node communicator, the second one
		   called cudaSetDevice(1) on a single-GPU node, and the following
		   cublasInit() segfaulted — exactly the trace reported above. */
		qsort(host_names, nprocs, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
		color = 0;
		for (n = 0; n < nprocs; n++)
		{
			if (n > 0 && strcmp(host_names[n-1], host_names[n]) != 0) color++;
			if (strcmp(host_name, host_names[n]) == 0) break;
		}
		free(host_names);

		/* Split into per-node communicators; the rank within the node
		   communicator becomes the CUDA device index. */
		MPI_Comm_split(MPI_COMM_WORLD, color, 0, &nodeComm);
		MPI_Comm_rank(nodeComm, &myrank);

		printf ("Assigning device %d  to process on node %s rank %d \n",myrank,  host_name, rank );

		/* Assign device to MPI process, initialize BLAS and allocate
		   scratch space for the library on the device.  Bail out on a
		   CUDA error instead of continuing with a broken context. */
		err1 = cudaSetDevice(myrank);
		if (err1)
		{
			printf("Error %d selecting device %d on %s rank %d\n", err1, myrank, host_name, rank);
			return;
		}
		cublasInit();

		/* NOTE(review): 3.6 * 134217728 doubles is ~3.8 GB of device
		   memory — confirm the target GPUs actually have that much. */
		err1 = cudaMalloc ((void**)&dev_scratch, 3.6*134217728 * sizeof(dev_scratch[0]));
		if (err1 ) printf ("Error allocating scratch space %f on %s rank %d device %d\n",3.6*134217728, host_name, rank, myrank);
	}
}

makefile

DEST=exe

CC=mpicc

RUN=mpirun

#INCdirCUDA=/home/test/dgemm_test/cuda /usr/local/cuda/include 

INCdirCUDA=/usr/local/cuda/include

INCdirMKL=/opt/intel/Compiler/11.1/056/mkl/include

INCdirATLAS=/opt/atlas/include

LIBdir=/usr/local/cuda/lib64

# cublas drags in cudart, but link it explicitly so the dependency is visible
Lflag= -lcublas -lcudart

INCFLAGS=-I$(INCdirCUDA) -I$(INCdirMKL) -I$(INCdirATLAS)

$(DEST): main.o  cuda.o 
	$(CC) -o $(DEST) main.o cuda.o  libgoto2_nehalemp-r1.06.a  -L$(LIBdir) $(Lflag)

# main.c includes MM.h, so rebuild when the header changes
main.o: main.c MM.h
	$(CC) main.c -c $(INCFLAGS)

cuda.o: cuda.c
	$(CC) cuda.c -c $(INCFLAGS)

# run/clean are not files: mark them phony so stray files named
# "run"/"clean" cannot shadow them
.PHONY: run clean

# run must depend on the binary so "make run" builds it first
run: $(DEST)
	$(RUN) -hostfile hostfile $(DEST)

clean:
	rm -f *.o $(DEST)

hostfile

ypc0900 slots=1

ypc0901 slots=1

After compiling, when I execute it, the error is:

t@ypc0900 init]$ make run

mpirun -hostfile hostfile exe

Assigning device 0 to process on node ypc0900.comp.is.uec.ac.jp rank 0

Assigning device 1 to process on node ypc0901.comp.is.uec.ac.jp rank 1

[ypc0900:14863] *** Process received signal ***

[ypc0900:14863] Signal: Segmentation fault (11)

[ypc0900:14863] Signal code: Address not mapped (1)

[ypc0900:14863] Failing at address: (nil)

[ypc0901:13503] *** Process received signal ***

[ypc0901:13503] Signal: Segmentation fault (11)

[ypc0901:13503] Signal code: Address not mapped (1)

[ypc0901:13503] Failing at address: (nil)

[ypc0900:14863] [ 0] /lib64/libpthread.so.0 [0x3452c0e4c0]

[ypc0900:14863] [ 1] /usr/lib64/libcuda.so [0x2b2d31719980]

[ypc0900:14863] [ 2] /usr/lib64/libcuda.so [0x2b2d3171f3c4]

[ypc0900:14863] [ 3] /usr/lib64/libcuda.so [0x2b2d316ef557]

[ypc0900:14863] [ 4] /usr/lib64/libcuda.so [0x2b2d3169acf7]

[ypc0900:14863] [ 5] /usr/lib64/libcuda.so [0x2b2d316ac52b]

[ypc0900:14863] [ 6] /usr/lib64/libcuda.so [0x2b2d31691940]

[ypc0900:14863] [ 7] /usr/lib64/libcuda.so [0x2b2d3168aa8a]

[ypc0900:14863] [ 8] /usr/lib64/libcuda.so(cuCtxCreate+0x57) [0x2b2d316e5187]

[ypc0900:14863] [ 9] /usr/local/cuda/lib64/libcudart.so.2 [0x2b2d2c744aa2]

[ypc0900:14863] [10] /usr/local/cuda/lib64/libcudart.so.2 [0x2b2d2c74528c]

[ypc0900:14863] [11] /usr/local/cuda/lib64/libcudart.so.2(cudaFree+0x2d) [0x2b2d2c7283dd]

[ypc0900:14863] [12] /usr/local/cuda/lib64/libcublas.so.2(cublasInitCtx+0x30) [0x2b2d2ae30110]

[ypc0900:14863] [13] /usr/local/cuda/lib64/libcublas.so.2 [0x2b2d2ae7a1f7]

[ypc0900:14863] [14] /usr/local/cuda/lib64/libcublas.so.2(cublasInit+0x50) [0x2b2d2ae302b0]

[ypc0900:14863] [15] exe(assignDeviceToProcess+0x1e6) [0x402276]

[ypc0900:14863] [16] exe(main+0x57) [0x402047]

[ypc0900:14863] [17] /lib64/libc.so.6(__libc_start_main+0xf4) [0x345201d974]

[ypc0900:14863] [18] exe [0x401f39]

[ypc0900:14863] *** End of error message ***

[ypc0901:13503] [ 0] /lib64/libpthread.so.0 [0x380240e4c0]

[ypc0901:13503] [ 1] /usr/lib64/libcuda.so [0x2b151cb48980]

[ypc0901:13503] [ 2] /usr/lib64/libcuda.so [0x2b151cb4e3c4]

[ypc0901:13503] [ 3] /usr/lib64/libcuda.so [0x2b151cb1e557]

[ypc0901:13503] [ 4] /usr/lib64/libcuda.so [0x2b151cac9cf7]

[ypc0901:13503] [ 5] /usr/lib64/libcuda.so [0x2b151cadb52b]

[ypc0901:13503] [ 6] /usr/lib64/libcuda.so [0x2b151cac0940]

[ypc0901:13503] [ 7] /usr/lib64/libcuda.so [0x2b151cab9a8a]

[ypc0901:13503] [ 8] /usr/lib64/libcuda.so(cuCtxCreate+0x57) [0x2b151cb14187]

[ypc0901:13503] [ 9] /usr/local/cuda/lib64/libcudart.so.2 [0x2b1517968aa2]

[ypc0901:13503] [10] /usr/local/cuda/lib64/libcudart.so.2 [0x2b151796928c]

[ypc0901:13503] [11] /usr/local/cuda/lib64/libcudart.so.2(cudaFree+0x2d) [0x2b151794c3dd]

[ypc0901:13503] [12] /usr/local/cuda/lib64/libcublas.so.2(cublasInitCtx+0x30) [0x2b1516054110]

[ypc0901:13503] [13] /usr/local/cuda/lib64/libcublas.so.2 [0x2b151609e1f7]

[ypc0901:13503] [14] /usr/local/cuda/lib64/libcublas.so.2(cublasInit+0x50) [0x2b15160542b0]

[ypc0901:13503] [15] exe(assignDeviceToProcess+0x1e6) [0x402276]

[ypc0901:13503] [16] exe(main+0x57) [0x402047]

[ypc0901:13503] [17] /lib64/libc.so.6(__libc_start_main+0xf4) [0x380181d974]

[ypc0901:13503] [18] exe [0x401f39]

[ypc0901:13503] *** End of error message ***


mpirun noticed that process rank 1 with PID 13503 on node ypc0901 exited on signal 11 (Segmentation fault).


make: *** [run] エラー 139

Thank you for your patience.
init.rar (676 KB)