!> MPI + CUDA smoke test.
!> Every rank initialises two 2-element integer vectors, hands them to the
!> C-linkage routine `add_` (which fills `c`), and rank 0 prints `c`.
program hello_world
  use, intrinsic :: iso_c_binding, only: c_int
  implicit none
  include 'mpif.h'

  integer :: ierr, numprocs, myid
  integer(c_int) :: a(2), b(2), c(2)

  ! Explicit interface to the C/CUDA wrapper. Binding to the literal symbol
  ! `add_` removes the dependence on the Fortran compiler's name-mangling
  ! convention and lets the compiler check the argument types.
  interface
    subroutine add(a, b, c) bind(c, name='add_')
      import :: c_int
      integer(c_int), intent(inout) :: a(2), b(2), c(2)
    end subroutine add
  end interface

  a = 2
  b = 1
  c = 0

  call MPI_INIT(ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
  print *, 'myid=', myid, numprocs

  ! All ranks perform the addition; only rank 0 reports the result below.
  call add(a, b, c)

  if (myid == 0) then
    print *, 'c=', c(1), c(2)
  end if

  call MPI_FINALIZE(ierr)
end program hello_world
#include<stdio.h>
#include<cuda_runtime.h>
/* Kernel: element-wise integer addition of two 2-element arrays,
 * c_device[i] = a_device[i] + b_device[i].
 * Threads whose global index is 2 or larger do nothing. */
extern "C" __global__ void add1(int *a_device, int *b_device, int *c_device)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    /* Guard: the arrays hold exactly two elements. */
    if (idx >= 2) {
        return;
    }

    c_device[idx] = a_device[idx] + b_device[idx];
}
/* Report a failed CUDA runtime call on stderr; returns true on success. */
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/* Host wrapper: computes c = a + b for 2-element int arrays on the GPU and
 * prints the elapsed time of the transfers plus the kernel.
 *
 * a, b : host input arrays of 2 ints (left unmodified).
 * c    : host array of 2 ints; overwritten with a + b on success and left
 *        untouched if any CUDA call fails (the failure is reported on stderr).
 *
 * The trailing underscore in the name is what the Fortran `call add(...)` is
 * expected to resolve to under the compiler's external-name mangling. */
extern "C" void add_(int *a, int *b, int *c)
{
    const size_t bytes = 2 * sizeof(int);
    int *a_device = NULL, *b_device = NULL, *c_device = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    float mstimer = 0.0f;

    /* No cudaMemset needed: each buffer is fully overwritten by the
     * host-to-device copies below. */
    bool ok = cuda_ok(cudaMalloc(&a_device, bytes), "cudaMalloc(a)")
           && cuda_ok(cudaMalloc(&b_device, bytes), "cudaMalloc(b)")
           && cuda_ok(cudaMalloc(&c_device, bytes), "cudaMalloc(c)")
           && cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cuda_ok(cudaEventCreate(&stop), "cudaEventCreate(stop)")
           /* The start event must be recorded, otherwise
            * cudaEventElapsedTime() fails and the timer value is garbage. */
           && cuda_ok(cudaEventRecord(start), "cudaEventRecord(start)")
           && cuda_ok(cudaMemcpy(a_device, a, bytes, cudaMemcpyHostToDevice), "copy a to device")
           && cuda_ok(cudaMemcpy(b_device, b, bytes, cudaMemcpyHostToDevice), "copy b to device")
           && cuda_ok(cudaMemcpy(c_device, c, bytes, cudaMemcpyHostToDevice), "copy c to device");

    if (ok) {
        add1 <<<1,2>>> (a_device, b_device, c_device);

        /* A kernel launch returns no status; query it explicitly so that a
         * failed launch (e.g. no matching device code) is not silent.
         * Only c is copied back: the kernel never writes a or b. */
        ok = cuda_ok(cudaGetLastError(), "add1 kernel launch")
          && cuda_ok(cudaMemcpy(c, c_device, bytes, cudaMemcpyDeviceToHost), "copy c to host")
          && cuda_ok(cudaEventRecord(stop), "cudaEventRecord(stop)")
          && cuda_ok(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
          && cuda_ok(cudaEventElapsedTime(&mstimer, start, stop), "cudaEventElapsedTime");
    }

    if (ok) {
        printf("CUDA: time= %g(s)\n",mstimer*1.e-3) ;
    }

    /* Release everything that was actually created, on success or failure. */
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    cudaFree(a_device);
    cudaFree(b_device);
    cudaFree(c_device);
}
makefile:
# Builds the Fortran/MPI driver and the CUDA wrapper into $(EXENAME).
CUDA_INSTALL_PATH = /usr/local/cuda-10.1
MPI_INSTALL_PATH  = /opt/mpich2-1.4.1p1
NVCC    = $(CUDA_INSTALL_PATH)/bin/nvcc
MPIF90  = $(MPI_INSTALL_PATH)/bin/mpif90
# Launcher from the same MPI installation as the compiler wrapper.
# NOTE(review): starting the binary with a launcher from a different MPI
# installation typically makes every process report rank 0 of 1 - confirm
# which mpiexec/mpirun is on PATH when running by hand.
MPIEXEC = $(MPI_INSTALL_PATH)/bin/mpiexec
# Number of MPI processes started by `make run`.
NP      = 2
LIBS    = -L$(CUDA_INSTALL_PATH)/lib64 -lcudart -lcurand
FFILES  = hello_world.f90
CUFILES = add.cu
OBJECTS = hello_world.o add.o
EXENAME = test.x

.PHONY: all run clean

all: $(EXENAME)

# Libraries go after the objects so linkers that resolve symbols left to
# right (or use --as-needed) still pull in the CUDA runtime.
$(EXENAME): $(OBJECTS)
	$(MPIF90) -o $(EXENAME) $(OBJECTS) $(LIBS) -lstdc++

hello_world.o: $(FFILES)
	$(MPIF90) -c $(FFILES)

add.o: $(CUFILES)
	$(NVCC) -arch=sm_35 -c $(CUFILES)

run: $(EXENAME)
	$(MPIEXEC) -n $(NP) ./$(EXENAME)

clean:
	rm -f *.o
	rm -f $(EXENAME)
Problem: The program compiles successfully, but when I run it, only the main process (rank 0) appears to be working. I would like to solve this problem…