#include #include #include #include #include #include #define CUCHK(call) { \ cudaError_t err = call; \ if(cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString(err)); \ fflush(stderr); \ exit(EXIT_FAILURE); \ }} using namespace std; __global__ void kernel(int N, float *array){ int index = blockIdx.x*blockDim.x + threadIdx.x; int stride = blockDim.x*gridDim.x; for(int i = index; i < N; i += stride) array[i] += 2; } int main(int argc, char** argv){ MPI_Init(&argc, &argv); int size; MPI_Comm_size(MPI_COMM_WORLD, &size); int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); int n = 1024*1024*10; int bytes = n*sizeof(float); int npass = atoi(argv[1]); //allocate host array float *ha1, *ha2; CUCHK(cudaMallocHost((void**)&ha1, bytes)); CUCHK(cudaMallocHost((void**)&ha2, bytes)); for(int i = 0; i < n; i++) ha1[i] = i; //allocate device array float *da1, *da2; cudaSetDevice(rank+1); CUCHK(cudaMalloc((void**)&da1, bytes)); CUCHK(cudaMemcpy(da1, ha1, bytes, cudaMemcpyHostToDevice)); kernel <<< 256, 1024 >>> (n, da1); CUCHK(cudaMalloc((void**)&da2, bytes)); CUCHK(cudaDeviceSynchronize()); MPI_Barrier(MPI_COMM_WORLD); CUCHK(cudaProfilerStart()); //=== 1. normal mpi auto start = std::chrono::high_resolution_clock::now(); for(int i = 0; i < npass; ++i){ if(rank == 0){ CUCHK(cudaMemcpy(ha1, da1, bytes, cudaMemcpyDeviceToHost)); MPI_Send(ha1, n, MPI_FLOAT, 1, 0, MPI_COMM_WORLD); } if(rank == 1){ MPI_Recv(ha2, n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); CUCHK(cudaMemcpy(da2, ha2, bytes, cudaMemcpyHostToDevice)); } } CUCHK(cudaDeviceSynchronize()); MPI_Barrier(MPI_COMM_WORLD); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed1 = end - start; cout << "Time spent in normal MPI: " << elapsed1.count() << " seconds" << endl; //=== 2. cuda-aware mpi MPI_Barrier(MPI_COMM_WORLD); start = std::chrono::high_resolution_clock::now(); for(int i = 0; i < npass; ++i){ if(rank == 0) MPI_Send(da1, n, MPI_FLOAT, 1, 0, MPI_COMM_WORLD); if(rank == 1) MPI_Recv(da2, n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } CUCHK(cudaDeviceSynchronize()); MPI_Barrier(MPI_COMM_WORLD); end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed2 = end - start; cout << "Time spent in cuda-aware MPI: " << elapsed2.count() << " seconds" << endl; CUCHK(cudaProfilerStop()); //QC CUCHK(cudaMemcpy(ha2, da2, bytes, cudaMemcpyDeviceToHost)); float error = -1; if(rank == 1 ) for(int i = 0; i < n; i++) if(error < fabs(ha2[i] - i - 2)) error = fabs(ha2[i] - i - 2); if(rank == 1) cout << "max error = " << error << endl; MPI_Finalize(); }