Summary
General questions to multi-GPU programming and communication between GPUs. Primary focus: malloc, setDevice, memcpy, synchronize
Relevant Area
Multi GPU programming, multi GPU matrix multiplication
Description
General understanding concerning the usage of cudaSetDevice, the implications of switching devices, correct usage of memcpy (cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, …), correct usage of cudaDeviceSynchronize, …
NVIDIA GPU or System
P5000
NVIDIA Software Version
NVIDIA-SMI 418.87.00 Driver Version: 418.87.00 CUDA Version: 10.1
OS
Linux Fedora 29
Other Details
// Setup: for the sake of completeness only
int numGPU=2;
int primaryGPU = 0;
checkCudaErrors(cudaSetDevice(primaryGPU));
multi_A22 [2] = {/* Matrix 0 */, //each matrix 20x40, 2 halves of a bigger 40x40 matrix
/* Matrix 1 */}; //sparse matrix, ~3 values per row
int separatorRow[3] = (0, 20, 40); //defines at which row the matrix was cut
int separatorVal[3] = (0, 60, 120); //defines at which value the matrix was cut
int multi_M[2] = (20, 20); //number of rows per Matrix
int multi_N[2] = (40, 40); //number of columns per Matrix
int multi_nz[2] = (60, 60); //number of non-zeros per Matrix
for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
multi_I[curGPU] = multi_A22[curGPU].outerIndexPtr(); //rowPtr (size +1)
multi_J[curGPU] = multi_A22[curGPU].innerIndexPtr(); //colPtr (size nz)
multi_val[curGPU] = multi_A22[curGPU].valuePtr(); //valuesPtr (size nz)
}
checkCudaErrors(cudaSetDevice(primaryGPU));
cudaDeviceSynchronize();
///Code that is needed on the primary gpu
cudaMemcpy(d_x, x, N * sizeof(double), cudaMemcpyHostToDevice);
checkCudaErrors(cudaMalloc((void **) &d_x, N * sizeof(double)));
///Code that needs to be run on the individual GPUs
for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
checkCudaErrors(cudaSetDevice(curGPU));
cudaDeviceSynchronize();
checkCudaErrors(cudaMalloc((void **) &multi_d_row[curGPU], (multi_M[curGPU] + 1) * sizeof(int)));
checkCudaErrors(cudaMalloc((void **) &multi_d_col[curGPU], multi_nz[curGPU] * sizeof(int)));
checkCudaErrors(cudaMalloc((void **) &multi_d_val[curGPU], multi_nz[curGPU] * sizeof(double)));
checkCudaErrors(cudaMalloc((void **) &multi_d_Ax[curGPU], multi_M[curGPU] * sizeof(double)));
checkCudaErrors(cudaMalloc((void **) &multi_d_p[curGPU], multi_N[curGPU] * sizeof(double)));
}
for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
checkCudaErrors(cudaSetDevice(curGPU));
cudaDeviceSynchronize();
cudaMemcpy(multi_d_row[curGPU], multi_I[curGPU], (multi_M[curGPU] + 1) * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(multi_d_col[curGPU], multi_J[curGPU], multi_nz[curGPU] * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(multi_d_val[curGPU], multi_val[curGPU], multi_nz[curGPU] * sizeof(double),
cudaMemcpyHostToDevice);
}
checkCudaErrors(cudaSetDevice(curGPU));
cudaDeviceSynchronize();
///Questions:
/*
* 1) When is cudaDeviceSynchronize necessary?
* 2) Does cudaSetDevice before an operation result in the operation being executed on the given GPU?
* a) Is the setup above done correctly?
* b) Which operations are required to enable functions like cusparseDcsrmv to be run on the individual GPU?
* 3) Is the cudaMemcpyDeviceToDevice handled correctly?
*/
// the relevant part
///Code that is needed on the primary gpu
checkCudaErrors(cudaMalloc((void ) &d_Ax, N * sizeof(double)));
///Code that needs to be run on the individual GPUs
for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
checkCudaErrors(cudaSetDevice(curGPU));
cudaDeviceSynchronize();
checkCudaErrors(cudaMalloc((void **) &multi_d_Ax[curGPU], multi_M[curGPU] * sizeof(double)));
}
checkCudaErrors(cudaSetDevice(primaryGPU));
//1)
cudaDeviceSynchronize();
//2)
double alpha = 1.0;
double beta = 0.0;
for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
checkCudaErrors(cudaSetDevice(curGPU));
cudaDeviceSynchronize();
cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, multi_M[curGPU], multi_N[curGPU],
multi_nz[curGPU], &alpha, descr, multi_d_val[curGPU], multi_d_row[curGPU],
multi_d_col[curGPU], d_x, &beta, multi_d_Ax[curGPU]);
}
// 3)
for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
checkCudaErrors(cudaSetDevice(curGPU));
cudaDeviceSynchronize();
double *dst = d_Ax + separatorRow[curGPU];
double *src = multi_d_Ax[curGPU];
size_t count = separatorRow[curGPU + 1] - separatorRow[curGPU];
if (curGPU == primaryGPU) {
cublasStatus = cublas_copy(cublasHandle, count, src, 1, dst, 1);
} else {
cudaMemcpy(dst, src, count * sizeof(double), cudaMemcpyDeviceToDevice);
}
}
}