CUDA Multi GPU matrix multiplication advice

Hi folks,

in preparation for my bachelor thesis i’ve been working on a matrix matrix multiplication implementation on a multi gpu basis in order to get some reference times, so i came up with the following code based on the multi gpu cuda sample.

I’d really appreciate it, if you would take a look and provide any further suggestions.

Best regards

#include "matMulMultiGPU.h"
const int MAX_GPU_COUNT = 3;
int main() {
	int block_size = 32;
	GPUplan plans[MAX_GPU_COUNT];
	size_t ptxSize;
	char *filename = "./matMul.cu";
	dim3 dimsA(8192, 8192, 1);
	dim3 dimsB(8192, 8192, 1);
	dim3 dimsC(dimsB.x, dimsA.y, 1);
	dim3 threads(block_size, block_size);
	dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
	unsigned int size_A = dimsA.x * dimsA.y;
	unsigned int size_B = dimsA.x * dimsA.y;
	unsigned int mem_size_A = sizeof(float) * size_A;
	unsigned int mem_size_B = sizeof(float) * size_B;
	unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);

	float* h_mat_a, *h_mat_b, *h_mat_c;

	std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
	std::streampos pos = inputFile.tellg();
	size_t inputSize = (size_t)pos;
	char * memBlock = new char[inputSize + 1];
	inputFile.seekg(0, std::ios::beg);
	inputFile.read(memBlock, inputSize);
	inputFile.close();
	memBlock[inputSize] = '\x0';

	nvrtcProgram prog;
	nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL);
	nvrtcCompileProgram(prog, 0, NULL);
	nvrtcGetPTXSize(prog, &ptxSize);

	char *ptx = (char *)malloc(sizeof(char) * ptxSize);
	nvrtcGetPTX(prog, ptx);
	nvrtcDestroyProgram(&prog);

	CUmodule module;
	cuInit(0);
	//Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked) 
	int nrows = dimsA.y / MAX_GPU_COUNT;
	int lastrows = dimsA.y - nrows*(MAX_GPU_COUNT - 1);
	for (int i = 0; i < MAX_GPU_COUNT; i++)
	{	
		CUDA_ERROR_CHECK(cuDeviceGet(&plans[i].device, i));
		CUDA_ERROR_CHECK(cuCtxCreate(&plans[i].context, 0, plans[i].device));
		CUDA_ERROR_CHECK(cuCtxSetCurrent(plans[i].context));
		CUDA_ERROR_CHECK(cuStreamCreate(&plans[i].stream, CU_STREAM_NON_BLOCKING));
		int rows = nrows;
		if (i == MAX_GPU_COUNT - 1) {
			rows = lastrows;
			rows = (rows + block_size - 1);
		}
		CUDA_ERROR_CHECK(cuMemAlloc(&plans[i].d_mat_a, rows * dimsA.x * sizeof(float)));
		CUDA_ERROR_CHECK(cuMemAlloc(&plans[i].d_mat_b, mem_size_B));
		CUDA_ERROR_CHECK(cuMemAlloc(&plans[i].d_mat_c, rows * dimsC.x * sizeof(float)));
		CUDA_ERROR_CHECK(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
		CUDA_ERROR_CHECK(cuModuleGetFunction(&plans[i].kernel_addr, module, "matrixMul"));
		if (i == 0) {
			CUDA_ERROR_CHECK(cuMemHostAlloc((void**)&h_mat_a, mem_size_A, CU_MEMHOSTALLOC_PORTABLE));
			CUDA_ERROR_CHECK(cuMemHostAlloc((void**)&h_mat_b, mem_size_B, CU_MEMHOSTALLOC_PORTABLE));
			CUDA_ERROR_CHECK(cuMemHostAlloc((void**)&h_mat_c, mem_size_C, CU_MEMHOSTALLOC_PORTABLE));

			for (int i = 0; i < dimsA.x * dimsA.y; i++) {
				h_mat_a[i] = 1.0f;
			}
			for (int i = 0; i < dimsB.x * dimsB.y; i++) {
				h_mat_b[i] = 1.0f;
			}
		}
	}

	// Data transfer and kernel execution
	for (int i = 0; i < MAX_GPU_COUNT; i++)
	{
		int rows = nrows;
		int grid_y = nrows / threads.y;
		if (i == MAX_GPU_COUNT - 1) {
			rows = lastrows;
			grid_y = (rows + threads.y - 1) / threads.y;
		}
		int offset = dimsA.x *( nrows * i);
		//Set device
		CUDA_ERROR_CHECK(cuCtxSetCurrent(plans[i].context));
		//Copy input data from CPU
		CUDA_ERROR_CHECK(cuMemcpyHtoDAsync(plans[i].d_mat_a, &(h_mat_a[offset]), rows * dimsA.x * sizeof(float), plans[i].stream));
		CUDA_ERROR_CHECK(cuMemcpyHtoDAsync(plans[i].d_mat_b, h_mat_b, mem_size_B, plans[i].stream));
		void *arr[] = { (void *)&plans[i].d_mat_a, (void *)&plans[i].d_mat_b, (void *)&plans[i].d_mat_c, (void *)&dimsA.x, (void *)&dimsB.x };
		CUDA_ERROR_CHECK(cuLaunchKernel(plans[i].kernel_addr,
				grid.x, grid_y, grid.z,
				threads.x, threads.y, threads.z,
				0, plans[i].stream,
				&arr[0],
				0));
		CUDA_ERROR_CHECK(cuMemcpyDtoHAsync(&(h_mat_c[offset]), plans[i].d_mat_c, rows * dimsA.x * sizeof(float), plans[i].stream));
	}

	for (int i = 0; i < MAX_GPU_COUNT; i++) {
		CUDA_ERROR_CHECK(cuCtxSetCurrent(plans[i].context));
		CUDA_ERROR_CHECK(cuStreamSynchronize(plans[i].stream));
	}
	std::cout << "First: " << h_mat_c[0] << std::endl;
	std::cout << "Last: " << h_mat_c[dimsC.x * dimsC.y - 1] << std::endl;
	//Clean up
	for (int i = 0; i < MAX_GPU_COUNT; i++) {
		CUDA_ERROR_CHECK(cuMemFree(plans[i].d_mat_a));
		CUDA_ERROR_CHECK(cuMemFree(plans[i].d_mat_b));
		CUDA_ERROR_CHECK(cuMemFree(plans[i].d_mat_c));
		CUDA_ERROR_CHECK(cuStreamDestroy(plans[i].stream));
		CUDA_ERROR_CHECK(cuCtxDestroy(plans[i].context));
	}

	return 0;
}

If I were doing multi-GPU matrix multiply, I would use cublasXt:

http://docs.nvidia.com/cuda/cublas/index.html#cublasxt_gemm

First of all thanks for the reply.

To be more specific, besides using cublasXt for a matrix multiplication, are there some major flaws in my usage of the driver api e.g. some unnecessary api calls?