I have a routine that performs a reduce + update across n equally-dimensioned arrays (one per GPU, n GPUs total). The code uses OpenMP, and each GPU is driven by its own thread. I have noticed that the code fails when P2P is enabled, but works fine if P2P is disabled, or if I replace the cudaMemcpyAsync() with an equivalent cublasScopy().
I’m considering filing a bug report, but first I was wondering whether anyone has insight into why this isn’t working.
For context: darray is a device-array container, and accessup[n] is 1 if GPU n has P2P access enabled to GPU (n+1)%ngpu.
// Ring-style reduce + broadcast of this darray's data across all GPUs.
//
// Must be called from inside an OpenMP parallel region with exactly one thread
// per GPU; each thread's CUDA device is presumably already selected via
// cudaSetDevice() before entry (not shown here — TODO confirm at call site).
//
// Structure (as read from the loop below): 2*(ngpu-1) phases. In the first
// ngpu-1 phases (cond == true) each GPU accumulates a chunk pulled from its
// ring neighbour into its own data (reduce); in the remaining ngpu-1 phases
// (cond == false) each GPU copies the finished chunk from its neighbour
// (broadcast/allgather). Phase ordering across GPUs is enforced by the
// cudaDeviceSynchronize() + omp barrier pair at the end of every phase.
//
// Parameters:
//   buffer - scratch darray used as a staging area for chunks pulled from the
//            neighbour when P2P is NOT available. NOTE(review): every thread
//            calls reshape() on what appears to be the SAME shared buffer
//            object concurrently — verify reshape() is idempotent/thread-safe
//            here, or hoist it before the parallel region.
void darray::update(darray &buffer)
{
int n = omp_get_thread_num();
int ngpu = omp_get_num_threads();
// Ring neighbour this GPU pulls data from.
int next=(n+1)%ngpu;
int len = dims0*dims1;
int chunk = (len + ngpu - 1) / ngpu; //number of floats per transfer (ceil-div; last chunk may be short or empty)
// Per-thread stream on this thread's current device; also attached to this
// GPU's cuBLAS handle so the saxpy below is ordered after the async copy.
cudaStream_t stream;
CUDA(cudaStreamCreate(&stream));
CUBLAS(cublasSetStream(cbh_p[n],stream));
buffer.reshape(dims0, dims1, false);
// Publish this GPU's data pointer so the neighbour thread can read it.
// NOTE(review): for this_ptr[n] to differ per thread, `data` must be a
// per-instance (per-GPU) pointer while this_ptr is shared across threads
// (e.g. a static member) — TODO confirm; if all threads share one instance,
// every slot would hold the same pointer.
this_ptr[n] = data;
// Sync + barrier so all this_ptr[] entries and prior device work are
// complete/visible before any thread dereferences a neighbour's pointer.
CUDA(cudaDeviceSynchronize());
_Pragma("omp barrier");
float *ptr = this_ptr[n];
float *ptr_next = this_ptr[next];
float *tmp = buffer.data;
for (int p = 0; p <2*(ngpu - 1); p++) {
// Which chunk this GPU handles in this phase; chunks rotate around the ring.
int part = (n + p) % ngpu;
int start = part * chunk;
int end = start + chunk;
if (end > len) end = len;
// First ngpu-1 phases: reduce (accumulate). Remaining phases: plain copy.
bool cond = (p < ngpu -1);
int elem = end - start;
if (elem>0)
{
int bytes = elem * sizeof(float);
// Pull the neighbour's chunk. During reduce (cond) with P2P available the
// copy is skipped entirely and saxpy reads peer memory directly; otherwise
// stage into tmp (reduce) or write straight into our data (broadcast).
// NOTE(review): `cond&accessup[n]` is a bitwise AND of bool and int — fine
// only if accessup[] is strictly 0/1, as stated; `&&` would be safer.
// NOTE(review): with P2P enabled this cudaMemcpyAsync becomes a true
// device-to-device peer copy. The code assumes cudaDeviceSynchronize()
// on each device plus the omp barrier fully orders these cross-device
// reads/writes between phases — this is exactly the path reported to
// fail with P2P on, so that ordering assumption deserves scrutiny
// (cudaMemcpyPeerAsync, or an event-based cross-stream dependency, may
// be the more clearly specified alternative).
if (!(cond&accessup[n])) CUDA(cudaMemcpyAsync(cond ? tmp + start : ptr + start, ptr_next + start, bytes, cudaMemcpyDefault,stream));
float alpha = 1;
// Reduce step: ptr += (peer chunk), read either directly from the peer
// (P2P) or from the staged copy in tmp. Ordered after the copy because
// the cuBLAS handle shares `stream`.
if (cond) CUBLAS(cublasSaxpy(cbh_p[n], elem, &alpha, accessup[n] ? ptr_next + start : tmp + start, 1, ptr + start, 1));
}
// End-of-phase fence: wait for this device's copy/saxpy, then rendezvous
// with the other threads so no GPU starts phase p+1 early.
// NOTE(review): a per-phase cudaDeviceSynchronize() serializes everything
// on this device; cudaStreamSynchronize(stream) would be the narrower wait.
CUDA(cudaDeviceSynchronize());
_Pragma("omp barrier");
}
// Restore the cuBLAS handle to the default stream before destroying ours.
CUBLAS(cublasSetStream(cbh_p[n],NULL));
CUDA(cudaStreamDestroy(stream));
}
I have now tested this code on two completely separate platforms (different GPUs, different computer), so I can say with confidence that it is not hardware-related. So either (a) I’m doing something wrong, or (b) there is a bug.