Hi everyone,
I have a piece of code that I use to compute a pairwise Euclidean distance matrix between two lists of atom coordinates. It works pretty well, but when profiling with nvprof I realized that most of the time is spent in the cudaMemcpy from device to host. I don't really see how to improve this, or whether it is possible at all.
I tried to use the Thrust library, but I'm not really familiar with it.
Also, using cudaHostRegister() makes things worse, around 620 ms for the same array (compared to 220 ms with the code below).
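For reference, this is roughly what I mean by registering the host buffer; it's just a minimal sketch with placeholder names (copyOutPinned, d_out, h_out, n_elems), not the exact code from my wrapper:

#include <cuda_runtime.h>

// Minimal sketch of the cudaHostRegister() variant I tried (placeholder names)
void copyOutPinned(const float *d_out, float *h_out, size_t n_elems)
{
    size_t bytes = n_elems * sizeof(float);
    // Pin the existing host buffer so the DtoH copy can go through DMA directly
    cudaHostRegister(h_out, bytes, cudaHostRegisterDefault);
    cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);
    cudaHostUnregister(h_out);
}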
The CUDA code is called from Python using a wrapper function compiled with Cython.
The profiling below was executed with the same array of 6406 atoms, giving an out array of 6406*6406 = 41036836 entries.
Would you have any suggestions or advice?
Thanks in advance for your help.
Here is the code:
#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#define BLOCK_SIZE 32
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__
void d_getDistances(float *sel1, int sel1_size, float *sel2, int sel2_size, float *out,
                    float *cellDims)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = blockIdx.x * BLOCK_SIZE + tx;
    int col = blockIdx.y * BLOCK_SIZE + ty;

    float cD_x = cellDims[0];
    float cD_y = cellDims[1];
    float cD_z = cellDims[2];

    if(row < sel1_size && col < sel2_size)
    {
        float dist_x = sel1[3*row]     - sel2[3*col];
        float dist_y = sel1[3*row + 1] - sel2[3*col + 1];
        float dist_z = sel1[3*row + 2] - sel2[3*col + 2];

        // Apply PBC conditions
        dist_x = dist_x - cD_x * roundf( dist_x / cD_x );
        dist_y = dist_y - cD_y * roundf( dist_y / cD_y );
        dist_z = dist_z - cD_z * roundf( dist_z / cD_z );

        float dist = sqrtf(dist_x*dist_x + dist_y*dist_y + dist_z*dist_z);
        out[row*sel2_size + col] = dist;
    }
}
void cu_getDistances_wrapper(float *sel1, int sel1_size, float *sel2,
                             int sel2_size, float *out, float *cellDims, int sameSel)
{
    // Copying sel1 matrix to GPU memory
    float *cu_sel1;
    size_t size = 3 * sel1_size * sizeof(float);
    gpuErrchk( cudaMalloc(&cu_sel1, size) );
    gpuErrchk( cudaMemcpy(cu_sel1, sel1, size, cudaMemcpyHostToDevice) );

    // Copying sel2 matrix to GPU memory (reuse sel1 when both selections are the same)
    float *cu_sel2;
    if(sameSel == 0)
    {
        size = 3 * sel2_size * sizeof(float);
        gpuErrchk( cudaMalloc(&cu_sel2, size) );
        gpuErrchk( cudaMemcpy(cu_sel2, sel2, size, cudaMemcpyHostToDevice) );
    }
    else
        cu_sel2 = cu_sel1;

    // Copying cellDims vector to GPU memory
    float *cu_cellDims;
    size = 3 * sizeof(float);
    gpuErrchk( cudaMalloc(&cu_cellDims, size) );
    gpuErrchk( cudaMemcpy(cu_cellDims, cellDims, size, cudaMemcpyHostToDevice) );

    // Allocating the output matrix on GPU memory
    float *cu_out;
    size = (size_t)sel1_size * sel2_size * sizeof(float);
    gpuErrchk( cudaMalloc(&cu_out, size) );
    gpuErrchk( cudaMemset(cu_out, 0, size) );

    dim3 dimBlock( BLOCK_SIZE, BLOCK_SIZE, 1 );
    dim3 dimGrid( ceil( (float)sel1_size/BLOCK_SIZE ), ceil( (float)sel2_size/BLOCK_SIZE ), 1 );
    d_getDistances<<<dimGrid, dimBlock>>>(cu_sel1, sel1_size, cu_sel2, sel2_size, cu_out, cu_cellDims);
    gpuErrchk( cudaDeviceSynchronize() );

    // Copying result back into host memory
    size = (size_t)sel1_size * sel2_size;
    thrust::device_ptr<float> out_ptr(cu_out);
    thrust::copy(out_ptr, out_ptr + size, out);

    cudaFree(cu_sel1);
    if(sameSel == 0)   // cu_sel2 aliases cu_sel1 otherwise; avoid freeing it twice
        cudaFree(cu_sel2);
    cudaFree(cu_cellDims);
    cudaFree(cu_out);
}
And here is the profiler output (5 calls of the function):
==11468== Profiling application: c:\users\kevin pounot\appdata\local\programs\python\python37\python.exe C:\Users\Kevin Pounot\AppData\Local\Programs\Python\Python37\Scripts\ipython.exe
==11468== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 77.86% 682.96ms 5 136.59ms 134.04ms 140.54ms [CUDA memcpy DtoH]
22.11% 193.89ms 5 38.778ms 38.418ms 40.094ms d_getDistances(float*, int, float*, int, float*, float*)
0.03% 256.67us 10 25.667us 1.7280us 58.624us [CUDA memcpy HtoD]
0.00% 4.8960us 5 979ns 928ns 1.0240us [CUDA memset]
API calls: 58.26% 684.79ms 5 136.96ms 134.42ms 140.87ms cudaMemcpyAsync
21.68% 254.79ms 5 50.957ms 50.604ms 52.266ms cudaDeviceSynchronize
19.40% 228.07ms 15 15.204ms 15.000us 157.93ms cudaMalloc
0.49% 5.7096ms 20 285.48us 1.7000us 939.40us cudaFree
0.10% 1.1888ms 10 118.88us 54.700us 199.50us cudaMemcpy
0.03% 378.00us 97 3.8960us 100ns 227.20us cuDeviceGetAttribute
0.02% 283.30us 5 56.660us 47.900us 71.000us cudaMemset
0.01% 98.400us 5 19.680us 16.900us 23.600us cudaLaunchKernel
0.01% 72.000us 5 14.400us 12.900us 17.800us cudaStreamSynchronize
0.00% 27.800us 1 27.800us 27.800us 27.800us cuDeviceTotalMem
0.00% 10.100us 1 10.100us 10.100us 10.100us cuDeviceGetPCIBusId
0.00% 4.6000us 3 1.5330us 300ns 3.3000us cuDeviceGetCount
0.00% 1.5000us 2 750ns 400ns 1.1000us cuDeviceGet
0.00% 1.0000us 1 1.0000us 1.0000us 1.0000us cuDeviceGetName
0.00% 500ns 1 500ns 500ns 500ns cuDeviceGetLuid
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetUuid