Hi all!
I have a molecular dynamics code that has some portions parallelized with cuda. I am using Ubuntu 14.04 and cuda 7.5 drivers.
One of the kernels counts the number of particles that are separated by a certain distance (basically a frequency histogram). In this kernel I allocate dynamically the array P using new:
__global__ void correl_kernel(const double *pos, double *hist1, double *hist2, const int *Npart, int NC, const double* L, int n_bars, int n_poly, double bar_w){
int n = threadIdx.x + blockDim.x * blockIdx.x;
int Tpart = 0;
for(int l = 0; l < NC; ++l)
Tpart += Npart[l];
if(n > 1 && n < Tpart){
int npp = n_poly; // n_poly = 150
double* P = new double[npp];
P[0] = 1.; //THIS IS LINE 916! (where the error is reported when using lots of particles)
int k = 0;
int Tt = 0;
for(int m = 0; m < NC; ++m){
Tt += Npart[m];
if(n < Tt)
break;
k++;
}
for(int i = 0; i < 2; ++i){
double rij[3];
for(int a = 0; a < 3; ++a){
rij[a] = pos[i*3+a] - pos[n*3+a];
rij[a] -= L[a] * floor(rij[a] / L[a] + 0.5);
}
double RIJsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
double RIJ = sqrt(RIJsq);
int bin = static_cast<int>(floor(RIJ / bar_w));
if(bin < n_bars){
double doti = rij[2] / RIJ;
P[1] = doti;
for(int l = 2; l < npp; ++l)
P[l] = ((2.*(l-1)+1.)*doti*P[l-1] - (l-1)*P[l-2])/l;
for(int l = 0; l < npp; ++l){ //use all polyn.
if(i == 0)
atomicAdd(&hist1[(k-2)*n_poly*n_bars+l*n_bars+bin], P[l]); // l=0, l=1, l=2, ...
else if(i == 1)
atomicAdd(&hist2[(k-2)*n_poly*n_bars+l*n_bars+bin], P[l]);
}
}
}
delete[] P;
}
}
If I understood correctly
new
will work for compute cap. 3.5 and up, and my card is a Tesla K20. In fact it compiles correctly and runs fine in general (memcheck does not give any errors), until I use a large number of particles (>~3000) (which does not modify the size of the dynamic array P, just the Tpart variable) a memory error appears when using cuda-memcheck:
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 8
========= at 0x00000418 in /home/carlos/CPP/mdChaSphere/cuda_ready/cuda/long_range/mean_f/v12/dev_functions.cu:916:correl_kernel(double const *, double*, double*, int const *, int, double const *, int, int, double)
========= by thread (12,0,0) in block (0,0,0)
========= Address 0x00000000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel_ptsz + 0x2c5) [0x1472e5]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcudart.so.7.5 [0x14623]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcudart.so.7.5 (cudaLaunch_ptsz + 0x154) [0x3d134]
========= Host Frame:./md_ch_sphere [0x52ba]
========= Host Frame:./md_ch_sphere [0x8b87]
========= Host Frame:./md_ch_sphere [0x8bec]
========= Host Frame:./md_ch_sphere [0x7e02]
========= Host Frame:./md_ch_sphere [0x3e7f]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./md_ch_sphere [0x51cf]
=========
Can someone please orient me in finding the source of the problem?
Thank you