Hi all,
I have been struggling with a strange device memory access problem for a couple of days. Basically, if I write a value to a piece of device memory and read it back immediately, there is a high chance that the value read back is not equal to the value just written. The complete verification code is as follows:
kernel.cu
#include <iostream>
__global__ void
kernel_verify( double * a
, int * b
, double * c
, int npart
, int * counter
)
{
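// Flat particle index: with blocks(npart/65536, 256) and 256 threads per block,
// blockIdx.x covers a chunk of 65536 particles and blockIdx.y a sub-chunk of 256.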
int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;
if( i>=npart ) return;
int pbase = i*7;
int ibase = i*3;
double px, py, pz;
double ox, oy, oz;
int ix, iy, iz;
px = a[pbase]; py = a[pbase+2]; pz = a[pbase+4];
double temp;
temp = (px-0.0)/0.01 + 0.5;
ix = floor(temp);
ox = temp-ix;
temp = (py-0.0)/0.01 + 0.5;
iy = floor(temp);
oy = temp-iy;
temp = (pz-0.0)/0.01 + 0.5;
iz = floor(temp);
oz = temp-iz;
b[ibase+0] = ix; b[ibase+1] = iy; b[ibase+2] = iz;
c[ibase+0] = ox; c[ibase+1] = oy; c[ibase+2] = oz;
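// Immediately read the values just written back from global memory and compare below.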
int iix, iiy, iiz;
double oox, ooy, ooz;
iix = b[ibase]; iiy = b[ibase+1]; iiz = b[ibase+2];
oox = c[ibase]; ooy = c[ibase+1]; ooz = c[ibase+2];
if( iix!=ix || iiy!=iy || iiz!=iz )
atomicAdd(counter, 1);
if( fabs(oox-ox)>0.001 || fabs(ooy-oy)>0.001 || fabs(ooz-oz)>0.001)
atomicAdd(counter+1, 1);
}
void
cuda_verify( double * a
, int * b
, double * c
, int npart
)
{
dim3 threads(256);
dim3 blocks(npart/65536, 256);
int hc[2] = {0,0};
int * dc;
cudaMalloc( (void **)&dc, sizeof(int)*2 );
cudaMemcpy( dc, hc, sizeof(int)*2, cudaMemcpyHostToDevice );
kernel_verify <<< blocks, threads >>> ( a, b, c, npart, dc );
cudaMemcpy( hc, dc, sizeof(int)*2, cudaMemcpyDeviceToHost );
cudaFree( dc );
std::cout << "int counter=" << hc[0] << "\n";
std::cout << "double counter=" << hc[1] << "\n";
}
And here is the kernel.cc file:
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
extern void
cuda_verify( double * a
, int * b
, double * c
, int npart
);
int main()
{
// GPU device count
int gpucount;
cudaGetDeviceCount(&gpucount);
// init device
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// verify
double * da;
int * db;
double * dc;
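// 20971520 = 320 * 65536, so npart maps exactly onto the grid used in cuda_verify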
int npart = 20971520;
cudaMalloc( (void **)&da, sizeof(double) * npart * 7 );
cudaMalloc( (void **)&db, sizeof(int) * npart * 3 );
cudaMalloc( (void **)&dc, sizeof(double) * npart * 3 );
double * ha = (double*)malloc(sizeof(double) * npart * 7);
for(int i=0; i<npart*7; ++i) ha[i] = 1.0*i/npart;
cudaMemcpy(da, ha, sizeof(double)*npart*7, cudaMemcpyHostToDevice);
cuda_verify( da, db, dc, npart );
cudaFree(da);
cudaFree(db);
cudaFree(dc);
free(ha);
}
More specifically, in the middle of the kernel, if I write the double array ( double * c ) first and then the int array ( int * b ),
c[ibase+0] = ox; c[ibase+1] = oy; c[ibase+2] = oz;
b[ibase+0] = ix; b[ibase+1] = iy; b[ibase+2] = iz;
then there are lots of inconsistencies in the double array and none in the int array, and vice versa if I swap the write order. If arrays b and c are both of the same type, the readback is always consistent with the written value.
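For reference, a minimal self-contained sketch of just that pattern (the kernel name k_order_repro, the test values, and the flat 1-D launch are placeholders for illustration, not my real code) would look like this: each thread writes the double array first, then the int array, then immediately reads both back and counts mismatches.
#include <iostream>
#include <cuda_runtime.h>
__global__ void
k_order_repro( double * c, int * b, int n, int * counter )
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if( i>=n ) return;
double ox = 0.25*i; // arbitrary test values
int ix = i%17;
c[i] = ox; // write the double array first
b[i] = ix; // then the int array
// read both back immediately in the same thread and count mismatches
if( b[i]!=ix || c[i]!=ox )
atomicAdd(counter, 1);
}
int main()
{
int n = 1<<20;
double * c; int * b; int * counter; int h = 0;
cudaMalloc( (void **)&c, sizeof(double)*n );
cudaMalloc( (void **)&b, sizeof(int)*n );
cudaMalloc( (void **)&counter, sizeof(int) );
cudaMemcpy( counter, &h, sizeof(int), cudaMemcpyHostToDevice );
k_order_repro <<< (n+255)/256, 256 >>> ( c, b, n, counter );
cudaMemcpy( &h, counter, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << "mismatches=" << h << "\n";
cudaFree(c); cudaFree(b); cudaFree(counter);
return 0;
}
If this small case also reports mismatches on the same setup, it would at least rule out the indexing and the 7-stride layout in the larger kernel.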
Does anyone have any idea what is going on here? Thanks!