Device memory access bug?

Hi all,

I have been struggling with a strange device memory access problem for a couple of days. Basically, if I write a value to a piece of device memory and read it back immediately, there is a big chance that the readback is not equal to the value just written. The complete verification code is as follows,

kernel.cu

#include <iostream>

__global__ void 

  kernel_verify( double * a

       , int    * b

       , double * c

       , int      npart

       , int    * counter

       )

{

  int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;

  if( i>=npart ) return;

int pbase = i*7;

  int ibase = i*3;

double px, py, pz;

  double ox, oy, oz;

  int    ix, iy, iz;

px = a[pbase]; py = a[pbase+2]; pz = a[pbase+4];

double temp;

temp = (px-0.0)/0.01 + 0.5;

  ix = floor(temp);

  ox = temp-ix;

temp = (py-0.0)/0.01 + 0.5;

  iy = floor(temp);

  oy = temp-iy;

temp = (pz-0.0)/0.01 + 0.5;

  iz = floor(temp);

  oz = temp-iz;

b[ibase+0] = ix; b[ibase+1] = iy; b[ibase+2] = iz;

  c[ibase+0] = ox; c[ibase+1] = oy; c[ibase+2] = oz;

int iix, iiy, iiz;

  double oox, ooy, ooz;

iix = b[ibase]; iiy = b[ibase+1]; iiz = b[ibase+2];

  oox = c[ibase]; ooy = c[ibase+1]; ooz = c[ibase+2];

if( iix!=ix || iiy!=iy || iiz!=iz ) 

    atomicAdd(counter, 1);

if( fabs(oox-ox)>0.001 || fabs(ooy-oy)>0.001 || fabs(ooz-oz)>0.001) 

    atomicAdd(counter+1, 1);

}

void 

  cuda_verify( FLOAT * a

          , int * b

          , FLOAT * c

          , int npart

          )

{

dim3 threads(256);

  dim3 blocks(npart/65536, 256);

int hc[2] = {0,0};

  int * dc;

  cudaMalloc( (void **)&dc, sizeof(int)*2 );

  cudaMemcpy( dc, hc, sizeof(int)*2, cudaMemcpyHostToDevice );

k_verify <<< blocks, threads >>> ( a, b, c, npart, dc );

cudaMemcpy( hc, dc, sizeof(int)*2, cudaMemcpyDeviceToHost );

  cudaFree( dc );

  std::cout << "int    counter=" << hc[0] << "\n";

  std::cout << "double counter=" << hc[1] << "\n";

}

And the kernel.cc file

#include <cuda.h>

#include <cutil_inline.h>

extern void 

  cuda_verify( double * a

          , int * b

          , double * c

          , int      npart

          );

int main()

{

  // GPU device count

  int gpucount;

  cudaGetDeviceCount(&gpucount);

// init device  

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

// verify

  double * da;

  int * db;

  double * dc;

  int npart = 20971520;

cudaMalloc( (void **)&da, sizeof(double) * npart * 7 );

  cudaMalloc( (void **)&db, sizeof(int)   * npart * 3 );

  cudaMalloc( (void **)&dc, sizeof(double) * npart * 3 );

double * ha = (double*)malloc(sizeof(double) * npart * 7);

  for(int i=0; i<npart*7; ++i)   ha[i] = 1.0*i/npart;

cudaMemcpy(da, ha, sizeof(double)*npart*7, cudaMemcpyHostToDevice);

cuda_verify( da, db, dc, npart );

cudaFree(da);

  cudaFree(db);

  cudaFree(dc);

  free(ha);

}

More specifically, in the middle of kernel, if I write double array ( double * c ) first, then the int array ( int * b ),

c[ibase+0] = ox; c[ibase+1] = oy; c[ibase+2] = oz;

  b[ibase+0] = ix; b[ibase+1] = iy; b[ibase+2] = iz;

it has lots of inconsistency in the double array and none in the int array. Vice versa. If both array b and c are of the same type, then the readback is always consistent with the write value.

Does anyone have any idea about what's going on here? Thanks!

Hi all,

I have been struggling with a strange device memory access problem for a couple of days. Basically, if I write a value to a piece of device memory and read it back immediately, there is a big chance that the readback is not equal to the value just written. The complete verification code is as follows,

kernel.cu

#include <iostream>

__global__ void 

  kernel_verify( double * a

       , int    * b

       , double * c

       , int      npart

       , int    * counter

       )

{

  int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;

  if( i>=npart ) return;

int pbase = i*7;

  int ibase = i*3;

double px, py, pz;

  double ox, oy, oz;

  int    ix, iy, iz;

px = a[pbase]; py = a[pbase+2]; pz = a[pbase+4];

double temp;

temp = (px-0.0)/0.01 + 0.5;

  ix = floor(temp);

  ox = temp-ix;

temp = (py-0.0)/0.01 + 0.5;

  iy = floor(temp);

  oy = temp-iy;

temp = (pz-0.0)/0.01 + 0.5;

  iz = floor(temp);

  oz = temp-iz;

b[ibase+0] = ix; b[ibase+1] = iy; b[ibase+2] = iz;

  c[ibase+0] = ox; c[ibase+1] = oy; c[ibase+2] = oz;

int iix, iiy, iiz;

  double oox, ooy, ooz;

iix = b[ibase]; iiy = b[ibase+1]; iiz = b[ibase+2];

  oox = c[ibase]; ooy = c[ibase+1]; ooz = c[ibase+2];

if( iix!=ix || iiy!=iy || iiz!=iz ) 

    atomicAdd(counter, 1);

if( fabs(oox-ox)>0.001 || fabs(ooy-oy)>0.001 || fabs(ooz-oz)>0.001) 

    atomicAdd(counter+1, 1);

}

void 

  cuda_verify( FLOAT * a

          , int * b

          , FLOAT * c

          , int npart

          )

{

dim3 threads(256);

  dim3 blocks(npart/65536, 256);

int hc[2] = {0,0};

  int * dc;

  cudaMalloc( (void **)&dc, sizeof(int)*2 );

  cudaMemcpy( dc, hc, sizeof(int)*2, cudaMemcpyHostToDevice );

k_verify <<< blocks, threads >>> ( a, b, c, npart, dc );

cudaMemcpy( hc, dc, sizeof(int)*2, cudaMemcpyDeviceToHost );

  cudaFree( dc );

  std::cout << "int    counter=" << hc[0] << "\n";

  std::cout << "double counter=" << hc[1] << "\n";

}

And the kernel.cc file

#include <cuda.h>

#include <cutil_inline.h>

extern void 

  cuda_verify( double * a

          , int * b

          , double * c

          , int      npart

          );

int main()

{

  // GPU device count

  int gpucount;

  cudaGetDeviceCount(&gpucount);

// init device  

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

// verify

  double * da;

  int * db;

  double * dc;

  int npart = 20971520;

cudaMalloc( (void **)&da, sizeof(double) * npart * 7 );

  cudaMalloc( (void **)&db, sizeof(int)   * npart * 3 );

  cudaMalloc( (void **)&dc, sizeof(double) * npart * 3 );

double * ha = (double*)malloc(sizeof(double) * npart * 7);

  for(int i=0; i<npart*7; ++i)   ha[i] = 1.0*i/npart;

cudaMemcpy(da, ha, sizeof(double)*npart*7, cudaMemcpyHostToDevice);

cuda_verify( da, db, dc, npart );

cudaFree(da);

  cudaFree(db);

  cudaFree(dc);

  free(ha);

}

More specifically, in the middle of kernel, if I write double array ( double * c ) first, then the int array ( int * b ),

c[ibase+0] = ox; c[ibase+1] = oy; c[ibase+2] = oz;

  b[ibase+0] = ix; b[ibase+1] = iy; b[ibase+2] = iz;

it has lots of inconsistency in the double array and none in the int array. Vice versa. If both array b and c are of the same type, then the readback is always consistent with the write value.

Does anyone have any idea about what's going on here? Thanks!

Without having deeply looked into your code but having read your description… I think I know what’s going on…

Memory/Threads need time before updates to memory becomes visible to all.

So if a thread just wrote to some piece of memory, and another thread reads that memory then the write might not yet have been completed, or the update might not yet be in thread local cache or whatever.

So this is probably what Synchronize functions are for… the synchronize function makes sure that all threads within a warp (or was it thread of block) see the same memory contents.

So you should probably read the programming guide about “read after write” and that sort of thing.

There are also large synchronization functions to synchronize entire blocks/grids or so… or even kernels… like thread fences system fences something like that.

Also the reason why double arrays might have more problems is because they are 2x4 bytes. It could be that memory transactions happen 4 bytes at a time.

So a double might require two memory transactions. So there is a possibility that perhaps only one of the two happened; this could explain the inconsistencies within the doubles themselves ?!?

Are you seeing the values of the doubles being corrupted/weird ? Or are some doubles old and some new ?

Without having deeply looked into your code but having read your description… I think I know what’s going on…

Memory/Threads need time before updates to memory becomes visible to all.

So if a thread just wrote to some piece of memory, and another thread reads that memory then the write might not yet have been completed, or the update might not yet be in thread local cache or whatever.

So this is probably what Synchronize functions are for… the synchronize function makes sure that all threads within a warp (or was it thread of block) see the same memory contents.

So you should probably read the programming guide about “read after write” and that sort of thing.

There are also large synchronization functions to synchronize entire blocks/grids or so… or even kernels… like thread fences system fences something like that.

Also the reason why double arrays might have more problems is because they are 2x4 bytes. It could be that memory transactions happen 4 bytes at a time.

So a double might require two memory transactions. So there is a possibility that perhaps only one of the two happened; this could explain the inconsistencies within the doubles themselves ?!?

Are you seeing the values of the doubles being corrupted/weird ? Or are some doubles old and some new ?

Thank you very much for the reply! Memory visibility and synchronization between threads looks to be a very possible explanation of it, but I tried with __syncthreads() and still no luck so far…

This is a simplified version of the kernel, but the problem is still the same.

__global__ void k_verify( int * b , double * c , int * counter )

{

  int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;

c[i] = 0.1; __syncthreads();

  b[i] = 1;   __syncthreads();

int iix = b[i];

  double oox = c[i];

if( iix!=1 )              atomicAdd(counter, 1);

  if( fabs(oox-0.1)>0.001 ) atomicAdd(counter+1, 1);

}

You may see that each thread is writing/reading its own piece of global memory and has no interference with other threads at all. Now with different ways of writing the global memory (line 5 and 6 in the code snippet), got the following results:

case 1:

c[i] = 0.1; b[i] = 1;

result:

corruptions in int array: 0

corruptions in double array: 7612

case 2:

b[i] = 1; c[i] = 0.1;

result:

corruptions in int array: 11904

corruptions in double array: 0

case 3:

b[i] = 1;   __syncthreads();

c[i] = 0.1; __syncthreads();

result:

corruptions in int array: 10736

corruptions in double array: 5234

To me it looks like totally nondeterministic behavior that might possibly re-format my harddrive…

Thank you very much for the reply! Memory visibility and synchronization between threads looks to be a very possible explanation of it, but I tried with __syncthreads() and still no luck so far…

This is a simplified version of the kernel, but the problem is still the same.

__global__ void k_verify( int * b , double * c , int * counter )

{

  int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;

c[i] = 0.1; __syncthreads();

  b[i] = 1;   __syncthreads();

int iix = b[i];

  double oox = c[i];

if( iix!=1 )              atomicAdd(counter, 1);

  if( fabs(oox-0.1)>0.001 ) atomicAdd(counter+1, 1);

}

You may see that each thread is writing/reading its own piece of global memory and has no interference with other threads at all. Now with different ways of writing the global memory (line 5 and 6 in the code snippet), got the following results:

case 1:

c[i] = 0.1; b[i] = 1;

result:

corruptions in int array: 0

corruptions in double array: 7612

case 2:

b[i] = 1; c[i] = 0.1;

result:

corruptions in int array: 11904

corruptions in double array: 0

case 3:

b[i] = 1;   __syncthreads();

c[i] = 0.1; __syncthreads();

result:

corruptions in int array: 10736

corruptions in double array: 5234

To me it looks like totally nondeterministic behavior that might possibly re-format my harddrive…

The obvious explanation would be that you write to [font=“Courier New”]b[ibase+0][/font], [font=“Courier New”]b[ibase+1][/font], and [font=“Courier New”]b[ibase+2][/font], but read back from [font=“Courier New”]c[ibase+3][/font], [font=“Courier New”]c[ibase+4][/font], and [font=“Courier New”]c[ibase+5][/font]…

The obvious explanation would be that you write to [font=“Courier New”]b[ibase+0][/font], [font=“Courier New”]b[ibase+1][/font], and [font=“Courier New”]b[ibase+2][/font], but read back from [font=“Courier New”]c[ibase+3][/font], [font=“Courier New”]c[ibase+4][/font], and [font=“Courier New”]c[ibase+5][/font]…

Sorry that was my typo, and the code was meant to read from b[ibase+0], b[ibase+1], b[ibase+2]. I have fixed that in my post

Sorry that was my typo, and the code was meant to read from b[ibase+0], b[ibase+1], b[ibase+2]. I have fixed that in my post

Please always check error codes. What device are you running this on? Unless you run it on a big Tesla card, the cudaMalloc()s are going to fail due to insufficient memory.

Can you post the exact code you are running? This code would not compile as the kernel is declared as kernel_verify() but called as k_verify().

Please always check error codes. What device are you running this on? Unless you run it on a big Tesla card, the cudaMalloc()s are going to fail due to insufficient memory.

Can you post the exact code you are running? This code would not compile as the kernel is declared as kernel_verify() but called as k_verify().

This index calculation seems suspicious to me in combination with how you launch the kernel.

If for some reason the x or y or perhaps even the threadidx.x overlaps with other threads then the same i would be calculated, which would lead to conflicts.

Since some of them are then writing to the same index it’s undefined which thread will win. The syncthread call will not serialize threads who try to access the same memory cells.

So the first thing you could/should do is make sure that each i is unique.

This is probably easily verified by creating an array like so:

vCount[i] = vCount[i] + 1;

However this needs to be atomic so it would be something like:

atomicAdd( vCount[i], 1 );

Once that’s done examine the vCount array and report back.

All vCount[i] should be 1.

If there are some which are 2 or higher then you have a bug/conflict in the indexing ! ;) :)

This index calculation seems suspicious to me in combination with how you launch the kernel.

If for some reason the x or y or perhaps even the threadidx.x overlaps with other threads then the same i would be calculated, which would lead to conflicts.

Since some of them are then writing to the same index it’s undefined which thread will win. The syncthread call will not serialize threads who try to access the same memory cells.

So the first thing you could/should do is make sure that each i is unique.

This is probably easily verified by creating an array like so:

vCount[i] = vCount[i] + 1;

However this needs to be atomic so it would be something like:

atomicAdd( vCount[i], 1 );

Once that’s done examine the vCount array and report back.

All vCount[i] should be 1.

If there are some which are 2 or higher then you have a bug/conflict in the indexing ! ;) :)

It is a Tesla card with 4GB of device memory. I did check the error code and the cudaMalloc() succeeded with returning of cudaSUCCESS. Following grabbed from ‘deviceQuery’

Device 1: “Tesla T10 Processor”

CUDA Driver Version: 3.10

CUDA Runtime Version: 3.0

CUDA Capability Major revision number: 1

CUDA Capability Minor revision number: 3

Total amount of global memory: 4294770688 bytes

Number of multiprocessors: 30

Number of cores: 240

Total amount of constant memory: 65536 bytes

Total amount of shared memory per block: 16384 bytes

Total number of registers available per block: 16384

Warp size: 32

Maximum number of threads per block: 512

Maximum sizes of each dimension of a block: 512 x 512 x 64

Maximum sizes of each dimension of a grid: 65535 x 65535 x 1

Maximum memory pitch: 2147483647 bytes

Texture alignment: 256 bytes

Clock rate: 1.30 GHz

Concurrent copy and execution: Yes

Run time limit on kernels: No

Integrated: No

Support host page-locked memory mapping: Yes

Compute mode: Default (multiple host threads can use this device simultaneously)

Also the complete verification code (simplified version). You may try smaller number of “BLOCKSIZE” if cudaMalloc() returns with error.

kernel.cu:

#include <iostream>

#define BLOCKSIZE 256

__global__ void 

  kernel_verify( int * b , double * c , int * counter )

{

  int i = (blockIdx.x*BLOCKSIZE*BLOCKSIZE) + (blockIdx.y*BLOCKSIZE) + threadIdx.x;

b[i] = 1;   //__syncthreads();

  c[i] = 2.0; //__syncthreads();

int iix = b[i];

  double oox = c[i];

if( iix!=1 )              atomicAdd(counter, 1);

  if( fabs(oox-2.0)>0.001 ) atomicAdd(counter+1, 1);

}

void 

  cuda_verify( )

{

  int * b;

  double * c;

  int npart = BLOCKSIZE*BLOCKSIZE*BLOCKSIZE+1;

int err;

  err = cudaMalloc( (void **)&b, sizeof(int)    * npart );

  err = cudaMalloc( (void **)&c, sizeof(double) * npart );

  if(err) std::cout<<"error\n";

dim3 threads(BLOCKSIZE);

  dim3 blocks(BLOCKSIZE, BLOCKSIZE);

int hc[2] = {0,0};

  int * dc;

  cudaMalloc( (void **)&dc, sizeof(int)*2 );

  cudaMemcpy( dc, hc, sizeof(int)*2, cudaMemcpyHostToDevice );

kernel_verify <<< blocks, threads >>> ( b, c, dc );

cudaMemcpy( hc, dc, sizeof(int)*2, cudaMemcpyDeviceToHost );

std::cout << "corruptions in int array: " << hc[0] << "\n";

  std::cout << "corruptions in double array: " << hc[1] << "\n";

cudaFree( dc );

  cudaFree( b );

  cudaFree( c );

}

kernel.cc:

#include <cuda.h>

#include <cutil_inline.h>

extern void cuda_verify( );

int main()

{

  // GPU device count

  int gpucount;

  cudaGetDeviceCount(&gpucount);

// init device  

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

// verification

  cuda_verify( );

}

Note that for building the code I added “-arch=sm_13” to enable double precision in the kernel.

Thanks!

It is a Tesla card with 4GB of device memory. I did check the error code and the cudaMalloc() succeeded with returning of cudaSUCCESS. Following grabbed from ‘deviceQuery’

Device 1: “Tesla T10 Processor”

CUDA Driver Version: 3.10

CUDA Runtime Version: 3.0

CUDA Capability Major revision number: 1

CUDA Capability Minor revision number: 3

Total amount of global memory: 4294770688 bytes

Number of multiprocessors: 30

Number of cores: 240

Total amount of constant memory: 65536 bytes

Total amount of shared memory per block: 16384 bytes

Total number of registers available per block: 16384

Warp size: 32

Maximum number of threads per block: 512

Maximum sizes of each dimension of a block: 512 x 512 x 64

Maximum sizes of each dimension of a grid: 65535 x 65535 x 1

Maximum memory pitch: 2147483647 bytes

Texture alignment: 256 bytes

Clock rate: 1.30 GHz

Concurrent copy and execution: Yes

Run time limit on kernels: No

Integrated: No

Support host page-locked memory mapping: Yes

Compute mode: Default (multiple host threads can use this device simultaneously)

Also the complete verification code (simplified version). You may try smaller number of “BLOCKSIZE” if cudaMalloc() returns with error.

kernel.cu:

#include <iostream>

#define BLOCKSIZE 256

__global__ void 

  kernel_verify( int * b , double * c , int * counter )

{

  int i = (blockIdx.x*BLOCKSIZE*BLOCKSIZE) + (blockIdx.y*BLOCKSIZE) + threadIdx.x;

b[i] = 1;   //__syncthreads();

  c[i] = 2.0; //__syncthreads();

int iix = b[i];

  double oox = c[i];

if( iix!=1 )              atomicAdd(counter, 1);

  if( fabs(oox-2.0)>0.001 ) atomicAdd(counter+1, 1);

}

void 

  cuda_verify( )

{

  int * b;

  double * c;

  int npart = BLOCKSIZE*BLOCKSIZE*BLOCKSIZE+1;

int err;

  err = cudaMalloc( (void **)&b, sizeof(int)    * npart );

  err = cudaMalloc( (void **)&c, sizeof(double) * npart );

  if(err) std::cout<<"error\n";

dim3 threads(BLOCKSIZE);

  dim3 blocks(BLOCKSIZE, BLOCKSIZE);

int hc[2] = {0,0};

  int * dc;

  cudaMalloc( (void **)&dc, sizeof(int)*2 );

  cudaMemcpy( dc, hc, sizeof(int)*2, cudaMemcpyHostToDevice );

kernel_verify <<< blocks, threads >>> ( b, c, dc );

cudaMemcpy( hc, dc, sizeof(int)*2, cudaMemcpyDeviceToHost );

std::cout << "corruptions in int array: " << hc[0] << "\n";

  std::cout << "corruptions in double array: " << hc[1] << "\n";

cudaFree( dc );

  cudaFree( b );

  cudaFree( c );

}

kernel.cc:

#include <cuda.h>

#include <cutil_inline.h>

extern void cuda_verify( );

int main()

{

  // GPU device count

  int gpucount;

  cudaGetDeviceCount(&gpucount);

// init device  

  cudaSetDevice( cutGetMaxGflopsDeviceId() );

// verification

  cuda_verify( );

}

Note that for building the code I added “-arch=sm_13” to enable double precision in the kernel.

Thanks!

int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;

is equivalent to

int i = (blockIdx.x*256*256) + (blockIdx.y*256) + threadIdx.x;

Since I was starting the kernel with 256*256 thread blocks, and each block has 256 threads, this should give me a unique index.

To make sure that I was not making some stupid mistake so I tried with your method and confirmed that there’s no conflicts in the index.

Now I’m kinda screwed. It’s such a simple kernel (see my last reply) but just won’t give me the correct answer :wallbash:

int i = (blockIdx.x<<16) + (blockIdx.y<<8) + threadIdx.x;

is equivalent to

int i = (blockIdx.x*256*256) + (blockIdx.y*256) + threadIdx.x;

Since I was starting the kernel with 256*256 thread blocks, and each block has 256 threads, this should give me a unique index.

To make sure that I was not making some stupid mistake so I tried with your method and confirmed that there’s no conflicts in the index.

Now I’m kinda screwed. It’s such a simple kernel (see my last reply) but just won’t give me the correct answer :wallbash:

Ok a few things I would do differently just in case:

int i;
int iix;
double oox;

i = (blockIdx.z*BLOCKSIZE*BLOCKSIZE) + (blockIdx.y*BLOCKSIZE) + threadIdx.x;

b[i] = 1;
c[i] = 2.0;

__syncthreads();

iix = b[i];
oox = c[i];

dim3 threads(BLOCKSIZE,0,0); // is this x,y,z ? otherwise swap around
dim3 blocks(0,BLOCKSIZE, BLOCKSIZE); // is this x,y,z ? otherwise swap around

Another explanation could be that your device does not handle integers properly and that they are converted to floating point and that the comparison is slightly off ?!?

Ok a few things I would do differently just in case:

int i;
int iix;
double oox;

i = (blockIdx.z*BLOCKSIZE*BLOCKSIZE) + (blockIdx.y*BLOCKSIZE) + threadIdx.x;

b[i] = 1;
c[i] = 2.0;

__syncthreads();

iix = b[i];
oox = c[i];

dim3 threads(BLOCKSIZE,0,0); // is this x,y,z ? otherwise swap around
dim3 blocks(0,BLOCKSIZE, BLOCKSIZE); // is this x,y,z ? otherwise swap around

Another explanation could be that your device does not handle integers properly and that they are converted to floating point and that the comparison is slightly off ?!?