Mixing volatile writes and atomicAdd()s with shared memory produces unexpected results

This is a follow-up to my replies in this post:

I have isolated an example of the behavior I mentioned observing in my replies.

I use many atomicAdd()s to increment a variable in shared memory and a volatile write to reset it at a random time.

Sometimes the volatile write doesn’t seem to do anything.

This may be legal for C++, as King_Crimson mentioned in the linked post above, but it is not what I expect. E.g., see: When to use volatile with shared CUDA Memory.

A standalone program that illustrates this issue is below, along with its output.
Update: in a reply below, see the final source code with error checking and fixed type cast.

I have a GTX 1070 Ti.
I see similar results with both debug and release builds with Visual Studio on Windows 10.

The problematic kernel is k1(), not k0().

As the program shows, there is a simple fix for the problem,
using an atomicExch() instead of a volatile write.

Still, I would like to better understand what is going on if anyone can offer an explanation.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>


// Experiment parameters.  Delta carries a 1 in each 32-bit half (note the two 1s)
// so the atomicAdd() count shows up mirrored in both halves of the reported value.
const unsigned long long int  InitValue   =  0x1000'0000'0000'0000ull;      // the value that initializes the volatile variable
const unsigned long long int  BigValue    =  0x7000'0000'0000'0000ull;      // the amount written by the one volatile write (or atomicExch())
const unsigned long long int  Delta       =  0x0000'0001'0000'0001ull;      // the amount added by each atomicAdd() -- note the two 1s
const int                     N_AddPerThr =  0x0400;                        // all threads (except one) loop this many times and do one atomicAdd() per loop


// for understanding the ffc00 pattern seen in main()'s bad value reports:
//   # of threads doing atomic adds  *  adds (loops) per thread
//   ------------------------------     -----------------------
//                       (1024 - 1)  *  0x0400                   =  0xffc00

// example ***** non-problematic ***** kernel:
// k0() always leads to "ok value" reports from main()
// k0(): same mix of atomicAdd()s and one plain volatile store, but the target is
// *global* memory (d), and the adder and the writer are in different blocks.
// Per the thread, main() never reports a bad value for this kernel.
__global__ void k0( volatile unsigned long long int *d ){

  // Block 0 hammers the global counter with atomic adds.
  if ( blockIdx.x == 0 ) {
    for ( int i = 0; i < N_AddPerThr; i++)  atomicAdd((unsigned long long int *)d, Delta );
  }

  // One thread of block 19 overwrites the counter with a plain (non-atomic)
  // volatile store, after a clock()-based delay so the store lands at an
  // arbitrary point relative to block 0's atomic adds.
  else if ( blockIdx.x == 19 ) {
    if ( threadIdx.x  ==  0 ) {
//    while ( *d == 0                    );        // don't create a time dependency on an atomicAdd() from another thread
      while ( ( clock() & 0xff0 )  !=  0 );        // wait a random few ns instead

      *d  =  BigValue;
    }
  }
  // All other blocks just burn a comparable amount of time and exit.
  else {
    while ( ( clock() & 0xff0 )  !=  0 );            // wait a random few ns and then exit
  }

}


// for this kernel see:
//   https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory

// lines commented out below do not significantly change main()'s reports, but changing the VolatileWrite argument at the time of launch does

// troublesome kernel:
// k1() often leads to "bad value" reports from main()
// k1() uses d only to report the final value of Victim
// k1(): the troublesome kernel.  A single block mixes many atomicAdd()s on a
// shared-memory counter (Victim) with one plain volatile store from thread 0.
// With VolatileWrite==true the store is sometimes lost (main() reports "bad value");
// with VolatileWrite==false an atomicExch() is used instead and it always lands.
// d is used only to report the final value of Victim back to main().
// I_Launch identifies the launch for the (commented-out) in-kernel printf.
__global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) {

  __shared__  volatile  unsigned long long int      Victim;
//__shared__            unsigned long long int      Victim;
//volatile              unsigned long long int*  VolVictim  =  const_cast<volatile unsigned long long int*>( & Victim );

  if ( blockIdx.x   !=  0 )  return;

  if ( threadIdx.x  ==  0 )      Victim  =  InitValue;
//if ( threadIdx.x  ==  0 )  *VolVictim  =  InitValue;

  __syncthreads();

  if ( threadIdx.x  ==  0 ) {
    while ( ( clock() & 0x03f0 )  !=  0 );        // wait a random few ns

    if ( VolatileWrite ) {
        Victim  =  BigValue;                                         // this does not always have the intended effect, which is to reset the count to BigValue
//  *VolVictim  =  BigValue;
    }
    // NOTE: the atomic needs a pointer lvalue.  "& (unsigned long long) Victim"
    // takes the address of a cast temporary and does not compile; cast away the
    // volatile qualifier on the pointer instead.
    else  atomicExch( const_cast<unsigned long long int*>( & Victim ), BigValue );     // this has the intended effect, and main() reports only "ok value"
  }

  else {
//else if ( threadIdx.x > 31 ) {
    for ( int i = 0; i < N_AddPerThr; i++ )  atomicAdd( const_cast<unsigned long long int*>( & Victim ), Delta );
  }

  __syncthreads();

  if ( threadIdx.x  ==  0 )  *d  =      Victim;   // report result to main()
//if ( threadIdx.x  ==  0 )  *d  =  *VolVictim;

// this produces the same result as the printout in main():
//  if ( threadIdx.x  ==  0 ) {  if  ( Victim  <  BigValue ) printf( "got bad value: %llx on launch %x   <-- from k1()\n", Victim, I_Launch ); }

}


// Driver: three runs (k0, k1 with volatile write, k1 with atomicExch()), each
// launching the kernel up to 0x3e8 times.  After every launch the managed word
// *d is inspected: a value below BigValue means the reset write was lost.
// CUDA errors (including WDDM timeouts on Windows) are now checked explicitly,
// as recommended in the replies below.
int main(){

  unsigned long long int *d;            // single managed word the kernels report through
  cudaError_t  Err  =  cudaMallocManaged(&d, sizeof(*d));
  if ( Err  !=  cudaSuccess ) { std::cout << "cudaMallocManaged failed: " << cudaGetErrorString( Err ) << std::endl;  return 1; }

  for ( int Run   =  0; Run < 3; ++ Run ) {
    if (    Run  ==  0 )  std::cout << "\nStarting run that repeatedly launches k0():" << std::endl;
    if (    Run  ==  1 )  std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl;
    if (    Run  ==  2 )  std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):"   << std::endl;

    int   N_Bad   =  0;      // used to abort after too many bad values
    bool  WasBad  =  false;  // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration

    for ( int i = 0; i <  1'000; ++ i ) {
      if ( N_Bad >= 30 ) { std::cout << "Aborting after 30 bad values in " << std::dec << i << " launches!  ***** " << std::endl;  break; }
      *d = InitValue;
      if ( Run  ==  0 )  k0<<< 20,1024>>>( d    );           // always leads to "ok  value"
      if ( Run  ==  1 )  k1<<<  1,1024>>>( d,  true, i );    // true  --> volatile write  -- often  leads to "bad value"
      if ( Run  ==  2 )  k1<<<  1,1024>>>( d, false, i );    // false --> atomicExch()    -- always leads to "ok  value"

      // Launch-configuration errors surface via cudaGetLastError(); asynchronous
      // execution errors surface at cudaDeviceSynchronize().
      Err  =  cudaGetLastError();
      if ( Err  ==  cudaSuccess )  Err  =  cudaDeviceSynchronize();
      if ( Err  !=  cudaSuccess ) { std::cout << "CUDA error on launch " << std::dec << i << ": " << cudaGetErrorString( Err ) << std::endl;  return 1; }

// report: show all bad values, a sample of good values, and the final value
      std::cout << std::hex;
      if      ( *d  <  BigValue          ) { std::cout << "got bad value: 0x" << *d << " on launch 0x" << i << "  ***** " << std::endl;  ++ N_Bad;  WasBad  =  true; }
      else if (  WasBad  ||
                ( ( i & 0x0ff )  ==  0 ) ) { std::cout << "got ok  value: 0x" << *d << " on launch 0x" << i <<               std::endl;  WasBad  =  false; }  // occasional updates
    }
    std::cout << std::hex;                   std::cout << "final   value: 0x" << *d << std::endl;
  }

  cudaFree(d);
}

Output with release build, which shows:

  • one successful run of k0() launches
  • one problematic run of k1() launches (volatile write)
  • one successful run of k1() launches (atomicExch())

A 1 in the MS nybble is a result of initialization.
A 7 in the MS nybble indicates the volatile write succeeded.
The patterns in the non-MS nybbles are the result of atomicAdd()s.
See the source code for an explanation of the 0xffc00 pattern seen in bad values.

Starting run that repeatedly launches k0():
got ok  value: 0x700c4340000c4340 on launch 0x0
got ok  value: 0x7000000000000000 on launch 0x100
got ok  value: 0x7006b7e00006b7e0 on launch 0x200
got ok  value: 0x7000000000000000 on launch 0x300
final   value: 0x700c4340000c4340

Starting run that repeatedly launches k1() with volatile write,
(debug build yields only a few good values -- try another run for examples):
got ok  value: 0x700ff480000ff480 on launch 0x0
got bad value: 0x100ffc00000ffc00 on launch 0x1  *****
got bad value: 0x100ffc00000ffc00 on launch 0x2  *****
got ok  value: 0x700ff6a0000ff6a0 on launch 0x3
got bad value: 0x100ffc00000ffc00 on launch 0x4  *****
got ok  value: 0x700ff7e0000ff7e0 on launch 0x5
got bad value: 0x100ffc00000ffc00 on launch 0x8  *****
got bad value: 0x100ffc00000ffc00 on launch 0x9  *****
got bad value: 0x100ffc00000ffc00 on launch 0xa  *****
got ok  value: 0x700ff5a0000ff5a0 on launch 0xb
got bad value: 0x100ffc00000ffc00 on launch 0xd  *****
got ok  value: 0x700ffa20000ffa20 on launch 0xe
got bad value: 0x100ffc00000ffc00 on launch 0x11  *****
got bad value: 0x100ffc00000ffc00 on launch 0x12  *****
got ok  value: 0x700ff700000ff700 on launch 0x13
got bad value: 0x100ffc00000ffc00 on launch 0x15  *****
got bad value: 0x100ffc00000ffc00 on launch 0x16  *****
got bad value: 0x100ffc00000ffc00 on launch 0x17  *****
got bad value: 0x100ffc00000ffc00 on launch 0x18  *****
got bad value: 0x100ffc00000ffc00 on launch 0x19  *****
got ok  value: 0x700ffa60000ffa60 on launch 0x1a
got bad value: 0x100ffc00000ffc00 on launch 0x1b  *****
got bad value: 0x100ffc00000ffc00 on launch 0x1c  *****
got ok  value: 0x700ff9c0000ff9c0 on launch 0x1d
got bad value: 0x100ffc00000ffc00 on launch 0x1f  *****
got ok  value: 0x700ff9e0000ff9e0 on launch 0x20
got bad value: 0x100ffc00000ffc00 on launch 0x21  *****
got ok  value: 0x700ff900000ff900 on launch 0x22
got bad value: 0x100ffc00000ffc00 on launch 0x23  *****
got bad value: 0x100ffc00000ffc00 on launch 0x24  *****
got bad value: 0x100ffc00000ffc00 on launch 0x25  *****
got bad value: 0x100ffc00000ffc00 on launch 0x26  *****
got bad value: 0x100ffc00000ffc00 on launch 0x27  *****
got bad value: 0x100ffc00000ffc00 on launch 0x28  *****
got bad value: 0x100ffc00000ffc00 on launch 0x29  *****
got bad value: 0x100ffc00000ffc00 on launch 0x2a  *****
got ok  value: 0x700ff780000ff780 on launch 0x2b
got bad value: 0x100ffc00000ffc00 on launch 0x2d  *****
got ok  value: 0x700ffb80000ffb80 on launch 0x2e
got bad value: 0x100ffc00000ffc00 on launch 0x2f  *****
got ok  value: 0x700ffb80000ffb80 on launch 0x30
got bad value: 0x100ffc00000ffc00 on launch 0x31  *****
got ok  value: 0x700ff7c0000ff7c0 on launch 0x32
got bad value: 0x100ffc00000ffc00 on launch 0x35  *****
Aborting after 30 bad values in 54 launches!  *****
final   value: 0x100ffc00000ffc00

Starting run that repeatedly launches k1() with atomicExch(),
(debug build runs very slow):
got ok  value: 0x700ffb80000ffb80 on launch 0x0
got ok  value: 0x700ffb80000ffb80 on launch 0x100
got ok  value: 0x700ffac0000ffac0 on launch 0x200
got ok  value: 0x700ffac0000ffac0 on launch 0x300
final   value: 0x700ff4c0000ff4c0

well, your code doesn’t compile for me, and this certainly looks strange to me also:

The compile error I get is:

t157.cu(70): error: expression must be an lvalue or a function designator
      else atomicExch( & (unsigned long long) Victim, BigValue );
                         ^

I am on linux, although I would not expect device code compilation to be any different (godbolt shows the same issue). I’m on CUDA 12.2, which CUDA version are you on?

Anyway, when I fix the two instances of that issue, and I dummy out the “uninteresting” launches of k0 and k1, I get results that don’t seem to indicate a problem:

# cat t157.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>


const unsigned long long int  InitValue   =  0x1000'0000'0000'0000ull;      // the value that initializes the volatile variable
const unsigned long long int  BigValue    =  0x7000'0000'0000'0000ull;      // the amount written by the one volatile write (or atomicExch())
const unsigned long long int  Delta       =  0x0000'0001'0000'0001ull;      // the amount added by each atomicAdd() -- note the two 1s
const int                     N_AddPerThr =  0x0400;                        // all threads (except one) loop this many times and do one atomicAdd() per loop


// for understanding the ffc00 pattern seen in main()'s bad value reports:
//   # of threads doing atomic adds  *  adds (loops) per thread
//   ------------------------------     -----------------------
//                       (1024 - 1)  *  0x0400                   =  0xffc00

// example ***** non-problematic ***** kernel:
// k0() always leads to "ok value" reports from main()
// k0(): mixes atomicAdd()s and one plain volatile store on *global* memory (d),
// with the adder and the writer in different blocks.  The output shown below
// this listing reports no bad values for this kernel.
__global__ void k0( volatile unsigned long long int *d ){

  // Block 0 hammers the global counter with atomic adds.
  if ( blockIdx.x == 0 ) {
    for ( int i = 0; i < N_AddPerThr; i++)  atomicAdd((unsigned long long int *)d, Delta );
  }

  // One thread of block 19 overwrites the counter with a plain (non-atomic)
  // volatile store after a clock()-based delay.
  else if ( blockIdx.x == 19 ) {
    if ( threadIdx.x  ==  0 ) {
//    while ( *d == 0                    );        // don't create a time dependency on an atomicAdd() from another thread
      while ( ( clock() & 0xff0 )  !=  0 );        // wait a random few ns instead

      *d  =  BigValue;
    }
  }
  // All other blocks just burn a comparable amount of time and exit.
  else {
    while ( ( clock() & 0xff0 )  !=  0 );            // wait a random few ns and then exit
  }

}


// for this kernel see:
//   https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory

// lines commented out below do not significantly change main()'s reports, but changing the VolatileWrite argument at the time of launch does

// troublesome kernel:
// k1() often leads to "bad value" reports from main()
// k1() uses d only to report the final value of Victim
// k1(): single block mixes many atomicAdd()s on a shared-memory counter (Victim)
// with one store from thread 0 — a plain volatile store when VolatileWrite is
// true, an atomicExch() otherwise.  d only reports Victim's final value.
// (This is the version with the compilable pointer casts.)
__global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) {

  // shared-memory counter targeted by both the atomics and the volatile store
  __shared__  volatile  unsigned long long int      Victim;
//__shared__            unsigned long long int      Victim;
//volatile              unsigned long long int*  VolVictim  =  const_cast<volatile unsigned long long int*>( & Victim );

  if ( blockIdx.x   !=  0 )  return;

  if ( threadIdx.x  ==  0 )      Victim  =  InitValue;
//if ( threadIdx.x  ==  0 )  *VolVictim  =  InitValue;

  __syncthreads();

  // Thread 0: delay, then attempt to reset the counter to BigValue.
  if ( threadIdx.x  ==  0 ) {
    while ( ( clock() & 0x03f0 )  !=  0 );        // wait a random few ns

    if ( VolatileWrite ) {
        Victim  =  BigValue;                                         // this does not always have the intended effect, which is to reset the count to BigValue
//  *VolVictim  =  BigValue;
    }
    else  atomicExch( (unsigned long long *) &Victim, BigValue );     // this has the intended effect, and main() reports only "ok value"
  }

  // All other threads: N_AddPerThr atomic adds each, concurrent with thread 0's store.
  else {
//else if ( threadIdx.x > 31 ) {
    for ( int i = 0; i < N_AddPerThr; i++ )  atomicAdd((unsigned long long int *) &Victim, Delta );
  }

  __syncthreads();

  if ( threadIdx.x  ==  0 )  *d  =      Victim;   // report result to main()
//if ( threadIdx.x  ==  0 )  *d  =  *VolVictim;

// this produces the same result as the printout in main():
//  if ( threadIdx.x  ==  0 ) {  if  ( Victim  <  BigValue ) printf( "got bad value: %llx on launch %x   <-- from k1()\n", Victim, I_Launch ); }

}


// Driver as modified in this reply: the "uninteresting" Run 0 (k0) and Run 2
// (atomicExch) kernel launches are dummied out with direct host writes of
// BigValue, leaving only Run 1 actually launching k1() with the volatile write.
int main(){

  unsigned long long int *d;   // single managed word the kernel reports through
  cudaMallocManaged(&d, sizeof(*d));

  for ( int Run   =  0; Run < 3; ++ Run ) {
    if (    Run  ==  0 )  std::cout << "\nStarting run that repeatedly launches k0():" << std::endl;
    if (    Run  ==  1 )  std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl;
    if (    Run  ==  2 )  std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):"   << std::endl;

    int   N_Bad   =  0;      // used to abort after too many bad values
    bool  WasBad  =  false;  // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration

    for ( int i = 0; i <  1'000; ++ i ) {
      if ( N_Bad >= 30 ) { std::cout << "Aborting after 30 bad values in " << std::dec << i << " launches!  ***** " << std::endl;  break; }
      *d = InitValue;
      if ( Run  ==  0 )  *d = BigValue;           // k0 launch dummied out: host writes BigValue directly
      if ( Run  ==  1 )  k1<<<  1,1024>>>( d,  true, i );    // true  --> volatile write  -- often  leads to "bad value"
      if ( Run  ==  2 )  *d = BigValue;    // atomicExch() launch dummied out: host writes BigValue directly
      cudaDeviceSynchronize();

// report: show all bad values, a sample of good values, and the final value
      std::cout << std::hex;
      if      ( *d  <  BigValue          ) { std::cout << "got bad value: 0x" << *d << " on launch 0x" << i << "  ***** " << std::endl;  ++ N_Bad;  WasBad  =  true; }
      else if (  WasBad  ||
                ( ( i & 0x0ff )  ==  0 ) ) { std::cout << "got ok  value: 0x" << *d << " on launch 0x" << i <<               std::endl;  WasBad  =  false; }  // occasional updates
    }
    std::cout << std::hex;                   std::cout << "final   value: 0x" << *d << std::endl;
  }
}
# nvcc -o t157 t157.cu
# ./t157

Starting run that repeatedly launches k0():
got ok  value: 0x7000000000000000 on launch 0x0
got ok  value: 0x7000000000000000 on launch 0x100
got ok  value: 0x7000000000000000 on launch 0x200
got ok  value: 0x7000000000000000 on launch 0x300
final   value: 0x7000000000000000

Starting run that repeatedly launches k1() with volatile write,
(debug build yields only a few good values -- try another run for examples):
got ok  value: 0x700ff620000ff620 on launch 0x0
got ok  value: 0x700ffac0000ffac0 on launch 0x100
got ok  value: 0x700ffac0000ffac0 on launch 0x200
got ok  value: 0x700ff520000ff520 on launch 0x300
final   value: 0x700ff7a0000ff7a0

Starting run that repeatedly launches k1() with atomicExch(),
(debug build runs very slow):
got ok  value: 0x7000000000000000 on launch 0x0
got ok  value: 0x7000000000000000 on launch 0x100
got ok  value: 0x7000000000000000 on launch 0x200
got ok  value: 0x7000000000000000 on launch 0x300
final   value: 0x7000000000000000
#

Linux, CUDA 12.2, L4 GPU

And although it is much much slower, I get similar results with a debug (-G) build. Since you’re on windows on a device that could be subject to WDDM timeouts, I would also encourage proper CUDA error checking.

Thanks for your quick reply.

I am using 12.2.
Project properties say “compute_52,sm_52” and “Yes” for generate relocatable code.

I will try adding the error checks tomorrow.

Meanwhile, I fixed the lines you mentioned, first as you did and then using const_cast, which I suppose is the right way to do it.

The result for me is the same in all cases.

In the revised code, I disabled the extraneous k0() run, but left in both k1() runs to illustrate that one works and the other does not (on my machine).

BTW, for anyone who may object to casting away the volatile when calling the atomic…()s, the code that is commented out hints at the attempts I made with a shared victim variable that was not declared volatile and a pointer to it that was declared volatile (VolVictim).

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>


const unsigned long long int  InitValue   =  0x1000'0000'0000'0000ull;      // the value that initializes the volatile variable
const unsigned long long int  BigValue    =  0x7000'0000'0000'0000ull;      // the amount written by the one volatile write (or atomicExch())
const unsigned long long int  Delta       =  0x0000'0001'0000'0001ull;      // the amount added by each atomicAdd() -- note the two 1s
const int                     N_AddPerThr =  0x0400;                        // all threads (except one) loop this many times and do one atomicAdd() per loop


// __global__ void k0( volatile unsigned long long int *d );   // no longer used

// for this kernel see:
//   https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory

// lines commented out below do not significantly change main()'s reports, but changing the VolatileWrite argument at the time of launch does

// troublesome kernel:
// k1() often leads to "bad value" reports from main()
// k1() uses d only to report the final value of Victim
// k1(): single block mixes many atomicAdd()s on a shared-memory counter (Victim)
// with one store from thread 0 — a plain volatile store when VolatileWrite is
// true, an atomicExch() otherwise.  d only reports Victim's final value.
// (This revision uses const_cast to strip the volatile qualifier for the atomics.)
__global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) {

  // shared-memory counter targeted by both the atomics and the volatile store
  __shared__  volatile  unsigned long long int      Victim;
//__shared__            unsigned long long int      Victim;
//volatile              unsigned long long int*  VolVictim  =  const_cast<volatile unsigned long long int*>( & Victim );

  if ( blockIdx.x   !=  0 )  return;

  if ( threadIdx.x  ==  0 )      Victim  =  InitValue;
//if ( threadIdx.x  ==  0 )  *VolVictim  =  InitValue;

  __syncthreads();

  // Thread 0: delay, then attempt to reset the counter to BigValue.
  if ( threadIdx.x  ==  0 ) {
    while ( ( clock() & 0x03f0 )  !=  0 );        // wait a random few ns

    if ( VolatileWrite ) {
        Victim  =  BigValue;                                         // this does not always have the intended effect, which is to reset the count to BigValue
//  *VolVictim  =  BigValue;
    }
    else  atomicExch( const_cast<unsigned long long int*>( & Victim ), BigValue );     // this has the intended effect, and main() reports only "ok value"
  }

  // All other threads: N_AddPerThr atomic adds each, concurrent with thread 0's store.
  else {
//else if ( threadIdx.x > 31 ) {
    for ( int i = 0; i < N_AddPerThr; i++ )  atomicAdd( const_cast<unsigned long long int*>( & Victim ), Delta );
  }

  __syncthreads();

  if ( threadIdx.x  ==  0 )  *d  =      Victim;   // report result to main()
//if ( threadIdx.x  ==  0 )  *d  =  *VolVictim;

// this produces the same result as the printout in main():
//  if ( threadIdx.x  ==  0 ) {  if  ( Victim  <  BigValue ) printf( "got bad value: %llx on launch %x   <-- from k1()\n", Victim, I_Launch ); }

}


// Driver for this revision: Run starts at 1, disabling the extraneous k0() run,
// but both k1() runs are kept to contrast the volatile write (Run 1, fails) with
// atomicExch() (Run 2, succeeds) — see the output following this listing.
int main(){

  unsigned long long int *d;   // single managed word the kernel reports through
  cudaMallocManaged(&d, sizeof(*d));

  for ( int Run   =  1; Run < 3; ++ Run ) {   // Run 0 (k0) intentionally skipped
    if (    Run  ==  0 )  std::cout << "\nStarting run that repeatedly launches k0():" << std::endl;
    if (    Run  ==  1 )  std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl;
    if (    Run  ==  2 )  std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):"   << std::endl;

    int   N_Bad   =  0;      // used to abort after too many bad values
    bool  WasBad  =  false;  // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration

    for ( int i = 0; i <  1'000; ++ i ) {
      if ( N_Bad >= 30 ) { std::cout << "Aborting after 30 bad values in " << std::dec << i << " launches!  ***** " << std::endl;  break; }
      *d = InitValue;
//    if ( Run  ==  0 )  k0<<< 20,1024>>>( d    );           // always leads to "ok  value"
      if ( Run  ==  1 )  k1<<<  1,1024>>>( d,  true, i );    // true  --> volatile write  -- often  leads to "bad value"
      if ( Run  ==  2 )  k1<<<  1,1024>>>( d, false, i );    // false --> atomicExch()    -- always leads to "ok  value"
      cudaDeviceSynchronize();

// report: show all bad values, a sample of good values, and the final value
      std::cout << std::hex;
      if      ( *d  <  BigValue          ) { std::cout << "got bad value: 0x" << *d << " on launch 0x" << i << "  ***** " << std::endl;  ++ N_Bad;  WasBad  =  true; }
      else if (  WasBad  ||
                ( ( i & 0x0ff )  ==  0 ) ) { std::cout << "got ok  value: 0x" << *d << " on launch 0x" << i <<               std::endl;  WasBad  =  false; }  // occasional updates
    }
    std::cout << std::hex;                   std::cout << "final   value: 0x" << *d << std::endl;
  }
}


Output with release build, which shows:

one problematic run of k1() launches (volatile write)
one successful run of k1() launches (atomicExch())

A 1 in the MS nybble is a result of initialization.
A 7 in the MS nybble indicates the volatile write succeeded.
The patterns in the non-MS nybbles are the result of atomicAdd()s.
See the source code for an explanation of the 0xffc00 pattern seen in bad values.


Starting run that repeatedly launches k1() with volatile write,
(debug build yields only a few good values -- try another run for examples):
got bad value: 0x100ffc00000ffc00 on launch 0x0  *****
got ok  value: 0x700ff700000ff700 on launch 0x1
got bad value: 0x100ffc00000ffc00 on launch 0x2  *****
got ok  value: 0x700ff500000ff500 on launch 0x3
got bad value: 0x100ffc00000ffc00 on launch 0x5  *****
got ok  value: 0x700ffa40000ffa40 on launch 0x6
got bad value: 0x100ffc00000ffc00 on launch 0xa  *****
got bad value: 0x100ffc00000ffc00 on launch 0xb  *****
got ok  value: 0x700ffbc0000ffbc0 on launch 0xc
got bad value: 0x100ffc00000ffc00 on launch 0xd  *****
got ok  value: 0x700ffb20000ffb20 on launch 0xe
got bad value: 0x100ffc00000ffc00 on launch 0xf  *****
got bad value: 0x100ffc00000ffc00 on launch 0x10  *****
got ok  value: 0x700ffaa0000ffaa0 on launch 0x11
got bad value: 0x100ffc00000ffc00 on launch 0x12  *****
got bad value: 0x100ffc00000ffc00 on launch 0x13  *****
got ok  value: 0x700ff880000ff880 on launch 0x14
got bad value: 0x100ffc00000ffc00 on launch 0x15  *****
got bad value: 0x100ffc00000ffc00 on launch 0x16  *****
got ok  value: 0x700ffb20000ffb20 on launch 0x17
got bad value: 0x100ffc00000ffc00 on launch 0x19  *****
got bad value: 0x100ffc00000ffc00 on launch 0x1a  *****
got ok  value: 0x700ff820000ff820 on launch 0x1b
got bad value: 0x100ffc00000ffc00 on launch 0x1c  *****
got ok  value: 0x700ffaa0000ffaa0 on launch 0x1d
got bad value: 0x100ffc00000ffc00 on launch 0x1e  *****
got ok  value: 0x700ff900000ff900 on launch 0x1f
got bad value: 0x100ffc00000ffc00 on launch 0x21  *****
got bad value: 0x100ffc00000ffc00 on launch 0x22  *****
got bad value: 0x100ffc00000ffc00 on launch 0x23  *****
got ok  value: 0x700fec00000fec00 on launch 0x24
got bad value: 0x100ffc00000ffc00 on launch 0x25  *****
got ok  value: 0x700ff940000ff940 on launch 0x26
got bad value: 0x100ffc00000ffc00 on launch 0x27  *****
got bad value: 0x100ffc00000ffc00 on launch 0x28  *****
got ok  value: 0x700ffb20000ffb20 on launch 0x29
got bad value: 0x100ffc00000ffc00 on launch 0x2a  *****
got ok  value: 0x700ff5e0000ff5e0 on launch 0x2b
got bad value: 0x100ffc00000ffc00 on launch 0x2f  *****
got ok  value: 0x700ff4e0000ff4e0 on launch 0x30
got bad value: 0x100ffc00000ffc00 on launch 0x31  *****
got bad value: 0x100ffc00000ffc00 on launch 0x32  *****
got ok  value: 0x700ffa80000ffa80 on launch 0x33
got bad value: 0x100ffc00000ffc00 on launch 0x34  *****
got ok  value: 0x700ffb80000ffb80 on launch 0x35
got bad value: 0x100ffc00000ffc00 on launch 0x36  *****
got ok  value: 0x700ff6c0000ff6c0 on launch 0x37
got bad value: 0x100ffc00000ffc00 on launch 0x38  *****
got bad value: 0x100ffc00000ffc00 on launch 0x39  *****
Aborting after 30 bad values in 58 launches!  *****
final   value: 0x100ffc00000ffc00

Starting run that repeatedly launches k1() with atomicExch(),
(debug build runs very slow):
got ok  value: 0x700ff500000ff500 on launch 0x0
got ok  value: 0x700ffa20000ffa20 on launch 0x100
got ok  value: 0x700ffa20000ffa20 on launch 0x200
got ok  value: 0x700ffb00000ffb00 on launch 0x300
final   value: 0x700ffa80000ffa80

PS, the build output looks like:

1>------ Rebuild All started: Project: Exper05_VolatileAtomic, Configuration: Release x64 ------
1>Compiling CUDA source file main_exper05.cu...
1>
1>C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\nvcc.exe" -gencode=arch=compute_52,code=\"sm_52,compute_52\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -rdc=true  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\include"     --keep-dir x64\Release  -maxrregcount=0   --machine 64 --compile -cudart static    -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS   /MD " -Xcompiler "/Fdx64\Release\vc141.pdb" -o C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\x64\Release\main_exper05.cu.obj "C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\main_exper05.cu"
1>main_exper05.cu
1>tmpxft_00007448_00000000-7_main_exper05.cudafe1.cpp
1>
1>C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\nvcc.exe" -dlink  -o x64\Release\Exper05_VolatileAtomic.device-link.obj -Xcompiler "/EHsc /W3 /nologo /O2   /MD " -Xcompiler "/Fdx64\Release\vc141.pdb" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin/crt" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\lib\x64" cudart_static.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib cudart.lib cudadevrt.lib  -gencode=arch=compute_52,code=sm_52   C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\x64\Release\main_exper05.cu.obj
1>cudart_static.lib
1>kernel32.lib
1>user32.lib
1>gdi32.lib
1>winspool.lib
1>comdlg32.lib
1>advapi32.lib
1>shell32.lib
1>ole32.lib
1>oleaut32.lib
1>uuid.lib
1>odbc32.lib
1>odbccp32.lib
1>cudart.lib
1>cudadevrt.lib
1>main_exper05.cu.obj
1>   Creating library C:\Users\Steve\source\repos\Expers\x64\Release\Exper05_VolatileAtomic.lib and object C:\Users\Steve\source\repos\Expers\x64\Release\Exper05_VolatileAtomic.exp
1>LINK : warning LNK4098: defaultlib 'LIBCMT' conflicts with use of other libs; use /NODEFAULTLIB:library
1>LINK : /LTCG specified but no code generation required; remove /LTCG from the link command line to improve linker performance
1>Exper05_VolatileAtomic.vcxproj -> C:\Users\Steve\source\repos\Expers\x64\Release\Exper05_VolatileAtomic.exe
1>Done building project "Exper05_VolatileAtomic.vcxproj".
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

Still no repro with your code in my setup. The only suggestion I have is to change the build to match your GPU. GTX 1070 is a cc6.1 GPU, you are compiling for cc5.2

No, it should not matter, its just an experiment.

Thanks Robert,

I just tried with 6.1. Same results.

I confirmed the 6.1 in the build output. Excerpt:

1>Compiling CUDA source file main_exper05.cu...
1>
1>C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\nvcc.exe" -gencode=arch=compute_61,code=\"sm_61,compute_61\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -rdc=true  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\include"     --keep-dir x64\Release  -maxrregcount=0   --machine 64 --compile -cudart static    -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS   /MD " -Xcompiler "/Fdx64\Release\vc141.pdb" -o C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\x64\Release\main_exper05.cu.obj "C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\main_exper05.cu"
1>main_exper05.cu

I wonder, too, why I could compile the admittedly strange type cast in my original code without errors. One would think the front end of the compiler would be the same on Windows and Linux.

I was able to reproduce your observation running on a cc3.0 device, on linux, with CUDA 10.2. I don’t have anything further to report at the moment.

(Shared atomics prior to Maxwell were somewhat of a different animal, and now that I think about it I may have to amend my statements about mixing shared atomics with ordinary shared activity pre-Maxwell. I think there may have been some warnings about such a hazard in that case. But unfortunately my Maxwell machine just died on me. It will probably be a while before I can investigate further.)

Ok, glad to see I was not imagining things.

For the record, I added --version to my build options and got:

1>nvcc: NVIDIA (R) Cuda compiler driver
1>Copyright (c) 2005-2023 NVIDIA Corporation
1>Built on Tue_Jun_13_19:42:34_Pacific_Daylight_Time_2023
1>Cuda compilation tools, release 12.2, V12.2.91
1>Build cuda_12.2.r12.2/compiler.32965470_0

For completeness, here is the code with error checking (and correct type cast):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>


// Experiment tunables. The bit patterns are chosen so a lost reset is visible:
// a correct run ends at or above BigValue (high nibble 0x7), while a lost
// volatile write leaves only the accumulated atomicAdd() sum (high nibble 0x1).
const unsigned long long int  InitValue   =  0x1000'0000'0000'0000ull;      // the value that initializes the volatile variable
const unsigned long long int  BigValue    =  0x7000'0000'0000'0000ull;      // the amount written by the one volatile write (or atomicExch())
const unsigned long long int  Delta       =  0x0000'0001'0000'0001ull;      // the amount added by each atomicAdd() -- note the two 1s
const int                     N_AddPerThr =  0x0400;                        // all threads (except one) loop this many times and do one atomicAdd() per loop


// __global__ void k0( volatile unsigned long long int *d );   // no longer used

// for this kernel see:
//   https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory

// lines commented out below do not significantly change main()'s reports, but changing the VolatileWrite argument at the time of launch does

// troublesome kernel:
// k1() often leads to "bad value" reports from main()
// k1() uses d only to report the final value of Victim
// k1 -- demonstrates the hazard of mixing a plain/volatile store with
// concurrent atomicAdd()s on the same shared-memory location.
// Expected launch: <<< 1, 1024 >>> (main() launches it this way; the
// blockIdx.x != 0 guard below makes extra blocks exit immediately).
//   d             -- managed/global slot used only to report the final value of Victim
//   VolatileWrite -- true: thread 0 resets Victim with a volatile store (sometimes lost);
//                    false: thread 0 resets Victim with atomicExch() (always works)
//   I_Launch      -- launch index; only used by the commented-out in-kernel printf
__global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) {

  // the contended shared-memory counter; volatile so the compiler does not cache it
  __shared__  volatile  unsigned long long int      Victim;
//__shared__            unsigned long long int      Victim;
//volatile              unsigned long long int*  VolVictim  =  const_cast<volatile unsigned long long int*>( & Victim );

  // early return before any __syncthreads(): safe only because with the
  // <<<1,1024>>> launch no thread actually takes this path
  if ( blockIdx.x   !=  0 )  return;

  if ( threadIdx.x  ==  0 )      Victim  =  InitValue;
//if ( threadIdx.x  ==  0 )  *VolVictim  =  InitValue;

  __syncthreads();   // make the initialization visible to all threads before the race starts

  if ( threadIdx.x  ==  0 ) {
    // spin until the low clock bits align, so the reset lands at an
    // unpredictable point within the other threads' atomicAdd() storm
    while ( ( clock() & 0x03f0 )  !=  0 );        // wait a random few ns

    if ( VolatileWrite ) {
        Victim  =  BigValue;                                         // this does not always have the intended effect, which is to reset the count to BigValue
//  *VolVictim  =  BigValue;
    }
    else  atomicExch( const_cast<unsigned long long int*>( & Victim ), BigValue );     // this has the intended effect, and main() reports only "ok value"
  }

  else {
//else if ( threadIdx.x > 31 ) {
    // the other 1023 threads hammer Victim with atomicAdd()s, racing the reset above
    for ( int i = 0; i < N_AddPerThr; i++ )  atomicAdd( const_cast<unsigned long long int*>( & Victim ), Delta );
  }

  __syncthreads();   // all adds (and the reset) complete before the result is read

  if ( threadIdx.x  ==  0 )  *d  =      Victim;   // report result to main()
//if ( threadIdx.x  ==  0 )  *d  =  *VolVictim;

// this produces the same result as the printout in main():
//  if ( threadIdx.x  ==  0 ) {  if  ( Victim  <  BigValue ) printf( "got bad value: %llx on launch %x   <-- from k1()\n", Victim, I_Launch ); }

}


// Host driver: repeatedly launches k1() and reports any launch whose final
// value is below BigValue, i.e. a launch where the reset was lost.
//   Run == 1 -> k1() with the volatile write   (often produces "bad value")
//   Run == 2 -> k1() with atomicExch()         (always produces "ok  value")
// (Run == 0 / k0() is retired; the loop deliberately starts at 1.)
// Returns 0 on success, 1 on any CUDA API failure.
int main(){

  cudaError_t   ce;

  // managed allocation readable by both host and device;
  // k1() stores the final value of its shared-memory counter here
  unsigned long long int *d;
       ce  = cudaMallocManaged(&d, sizeof(*d));
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaMallocManaged() failed: " << cudaGetErrorString(ce);    return 1;
  }


  for ( int Run   =  1; Run < 3; ++ Run ) {
    if (    Run  ==  0 )  std::cout << "\nStarting run that repeatedly launches k0():" << std::endl;
    if (    Run  ==  1 )  std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl;
    if (    Run  ==  2 )  std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):"   << std::endl;

    int   N_Bad   =  0;      // used to abort after too many bad values
    bool  WasBad  =  false;  // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration

    for ( int i = 0; i <  1'000; ++ i ) {
      if ( N_Bad >= 30 ) { std::cout << "Aborting after 30 bad values in " << std::dec << i << " launches!  ***** " << std::endl;  break; }
      *d = InitValue;
//    if ( Run  ==  0 )  k0<<< 20,1024>>>( d    );           // always leads to "ok  value"
      if ( Run  ==  1 )  k1<<<  1,1024>>>( d,  true, i );    // true  --> volatile write  -- often  leads to "bad value"
      if ( Run  ==  2 )  k1<<<  1,1024>>>( d, false, i );    // false --> atomicExch()    -- always leads to "ok  value"

      // catch launch-configuration errors immediately after the launch
           ce  = cudaGetLastError();
      if ( ce != cudaSuccess ) {
        std::cout  << "kernel launch failed: " << cudaGetErrorString(ce);    cudaFree(d);    return 1;
      }

      // catch asynchronous execution errors and make *d safe to read on the host
           ce  = cudaDeviceSynchronize();
      if ( ce != cudaSuccess ) {
        std::cout  << "cudaDeviceSynchronize() failed: " << cudaGetErrorString(ce);    cudaFree(d);    return 1;
      }

// report: show all bad values, a sample of good values, and the final value
      std::cout << std::hex;
      if      ( *d  <  BigValue          ) { std::cout << "got bad value: 0x" << *d << " on launch 0x" << i << "  ***** " << std::endl;  ++ N_Bad;  WasBad  =  true; }
      else if (  WasBad  ||
                ( ( i & 0x0ff )  ==  0 ) ) { std::cout << "got ok  value: 0x" << *d << " on launch 0x" << i <<               std::endl;  WasBad  =  false; }  // occasional updates
    }
    std::cout << std::hex;                   std::cout << "final   value: 0x" << *d << std::endl;
  }

  cudaFree(d);   // fix: release the managed allocation (was previously leaked)
  return 0;
}