Mixing volatile writes and atomicAdd()s with shared memory produces unexpected results

For completeness, here is the code with error checking (and correct type cast):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>


// Experiment parameters, used by both the kernel and main().

// Value the shared counter (and *d on the host side) starts from before each launch.
constexpr unsigned long long InitValue = 0x1000'0000'0000'0000ull;

// Value stored once per launch by thread 0, either with a plain volatile write or with atomicExch().
constexpr unsigned long long BigValue = 0x7000'0000'0000'0000ull;

// Increment applied by every atomicAdd(); it carries a 1 in the low 32-bit half and a 1 in the high 32-bit half.
constexpr unsigned long long Delta = 0x0000'0001'0000'0001ull;

// Number of loop iterations (one atomicAdd() each) performed by every thread other than thread 0.
constexpr int N_AddPerThr = 0x400;


// __global__ void k0( volatile unsigned long long int *d );   // no longer used

// for this kernel see:
//   https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory

// the commented-out lines below do not significantly change main()'s reports, but changing the VolatileWrite argument at launch time does

// Troublesome kernel: demonstrates that a plain volatile write to a __shared__ 64-bit variable can be
// lost when other threads of the block are concurrently doing atomicAdd()s on the same variable.
//
// What the kernel does:
//   1. thread 0 initializes the shared counter Victim to InitValue;
//   2. after a barrier, thread 0 waits a little and then stores BigValue into Victim, while every
//      other thread performs N_AddPerThr atomicAdd()s of Delta on Victim;
//   3. after a second barrier, thread 0 copies the final value of Victim to *d.
//
// Parameters:
//   d             -- output only: receives the final value of Victim (written by thread 0 of block 0)
//   VolatileWrite -- true:  thread 0 stores BigValue with a plain volatile write
//                    false: thread 0 stores BigValue with atomicExch()
//   I_Launch      -- launch index supplied by the caller; only referenced by the commented-out
//                    printf() at the end of the kernel
//
// Only block 0 does any work; threads of all other blocks return immediately.
// main() treats a final value below BigValue as a "bad value", i.e., as evidence that the store of
// BigValue did not survive; k1() often leads to such reports when VolatileWrite is true.
// NOTE(review): nothing in this file shows why the plain store can be lost; presumably the 64-bit
// shared-memory atomicAdd() is not a single indivisible hardware operation with respect to ordinary
// stores -- confirm against the CUDA documentation for the target architecture.
__global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) {

  // the counter under test: one 64-bit word shared by all threads of the block
  __shared__  volatile  unsigned long long int      Victim;
// alternative that was tried: non-volatile declaration, accessed through a volatile pointer
//__shared__            unsigned long long int      Victim;
//volatile              unsigned long long int*  VolVictim  =  const_cast<volatile unsigned long long int*>( & Victim );

  // every block except block 0 exits here, before any barrier
  if ( blockIdx.x   !=  0 )  return;

  if ( threadIdx.x  ==  0 )      Victim  =  InitValue;
//if ( threadIdx.x  ==  0 )  *VolVictim  =  InitValue;

  // make the initial value visible to all threads before the contended phase starts
  __syncthreads();

  if ( threadIdx.x  ==  0 ) {
    while ( ( clock() & 0x03f0 )  !=  0 );        // spin until bits 4..9 of clock() are all zero, so the write below lands at a varying point among the other threads' atomicAdd()s

    if ( VolatileWrite ) {
        Victim  =  BigValue;                                         // plain volatile store; intended to reset the count to BigValue, but this does not always have that effect
//  *VolVictim  =  BigValue;
    }
    else  atomicExch( const_cast<unsigned long long int*>( & Victim ), BigValue );     // atomic store; this has the intended effect, and main() reports only "ok value"
  }

  else {
// alternative that was tried: let only threads 32 and above do the atomicAdd()s
//else if ( threadIdx.x > 31 ) {
    // the const_cast strips volatile, because atomicAdd() takes a non-volatile pointer
    for ( int i = 0; i < N_AddPerThr; i++ )  atomicAdd( const_cast<unsigned long long int*>( & Victim ), Delta );
  }

  // wait until thread 0's store and all atomicAdd()s have been issued
  __syncthreads();

  if ( threadIdx.x  ==  0 )  *d  =      Victim;   // report result to main()
//if ( threadIdx.x  ==  0 )  *d  =  *VolVictim;

// this produces the same result as the printout in main():
//  if ( threadIdx.x  ==  0 ) {  if  ( Victim  <  BigValue ) printf( "got bad value: %llx on launch %x   <-- from k1()\n", Victim, I_Launch ); }

}


// Host driver: repeatedly launches k1() and reports whether thread 0's store of BigValue survived.
//
// Run 1 launches k1() with VolatileWrite = true (plain volatile write), run 2 with
// VolatileWrite = false (atomicExch()).  Each launch uses 1 block of 1024 threads.  After every
// launch the result in *d is compared with BigValue; a result below BigValue is reported as a
// "bad value".  All bad values are printed, plus a sample of good values (the first one after a
// bad value and every 0x100th launch), and finally the last value seen in the run.  A run is
// abandoned after MaxBad bad values.
//
// Returns 0 on success and 1 if any CUDA runtime call or kernel launch fails.  The managed
// allocation is released on every path that reaches the end of the function.
int main(){

  const int     MaxBad      =  30;       // a run is abandoned once this many bad values were seen
  const int     N_Launches  =  1'000;    // kernel launches per run

  cudaError_t   ce;
  int           ExitCode    =  0;        // becomes 1 as soon as a CUDA call fails

  // single managed 64-bit word: written by the host before each launch, by k1() during it
  unsigned long long int *d  =  nullptr;
       ce  = cudaMallocManaged(&d, sizeof(*d));
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaMallocManaged() failed: " << cudaGetErrorString(ce) << std::endl;    return 1;
  }

  // runs start at 1: run 0 used to launch k0(), which is no longer part of this program
  for ( int Run   =  1; Run < 3  &&  ExitCode == 0; ++ Run ) {
    const bool  VolatileWrite  =  ( Run  ==  1 );    // run 1 --> volatile write, run 2 --> atomicExch()

    if ( VolatileWrite )  std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl;
    else                  std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):"   << std::endl;

    int   N_Bad   =  0;      // used to abort after too many bad values
    bool  WasBad  =  false;  // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration

    for ( int i = 0; i <  N_Launches; ++ i ) {
      // the stream is in hex mode from the previous report, hence the explicit std::dec
      if ( N_Bad >= MaxBad ) { std::cout << "Aborting after " << std::dec << MaxBad << " bad values in " << i << " launches!  ***** " << std::endl;  break; }

      *d = InitValue;
      k1<<<  1,1024>>>( d, VolatileWrite, i );    // true --> often leads to "bad value";  false --> always leads to "ok  value"

      // launch-configuration errors surface here
           ce  = cudaGetLastError();
      if ( ce != cudaSuccess ) {
        std::cout  << "kernel launch failed: " << cudaGetErrorString(ce) << std::endl;    ExitCode  =  1;  break;
      }

      // errors raised while the kernel executes surface here; also makes *d safe to read on the host
           ce  = cudaDeviceSynchronize();
      if ( ce != cudaSuccess ) {
        std::cout  << "cudaDeviceSynchronize() failed: " << cudaGetErrorString(ce) << std::endl;    ExitCode  =  1;  break;
      }

// report: show all bad values, a sample of good values, and the final value
      const unsigned long long int  Result  =  *d;
      std::cout << std::hex;
      if      ( Result  <  BigValue      ) { std::cout << "got bad value: 0x" << Result << " on launch 0x" << i << "  ***** " << std::endl;  ++ N_Bad;  WasBad  =  true; }
      else if (  WasBad  ||
                ( ( i & 0x0ff )  ==  0 ) ) { std::cout << "got ok  value: 0x" << Result << " on launch 0x" << i <<               std::endl;  WasBad  =  false; }  // occasional updates
    }

    if ( ExitCode != 0 )  break;         // after a CUDA failure do not touch *d any more

    std::cout << std::hex;                   std::cout << "final   value: 0x" << *d << std::endl;
  }

  // release the managed allocation on the success path and on the error path alike;
  // after an earlier failure the result of cudaFree() is not reported a second time
  const cudaError_t  FreeErr  =  cudaFree( d );
  if ( ExitCode == 0  &&  FreeErr != cudaSuccess ) {
    std::cout  << "cudaFree() failed: " << cudaGetErrorString(FreeErr) << std::endl;    ExitCode  =  1;
  }

  return ExitCode;
}