illegal memory access in CUDA

I have encountered some memory access issue in CUDA. The core of my code is

long long addr0,addr1;
addr0=(long long)my_array;
addr1 = ( addr0 ^ (1 << position));
long long *r_addr0, *r_addr1;
r_addr0 = (long long *)addr0;
r_addr1 = (long long *)addr1;
i = *r_addr0;
j = *r_addr1;

Where my_array is the address of device array.

I store the address of my_array in r_addr0, then i flip the bit of r_addr0 one by one. e.g.

0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0000  0  addr of my_array
0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0001  1  flip last bit
0000 0000 1011 0100 0011 1111 1110 0000 0000 0000 0000  31 flip 31 bit.

I print the address of r_addr0 and r_addr1 each time, and it works well for the first 31 bits, but i encountered illegal memory address issue after 32 bits. I am using Tesla K80 which has 12GB memory on board.

[b]Does anyone know how to figure out this problem.

Complete code see below:[/b]

# include <stdio.h>
# include <stdint.h>
# include "cuda_runtime.h"

//compile nvcc test.cu -o test

__global__ void global_latency (int * my_array, int position, int *d_time);
int row_bits(int * h_a, long long N, int pos, int * h_time);

int main(){
  cudaSetDevice(0);
  long long i, N;
  int *h_a;
  int h_time0;
  int h_time1;
  int *h_time;
  N = 2*1024*1024*1024L;//2G elements, 4 bytes per element, 8 GB memory used.
  printf("\n=====%10.4f GB array with %d GB elements,discover row bits====\n", sizeof(int)*(float)N/1024/1024/1024,N/1024/1024/1024);
  /* allocate arrays on CPU */
  h_a = (int *)malloc(sizeof(int) * N);
  h_time = (int *)malloc(sizeof(int)*N);

 /* initialize array elements*/
  for (i=0L; i<N; i++){
    h_a[i] = i%(1024*1024);
  }

  for (int k=0;k<2;k++){
    h_time[k]=0;
  }
  printf("... ... ...\n... ... ...\n");
  for (int pos = 0; pos < 64; pos++){
    h_time0=0;
    h_time1=0;
   for (int j=0;j<5;j++){  
     row_bits(h_a,N,pos,h_time);
     h_time0 +=h_time[0];
     h_time1 +=h_time[1];
   }
   printf("position = %d, time0 = %d, time1 = %d\n", pos+1,h_time0/5, h_time1/5);
  }
  printf("===============================================\n\n");
  free(h_a);
  return 0;
}

int row_bits(int * h_a, long long N, int pos, int * h_time) {
  cudaError_t error_id;
  int *d_a; 
  /* allocate arrays on GPU */
  error_id = cudaMalloc ((void **) &d_a, sizeof(int) * N);
  if (error_id != cudaSuccess) {
printf("Error 1.0 is %s\n", cudaGetErrorString(error_id));
  }
  /* copy array elements from CPU to GPU */
  error_id = cudaMemcpy(d_a, h_a, sizeof(int) * N, cudaMemcpyHostToDevice);
  if (error_id != cudaSuccess) {
    printf("Error 1.1 is %s\n", cudaGetErrorString(error_id));
  }

  //int *h_time = (int *)malloc(sizeof(int));
  int  *d_time;
  error_id = cudaMalloc ((void **) &d_time, 4*sizeof(int));
  if (error_id != cudaSuccess)
    printf("Error 1.2 is %s\n", cudaGetErrorString(error_id));

  cudaThreadSynchronize ();
  /* launch kernel*/
  dim3 Db = dim3(1);
  dim3 Dg = dim3(1,1,1);

  global_latency <<<Dg, Db>>>(d_a, pos,d_time);

  cudaThreadSynchronize ();

  error_id = cudaGetLastError();
  if (error_id != cudaSuccess) {
    printf("Error kernel is %s\n", cudaGetErrorString(error_id));
  }

  /* copy results from GPU to CPU */
  cudaThreadSynchronize ();

  error_id = cudaMemcpy((void *)h_time, (void *)d_time, 4*sizeof(int),     cudaMemcpyDeviceToHost);
  if (error_id != cudaSuccess) {
    printf("Error 2.0 is %s\n", cudaGetErrorString(error_id));
  }
  cudaThreadSynchronize ();

  /* free memory on GPU */
  cudaFree(d_a);
  cudaFree(d_time);

cudaDeviceReset();
  return 0; 
}

__global__ void global_latency (int * my_array, int position, int *d_time) {

  //int tid = blockIdx.x*blockDim.x+threadIdx.x;

  int start_time=0;
  int mid_time=0;
  int end_time=0;

__shared__ int s_tvalue[2];//2: number of threads per block

  int i, j;
  s_tvalue[0]=0;
  s_tvalue[1]=0;
  long long addr0,addr1;
  //printf("%p\n",my_array);
  //int * p = (int *)0x0;
  //addr0 = (long long)p;
  addr0=(long long)my_array;
  //printf("Address i :%p\n",addr0);
  addr1 = ( addr0 ^ (1 << position));
  //printf("Address i':%p\n",addr1);
  //start_time = clock();
  long long *r_addr0, *r_addr1;
  r_addr0 = (long long *)addr0;
  r_addr1 = (long long *)addr1;

  start_time = clock();

  i = *r_addr0;
  s_tvalue[0] = i;
  mid_time = clock(); 
  j = *r_addr1;
  s_tvalue[1] = j;
  //printf("%p",p);
  //k =(int)p;
  //printf("%d\n",k);

  //printf("%d",k);
  //__syncthreads();
  end_time = clock();

  d_time[0] = mid_time-start_time;
  d_time[1] = end_time-mid_time;
  d_time[2] = s_tvalue[0];
  //printf("[%p]=%lld\n",addr0,d_time[1]);
  d_time[3] = s_tvalue[1];
  //printf("[%p]=%lld\n",addr1,d_time[2]);  
}

cuda requires accesses be naturally aligned. This is covered in the documentation. This means for accessing a long quantity (8 bytes), the address must fall on an 8-byte boundary.

You can’t randomly flip bits in a pointer to a long quantity and expect that the resulting pointer be properly aligned.

It might work in CPU code. It does not work in GPU code.