I have the following kernel which will generate a runtime error of misaligned memory access.
#include <stdlib.h>
#include <stdint.h>
#include "repeat.h"
#define ITER 32
#define BS 160
__global__ void chase_pointers (
uintptr_t *ptr_array,
int *output,
float b)
{
b=b-1; // make sure b is 0
float a;
int pid = threadIdx.x + blockIdx.x*blockDim.x;
int warpid = pid / 32;
int laneid = pid % 32;
union
{
uintptr_t * addr;
struct {
int low;
int high;
} myInt;
} myUnion;
int startIdx = threadIdx.x + blockIdx.x * ITER * BS;
myUnion.addr = &ptr_array[startIdx];
#pragma unroll 16
for (int i = 0; i < ITER; i++) {
myUnion.addr = (uintptr_t *)(*myUnion.addr);
a=__int_as_float(myUnion.myInt.low);
repeat32(a=a+b;);
myUnion.myInt.low = __float_as_int(a);
}
if (laneid == 0){
output[warpid] = myUnion.myInt.low;
}
}
int main (int argc, char *argv[])
{
int arrayLen = 64*1024*1024;
size_t arraySize = arrayLen*8 + BS*8;
uintptr_t *ptr_array = (uintptr_t*)malloc(arraySize);
uintptr_t *ptr_array_d = 0;
cudaMalloc ((void **)&ptr_array_d, arraySize);
/*
* The array is initialized so that
* array[i] = &array[i+N], where N is blocksize (BS)
*
*/
for (int i = 0; i < arrayLen; i++){
ptr_array[i] = (uintptr_t)&ptr_array_d[i+BS];
}
cudaMemcpy (ptr_array_d, ptr_array, arraySize, cudaMemcpyHostToDevice);
int blocks = arrayLen/BS/ITER;
int threads = BS;
int warps = max(1, blocks*threads/32);
/*
* Initialize output array, one element for each warp
*/
int *output = (int*)malloc(sizeof(int)*warps);
int *output_d;
cudaMalloc((void**)&output_d, sizeof(int)*warps);
chase_pointers<<<blocks,threads>>>(ptr_array_d, output_d, 1);
/* clean up */
cudaFree (ptr_array_d);
cudaFree (output_d);
free(ptr_array);
free(output);
cudaDeviceSynchronize();
return EXIT_SUCCESS;
}
When I use cuda-memcheck, sometimes I got the following error
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 8
========= at 0x00000908 in /u/lxzhang/GPU-Benchmark/GPU-Benchmark-Volkov-Diss/test.cu:34:chase_pointers(unsigned long*, int*, float)
========= by thread (127,0,0) in block (12906,0,0)
========= Address 0x7fd27fffffff is misaligned
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2d725e]
========= Host Frame:test [0x22992]
========= Host Frame:test [0x22b87]
========= Host Frame:test [0x56f45]
========= Host Frame:test [0x6c09]
========= Host Frame:test [0x6abd]
========= Host Frame:test [0x6b14]
========= Host Frame:test [0x68d1]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x661a]
... more errors ...
The error points to line 34, which is
myUnion.addr = (uintptr_t *)(*myUnion.addr);
It seems that myUnion.addr is not always aligned at 8 bytes. My question is how can I enforce alignment for the union so that myUnion.addr is always aligned at 8 bytes? Or is it caused by something else?
And it happens on GTX 1080 (Pascal CC=6.1) but always works fine on GTX TITAN (Kepler CC=3.5). So why is that?
The source code is compiled with nvcc-10.0 on GTX 1080.
Thanks in advance.