Tracking down CUDA illegal memory access

I’ve been trying to track down the source of a CUDA illegal memory access. I’ve cut down my code considerably to the simplest compilable example where I consistently get get illegal memory access errors:

In my main cuda file I allocate memory for a number of CUDA pointers and copy them to a constant struct, which is used to hold pointers for global memory (instead of having to pass them in as parameters to kernels; I have many more arrays not included in this example):

#include <stdio.h>
/** Include CUDA libraries. */
#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

#define NSP 53
#define NN 54
#define FWD_RATES 325
#define REV_RATES 309
#define PRES_MOD_RATES 41


#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

struct gpuMemory {
double* dy;
double* fwd_rates;
double* rev_rates;
double* pres_mod;
};

void initialize_pointer(double** ptr, int size) {
    cudaErrorCheck( cudaMalloc((void**)ptr, size * sizeof(double)) );
    cudaErrorCheck( cudaMemset(*ptr, 1, size * sizeof(double)) );
}

__constant__ gpuMemory memory_pointers;

#define CU_LINEAR_OFFSET(I) (threadIdx.x + blockIdx.x * blockDim.x + (I) * blockDim.x * gridDim.x)

__device__ void eval_spec_rates (const double * fwd_rates, const double * rev_rates, const double * pres_mod, double * sp_rates) {
  sp_rates[CU_LINEAR_OFFSET(1)] = -(fwd_rates[CU_LINEAR_OFFSET(2)] - rev_rates[CU_LINEAR_OFFSET(2)])
                                + (fwd_rates[CU_LINEAR_OFFSET(7)] - rev_rates[CU_LINEAR_OFFSET(7)])
                                + (fwd_rates[CU_LINEAR_OFFSET(38)] - rev_rates[CU_LINEAR_OFFSET(38)]) * pres_mod[CU_LINEAR_OFFSET(4)]
                                + (fwd_rates[CU_LINEAR_OFFSET(39)] - rev_rates[CU_LINEAR_OFFSET(39)])
                                + (fwd_rates[CU_LINEAR_OFFSET(40)] - rev_rates[CU_LINEAR_OFFSET(40)])
                                + (fwd_rates[CU_LINEAR_OFFSET(41)] - rev_rates[CU_LINEAR_OFFSET(41)])
                                + (fwd_rates[CU_LINEAR_OFFSET(44)] - rev_rates[CU_LINEAR_OFFSET(44)])
                                + (fwd_rates[CU_LINEAR_OFFSET(46)] - rev_rates[CU_LINEAR_OFFSET(46)])
                                + (fwd_rates[CU_LINEAR_OFFSET(48)] - rev_rates[CU_LINEAR_OFFSET(48)])
                                + (fwd_rates[CU_LINEAR_OFFSET(50)] - rev_rates[CU_LINEAR_OFFSET(50)])
                                + (fwd_rates[CU_LINEAR_OFFSET(52)] - rev_rates[CU_LINEAR_OFFSET(52)])
                                + (fwd_rates[CU_LINEAR_OFFSET(54)] - rev_rates[CU_LINEAR_OFFSET(54)])
                                + (fwd_rates[CU_LINEAR_OFFSET(57)] - rev_rates[CU_LINEAR_OFFSET(57)])
                                + (fwd_rates[CU_LINEAR_OFFSET(59)] - rev_rates[CU_LINEAR_OFFSET(59)])
                                + (fwd_rates[CU_LINEAR_OFFSET(64)] - rev_rates[CU_LINEAR_OFFSET(64)])
                                + (fwd_rates[CU_LINEAR_OFFSET(67)] - rev_rates[CU_LINEAR_OFFSET(67)])
                                + (fwd_rates[CU_LINEAR_OFFSET(68)] - rev_rates[CU_LINEAR_OFFSET(68)])
                                + (fwd_rates[CU_LINEAR_OFFSET(72)] - rev_rates[CU_LINEAR_OFFSET(72)])
                                + (fwd_rates[CU_LINEAR_OFFSET(74)] - rev_rates[CU_LINEAR_OFFSET(74)])
                                + (fwd_rates[CU_LINEAR_OFFSET(76)] - rev_rates[CU_LINEAR_OFFSET(76)])
                                + (fwd_rates[CU_LINEAR_OFFSET(77)] - rev_rates[CU_LINEAR_OFFSET(77)])
                                + (fwd_rates[CU_LINEAR_OFFSET(79)] - rev_rates[CU_LINEAR_OFFSET(79)])
                                - (fwd_rates[CU_LINEAR_OFFSET(82)] - rev_rates[CU_LINEAR_OFFSET(82)]) * pres_mod[CU_LINEAR_OFFSET(18)]
                                - (fwd_rates[CU_LINEAR_OFFSET(83)] - rev_rates[CU_LINEAR_OFFSET(83)])
                                - (fwd_rates[CU_LINEAR_OFFSET(125)] - rev_rates[CU_LINEAR_OFFSET(125)])
                                - (fwd_rates[CU_LINEAR_OFFSET(135)] - rev_rates[CU_LINEAR_OFFSET(134)])
                                + (fwd_rates[CU_LINEAR_OFFSET(136)] - rev_rates[CU_LINEAR_OFFSET(135)])
                                - (fwd_rates[CU_LINEAR_OFFSET(145)] - rev_rates[CU_LINEAR_OFFSET(144)])
                                - (fwd_rates[CU_LINEAR_OFFSET(171)] - rev_rates[CU_LINEAR_OFFSET(170)])
                                + (fwd_rates[CU_LINEAR_OFFSET(173)] - rev_rates[CU_LINEAR_OFFSET(172)]) * pres_mod[CU_LINEAR_OFFSET(26)]
                                + (fwd_rates[CU_LINEAR_OFFSET(190)] - rev_rates[CU_LINEAR_OFFSET(189)])
                                + (fwd_rates[CU_LINEAR_OFFSET(196)] - rev_rates[CU_LINEAR_OFFSET(195)])
                                + (fwd_rates[CU_LINEAR_OFFSET(201)] - rev_rates[CU_LINEAR_OFFSET(200)])
                                + (fwd_rates[CU_LINEAR_OFFSET(208)] - rev_rates[CU_LINEAR_OFFSET(207)])
                                + (fwd_rates[CU_LINEAR_OFFSET(213)] - rev_rates[CU_LINEAR_OFFSET(212)])
                                - (fwd_rates[CU_LINEAR_OFFSET(220)] - rev_rates[CU_LINEAR_OFFSET(219)])
                                + (fwd_rates[CU_LINEAR_OFFSET(265)] - rev_rates[CU_LINEAR_OFFSET(264)])
                                + (fwd_rates[CU_LINEAR_OFFSET(275)] - rev_rates[CU_LINEAR_OFFSET(274)])
                                + (fwd_rates[CU_LINEAR_OFFSET(276)] - rev_rates[CU_LINEAR_OFFSET(275)])
                                + fwd_rates[CU_LINEAR_OFFSET(283)] + fwd_rates[CU_LINEAR_OFFSET(287)]
                                - (fwd_rates[CU_LINEAR_OFFSET(288)] - rev_rates[CU_LINEAR_OFFSET(285)]) * pres_mod[CU_LINEAR_OFFSET(36)]
                                + fwd_rates[CU_LINEAR_OFFSET(292)] + (fwd_rates[CU_LINEAR_OFFSET(298)] - rev_rates[CU_LINEAR_OFFSET(290)])
                                + fwd_rates[CU_LINEAR_OFFSET(299)] + (fwd_rates[CU_LINEAR_OFFSET(308)] - rev_rates[CU_LINEAR_OFFSET(293)])
                                + (fwd_rates[CU_LINEAR_OFFSET(313)] - rev_rates[CU_LINEAR_OFFSET(298)])
                               ;

  sp_rates[CU_LINEAR_OFFSET(2)] = -(fwd_rates[CU_LINEAR_OFFSET(1)] - rev_rates[CU_LINEAR_OFFSET(1)]) * pres_mod[CU_LINEAR_OFFSET(1)]
                                + (fwd_rates[CU_LINEAR_OFFSET(2)] - rev_rates[CU_LINEAR_OFFSET(2)])
                                + (fwd_rates[CU_LINEAR_OFFSET(5)] - rev_rates[CU_LINEAR_OFFSET(5)])
                                + (fwd_rates[CU_LINEAR_OFFSET(6)] - rev_rates[CU_LINEAR_OFFSET(6)])
                                + (fwd_rates[CU_LINEAR_OFFSET(8)] - rev_rates[CU_LINEAR_OFFSET(8)])
                                + (fwd_rates[CU_LINEAR_OFFSET(9)] - rev_rates[CU_LINEAR_OFFSET(9)])
                                + (fwd_rates[CU_LINEAR_OFFSET(13)] - rev_rates[CU_LINEAR_OFFSET(13)])
                                + (fwd_rates[CU_LINEAR_OFFSET(20)] - rev_rates[CU_LINEAR_OFFSET(20)])
                                + (fwd_rates[CU_LINEAR_OFFSET(23)] - rev_rates[CU_LINEAR_OFFSET(23)])
                                + (fwd_rates[CU_LINEAR_OFFSET(27)] - rev_rates[CU_LINEAR_OFFSET(27)])
                                - (fwd_rates[CU_LINEAR_OFFSET(32)] - rev_rates[CU_LINEAR_OFFSET(32)]) * pres_mod[CU_LINEAR_OFFSET(3)]
                                - (fwd_rates[CU_LINEAR_OFFSET(33)] - rev_rates[CU_LINEAR_OFFSET(33)])
                                - (fwd_rates[CU_LINEAR_OFFSET(34)] - rev_rates[CU_LINEAR_OFFSET(34)])
                                - (fwd_rates[CU_LINEAR_OFFSET(35)] - rev_rates[CU_LINEAR_OFFSET(35)])
                                - (fwd_rates[CU_LINEAR_OFFSET(36)] - rev_rates[CU_LINEAR_OFFSET(36)])
                                - (fwd_rates[CU_LINEAR_OFFSET(37)] - rev_rates[CU_LINEAR_OFFSET(37)])
                                - 2.0 * (fwd_rates[CU_LINEAR_OFFSET(38)] - rev_rates[CU_LINEAR_OFFSET(38)]) * pres_mod[CU_LINEAR_OFFSET(4)]
                                - 2.0 * (fwd_rates[CU_LINEAR_OFFSET(39)] - rev_rates[CU_LINEAR_OFFSET(39)])
                                - 2.0 * (fwd_rates[CU_LINEAR_OFFSET(40)] - rev_rates[CU_LINEAR_OFFSET(40)])
                                - 2.0 * (fwd_rates[CU_LINEAR_OFFSET(41)] - rev_rates[CU_LINEAR_OFFSET(41)])
                                - (fwd_rates[CU_LINEAR_OFFSET(42)] - rev_rates[CU_LINEAR_OFFSET(42)]) * pres_mod[CU_LINEAR_OFFSET(5)]
                                - (fwd_rates[CU_LINEAR_OFFSET(43)] - rev_rates[CU_LINEAR_OFFSET(43)])
                                - (fwd_rates[CU_LINEAR_OFFSET(44)] - rev_rates[CU_LINEAR_OFFSET(44)])
                                - (fwd_rates[CU_LINEAR_OFFSET(45)] - rev_rates[CU_LINEAR_OFFSET(45)])
                                - (fwd_rates[CU_LINEAR_OFFSET(46)] - rev_rates[CU_LINEAR_OFFSET(46)])
                                - (fwd_rates[CU_LINEAR_OFFSET(47)] - rev_rates[CU_LINEAR_OFFSET(47)])
                                - (fwd_rates[CU_LINEAR_OFFSET(48)] - rev_rates[CU_LINEAR_OFFSET(48)])
                                - (fwd_rates[CU_LINEAR_OFFSET(49)] - rev_rates[CU_LINEAR_OFFSET(49)]) * pres_mod[CU_LINEAR_OFFSET(6)]
                                - (fwd_rates[CU_LINEAR_OFFSET(50)] - rev_rates[CU_LINEAR_OFFSET(50)])
                                - (fwd_rates[CU_LINEAR_OFFSET(51)] - rev_rates[CU_LINEAR_OFFSET(51)]) * pres_mod[CU_LINEAR_OFFSET(7)]
                                - (fwd_rates[CU_LINEAR_OFFSET(52)] - rev_rates[CU_LINEAR_OFFSET(52)])
                                - (fwd_rates[CU_LINEAR_OFFSET(53)] - rev_rates[CU_LINEAR_OFFSET(53)]) * pres_mod[CU_LINEAR_OFFSET(8)]
                                - (fwd_rates[CU_LINEAR_OFFSET(54)] - rev_rates[CU_LINEAR_OFFSET(54)])
                                ;
}
__global__ void k_eval_spec_rates()
{
    eval_spec_rates(memory_pointers.fwd_rates, memory_pointers.rev_rates, memory_pointers.pres_mod, memory_pointers.dy);
}

int main (int argc, char *argv[]) {
    int grid = 8;
    int block = 128;

    cudaErrorCheck( cudaSetDevice(0) );
    cudaErrorCheck( cudaPeekAtLastError() );
    cudaErrorCheck( cudaDeviceSynchronize() );

    int padded = grid * block;

    gpuMemory host_mem;

    initialize_pointer(&host_mem.fwd_rates, FWD_RATES * padded);
    initialize_pointer(&host_mem.rev_rates, REV_RATES * padded);
    initialize_pointer(&host_mem.pres_mod, PRES_MOD_RATES * padded);
    initialize_pointer(&host_mem.dy, NN * padded);

    cudaErrorCheck(cudaMemcpyToSymbol(memory_pointers, &host_mem, sizeof(gpuMemory)));

    k_eval_spec_rates<<<grid, block>>>();

    cudaErrorCheck( cudaPeekAtLastError() );
    cudaErrorCheck( cudaDeviceSynchronize() );

    cudaErrorCheck(cudaFree(host_mem.fwd_rates));
    cudaErrorCheck(cudaFree(host_mem.rev_rates));
    cudaErrorCheck(cudaFree(host_mem.pres_mod));
    cudaErrorCheck(cudaFree(host_mem.dy));
    cudaErrorCheck(cudaDeviceReset());

}

I consistently get illegal memory accesses in the eval_spec_rates kernel, specifically cuda-memcheck points at line 50 (sp_rates[CU_LINEAR_OFFSET(2)] = …)

Further adding to my confusion, I’ve noticed if I add the following if statement enclosing the body of the eval_spec_rates routine:

if (threadIdx.x + blockIdx.x * blockDim.x < blockDim.x * gridDim.x) {
    sp_rates[CU_LINEAR_OFFSET(1)] = ...
    ...
}

the problem disappears. However, I’m under the impression that this if-statement can’t actually do anything - isn’t the unique thread id in the x direction (threadIdx.x + blockIdx.x * blockDim.x) bounded by the total number of threads in the cuda launch (blockDim.x * gridDim.x) by definition? Further, the problem only occurs while compiling with full optimizations, making debugging this issue more difficult.

I would greatly appreciate any help!

platform: x86_64, Intel(R) Xeon(R) CPU X5650  @ 2.67GHz

GPU: NVIDIA Corporation GF110GL [Tesla C2050 / C2075] (rev a1)

OS: Red Hat Enterprise Linux Server release 6.6 (Santiago)

CUDA Release: CUDA Driver Version / Runtime Version  6.5 / 6.5

CUDA Capability Major/Minor version number:    2.0

Compilation (for a single combined file, as edited above):

/usr/local/cuda/bin/nvcc -ccbin=/usr/bin -O3 -arch=sm_20 -m64 -lineinfo -I/usr/local/cuda/include/ -I/usr/local/cuda/samples/common/inc/ -dc -o obj/test.cu.o src/test.cu

/usr/local/cuda/bin/nvcc -ccbin=/usr/bin obj/test.cu.o -lm -L/usr/local/cuda/lib64 -lcuda -lcudart -lstdc++ -O3 -arch=sm_20 -m64 -lineinfo -dlink -o dlink.o

cc -Xlinker -rpath /usr/local/cuda/lib64 obj/test.cu.o dlink.o -lm -L/usr/local/cuda/lib64 -lcuda -lcudart -lstdc++ -O3 -std=c99 -mtune=native -o gputest

NVCC info:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2014 NVIDIA Corporation
Built on Thu_Jul_17_21:41:27_CDT_2014
Cuda compilation tools, release 6.5, V6.5.12

Driver info:

Driver Version: 340.65

Cross posted from: http://stackoverflow.com/questions/28613304/tracking-down-cuda-illegal-memory-access