I’ve been trying to track down the source of a CUDA illegal memory access. I’ve cut down my code considerably to the simplest compilable example where I consistently get get illegal memory access errors:
In my main cuda file I allocate memory for a number of CUDA pointers and copy them to a constant struct, which is used to hold pointers for global memory (instead of having to pass them in as parameters to kernels; I have many more arrays not included in this example):
#include <stdio.h>
/** Include CUDA libraries. */
#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#define NSP 53
#define NN 54
#define FWD_RATES 325
#define REV_RATES 309
#define PRES_MOD_RATES 41
#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
struct gpuMemory {
double* dy;
double* fwd_rates;
double* rev_rates;
double* pres_mod;
};
void initialize_pointer(double** ptr, int size) {
cudaErrorCheck( cudaMalloc((void**)ptr, size * sizeof(double)) );
cudaErrorCheck( cudaMemset(*ptr, 1, size * sizeof(double)) );
}
__constant__ gpuMemory memory_pointers;
#define CU_LINEAR_OFFSET(I) (threadIdx.x + blockIdx.x * blockDim.x + (I) * blockDim.x * gridDim.x)
__device__ void eval_spec_rates (const double * fwd_rates, const double * rev_rates, const double * pres_mod, double * sp_rates) {
sp_rates[CU_LINEAR_OFFSET(1)] = -(fwd_rates[CU_LINEAR_OFFSET(2)] - rev_rates[CU_LINEAR_OFFSET(2)])
+ (fwd_rates[CU_LINEAR_OFFSET(7)] - rev_rates[CU_LINEAR_OFFSET(7)])
+ (fwd_rates[CU_LINEAR_OFFSET(38)] - rev_rates[CU_LINEAR_OFFSET(38)]) * pres_mod[CU_LINEAR_OFFSET(4)]
+ (fwd_rates[CU_LINEAR_OFFSET(39)] - rev_rates[CU_LINEAR_OFFSET(39)])
+ (fwd_rates[CU_LINEAR_OFFSET(40)] - rev_rates[CU_LINEAR_OFFSET(40)])
+ (fwd_rates[CU_LINEAR_OFFSET(41)] - rev_rates[CU_LINEAR_OFFSET(41)])
+ (fwd_rates[CU_LINEAR_OFFSET(44)] - rev_rates[CU_LINEAR_OFFSET(44)])
+ (fwd_rates[CU_LINEAR_OFFSET(46)] - rev_rates[CU_LINEAR_OFFSET(46)])
+ (fwd_rates[CU_LINEAR_OFFSET(48)] - rev_rates[CU_LINEAR_OFFSET(48)])
+ (fwd_rates[CU_LINEAR_OFFSET(50)] - rev_rates[CU_LINEAR_OFFSET(50)])
+ (fwd_rates[CU_LINEAR_OFFSET(52)] - rev_rates[CU_LINEAR_OFFSET(52)])
+ (fwd_rates[CU_LINEAR_OFFSET(54)] - rev_rates[CU_LINEAR_OFFSET(54)])
+ (fwd_rates[CU_LINEAR_OFFSET(57)] - rev_rates[CU_LINEAR_OFFSET(57)])
+ (fwd_rates[CU_LINEAR_OFFSET(59)] - rev_rates[CU_LINEAR_OFFSET(59)])
+ (fwd_rates[CU_LINEAR_OFFSET(64)] - rev_rates[CU_LINEAR_OFFSET(64)])
+ (fwd_rates[CU_LINEAR_OFFSET(67)] - rev_rates[CU_LINEAR_OFFSET(67)])
+ (fwd_rates[CU_LINEAR_OFFSET(68)] - rev_rates[CU_LINEAR_OFFSET(68)])
+ (fwd_rates[CU_LINEAR_OFFSET(72)] - rev_rates[CU_LINEAR_OFFSET(72)])
+ (fwd_rates[CU_LINEAR_OFFSET(74)] - rev_rates[CU_LINEAR_OFFSET(74)])
+ (fwd_rates[CU_LINEAR_OFFSET(76)] - rev_rates[CU_LINEAR_OFFSET(76)])
+ (fwd_rates[CU_LINEAR_OFFSET(77)] - rev_rates[CU_LINEAR_OFFSET(77)])
+ (fwd_rates[CU_LINEAR_OFFSET(79)] - rev_rates[CU_LINEAR_OFFSET(79)])
- (fwd_rates[CU_LINEAR_OFFSET(82)] - rev_rates[CU_LINEAR_OFFSET(82)]) * pres_mod[CU_LINEAR_OFFSET(18)]
- (fwd_rates[CU_LINEAR_OFFSET(83)] - rev_rates[CU_LINEAR_OFFSET(83)])
- (fwd_rates[CU_LINEAR_OFFSET(125)] - rev_rates[CU_LINEAR_OFFSET(125)])
- (fwd_rates[CU_LINEAR_OFFSET(135)] - rev_rates[CU_LINEAR_OFFSET(134)])
+ (fwd_rates[CU_LINEAR_OFFSET(136)] - rev_rates[CU_LINEAR_OFFSET(135)])
- (fwd_rates[CU_LINEAR_OFFSET(145)] - rev_rates[CU_LINEAR_OFFSET(144)])
- (fwd_rates[CU_LINEAR_OFFSET(171)] - rev_rates[CU_LINEAR_OFFSET(170)])
+ (fwd_rates[CU_LINEAR_OFFSET(173)] - rev_rates[CU_LINEAR_OFFSET(172)]) * pres_mod[CU_LINEAR_OFFSET(26)]
+ (fwd_rates[CU_LINEAR_OFFSET(190)] - rev_rates[CU_LINEAR_OFFSET(189)])
+ (fwd_rates[CU_LINEAR_OFFSET(196)] - rev_rates[CU_LINEAR_OFFSET(195)])
+ (fwd_rates[CU_LINEAR_OFFSET(201)] - rev_rates[CU_LINEAR_OFFSET(200)])
+ (fwd_rates[CU_LINEAR_OFFSET(208)] - rev_rates[CU_LINEAR_OFFSET(207)])
+ (fwd_rates[CU_LINEAR_OFFSET(213)] - rev_rates[CU_LINEAR_OFFSET(212)])
- (fwd_rates[CU_LINEAR_OFFSET(220)] - rev_rates[CU_LINEAR_OFFSET(219)])
+ (fwd_rates[CU_LINEAR_OFFSET(265)] - rev_rates[CU_LINEAR_OFFSET(264)])
+ (fwd_rates[CU_LINEAR_OFFSET(275)] - rev_rates[CU_LINEAR_OFFSET(274)])
+ (fwd_rates[CU_LINEAR_OFFSET(276)] - rev_rates[CU_LINEAR_OFFSET(275)])
+ fwd_rates[CU_LINEAR_OFFSET(283)] + fwd_rates[CU_LINEAR_OFFSET(287)]
- (fwd_rates[CU_LINEAR_OFFSET(288)] - rev_rates[CU_LINEAR_OFFSET(285)]) * pres_mod[CU_LINEAR_OFFSET(36)]
+ fwd_rates[CU_LINEAR_OFFSET(292)] + (fwd_rates[CU_LINEAR_OFFSET(298)] - rev_rates[CU_LINEAR_OFFSET(290)])
+ fwd_rates[CU_LINEAR_OFFSET(299)] + (fwd_rates[CU_LINEAR_OFFSET(308)] - rev_rates[CU_LINEAR_OFFSET(293)])
+ (fwd_rates[CU_LINEAR_OFFSET(313)] - rev_rates[CU_LINEAR_OFFSET(298)])
;
sp_rates[CU_LINEAR_OFFSET(2)] = -(fwd_rates[CU_LINEAR_OFFSET(1)] - rev_rates[CU_LINEAR_OFFSET(1)]) * pres_mod[CU_LINEAR_OFFSET(1)]
+ (fwd_rates[CU_LINEAR_OFFSET(2)] - rev_rates[CU_LINEAR_OFFSET(2)])
+ (fwd_rates[CU_LINEAR_OFFSET(5)] - rev_rates[CU_LINEAR_OFFSET(5)])
+ (fwd_rates[CU_LINEAR_OFFSET(6)] - rev_rates[CU_LINEAR_OFFSET(6)])
+ (fwd_rates[CU_LINEAR_OFFSET(8)] - rev_rates[CU_LINEAR_OFFSET(8)])
+ (fwd_rates[CU_LINEAR_OFFSET(9)] - rev_rates[CU_LINEAR_OFFSET(9)])
+ (fwd_rates[CU_LINEAR_OFFSET(13)] - rev_rates[CU_LINEAR_OFFSET(13)])
+ (fwd_rates[CU_LINEAR_OFFSET(20)] - rev_rates[CU_LINEAR_OFFSET(20)])
+ (fwd_rates[CU_LINEAR_OFFSET(23)] - rev_rates[CU_LINEAR_OFFSET(23)])
+ (fwd_rates[CU_LINEAR_OFFSET(27)] - rev_rates[CU_LINEAR_OFFSET(27)])
- (fwd_rates[CU_LINEAR_OFFSET(32)] - rev_rates[CU_LINEAR_OFFSET(32)]) * pres_mod[CU_LINEAR_OFFSET(3)]
- (fwd_rates[CU_LINEAR_OFFSET(33)] - rev_rates[CU_LINEAR_OFFSET(33)])
- (fwd_rates[CU_LINEAR_OFFSET(34)] - rev_rates[CU_LINEAR_OFFSET(34)])
- (fwd_rates[CU_LINEAR_OFFSET(35)] - rev_rates[CU_LINEAR_OFFSET(35)])
- (fwd_rates[CU_LINEAR_OFFSET(36)] - rev_rates[CU_LINEAR_OFFSET(36)])
- (fwd_rates[CU_LINEAR_OFFSET(37)] - rev_rates[CU_LINEAR_OFFSET(37)])
- 2.0 * (fwd_rates[CU_LINEAR_OFFSET(38)] - rev_rates[CU_LINEAR_OFFSET(38)]) * pres_mod[CU_LINEAR_OFFSET(4)]
- 2.0 * (fwd_rates[CU_LINEAR_OFFSET(39)] - rev_rates[CU_LINEAR_OFFSET(39)])
- 2.0 * (fwd_rates[CU_LINEAR_OFFSET(40)] - rev_rates[CU_LINEAR_OFFSET(40)])
- 2.0 * (fwd_rates[CU_LINEAR_OFFSET(41)] - rev_rates[CU_LINEAR_OFFSET(41)])
- (fwd_rates[CU_LINEAR_OFFSET(42)] - rev_rates[CU_LINEAR_OFFSET(42)]) * pres_mod[CU_LINEAR_OFFSET(5)]
- (fwd_rates[CU_LINEAR_OFFSET(43)] - rev_rates[CU_LINEAR_OFFSET(43)])
- (fwd_rates[CU_LINEAR_OFFSET(44)] - rev_rates[CU_LINEAR_OFFSET(44)])
- (fwd_rates[CU_LINEAR_OFFSET(45)] - rev_rates[CU_LINEAR_OFFSET(45)])
- (fwd_rates[CU_LINEAR_OFFSET(46)] - rev_rates[CU_LINEAR_OFFSET(46)])
- (fwd_rates[CU_LINEAR_OFFSET(47)] - rev_rates[CU_LINEAR_OFFSET(47)])
- (fwd_rates[CU_LINEAR_OFFSET(48)] - rev_rates[CU_LINEAR_OFFSET(48)])
- (fwd_rates[CU_LINEAR_OFFSET(49)] - rev_rates[CU_LINEAR_OFFSET(49)]) * pres_mod[CU_LINEAR_OFFSET(6)]
- (fwd_rates[CU_LINEAR_OFFSET(50)] - rev_rates[CU_LINEAR_OFFSET(50)])
- (fwd_rates[CU_LINEAR_OFFSET(51)] - rev_rates[CU_LINEAR_OFFSET(51)]) * pres_mod[CU_LINEAR_OFFSET(7)]
- (fwd_rates[CU_LINEAR_OFFSET(52)] - rev_rates[CU_LINEAR_OFFSET(52)])
- (fwd_rates[CU_LINEAR_OFFSET(53)] - rev_rates[CU_LINEAR_OFFSET(53)]) * pres_mod[CU_LINEAR_OFFSET(8)]
- (fwd_rates[CU_LINEAR_OFFSET(54)] - rev_rates[CU_LINEAR_OFFSET(54)])
;
}
__global__ void k_eval_spec_rates()
{
eval_spec_rates(memory_pointers.fwd_rates, memory_pointers.rev_rates, memory_pointers.pres_mod, memory_pointers.dy);
}
int main (int argc, char *argv[]) {
int grid = 8;
int block = 128;
cudaErrorCheck( cudaSetDevice(0) );
cudaErrorCheck( cudaPeekAtLastError() );
cudaErrorCheck( cudaDeviceSynchronize() );
int padded = grid * block;
gpuMemory host_mem;
initialize_pointer(&host_mem.fwd_rates, FWD_RATES * padded);
initialize_pointer(&host_mem.rev_rates, REV_RATES * padded);
initialize_pointer(&host_mem.pres_mod, PRES_MOD_RATES * padded);
initialize_pointer(&host_mem.dy, NN * padded);
cudaErrorCheck(cudaMemcpyToSymbol(memory_pointers, &host_mem, sizeof(gpuMemory)));
k_eval_spec_rates<<<grid, block>>>();
cudaErrorCheck( cudaPeekAtLastError() );
cudaErrorCheck( cudaDeviceSynchronize() );
cudaErrorCheck(cudaFree(host_mem.fwd_rates));
cudaErrorCheck(cudaFree(host_mem.rev_rates));
cudaErrorCheck(cudaFree(host_mem.pres_mod));
cudaErrorCheck(cudaFree(host_mem.dy));
cudaErrorCheck(cudaDeviceReset());
}
I consistently get illegal memory accesses in the eval_spec_rates kernel, specifically cuda-memcheck points at line 50 (sp_rates[CU_LINEAR_OFFSET(2)] = …)
Further adding to my confusion, I’ve noticed if I add the following if statement enclosing the body of the eval_spec_rates routine:
if (threadIdx.x + blockIdx.x * blockDim.x < blockDim.x * gridDim.x) {
sp_rates[CU_LINEAR_OFFSET(1)] = ...
...
}
the problem disappears. However, I’m under the impression that this if-statement can’t actually do anything - isn’t the unique thread id in the x direction (threadIdx.x + blockIdx.x * blockDim.x) bounded by the total number of threads in the cuda launch (blockDim.x * gridDim.x) by definition? Further, the problem only occurs while compiling with full optimizations, making debugging this issue more difficult.
I would greatly appreciate any help!
platform: x86_64, Intel(R) Xeon(R) CPU X5650 @ 2.67GHz
GPU: NVIDIA Corporation GF110GL [Tesla C2050 / C2075] (rev a1)
OS: Red Hat Enterprise Linux Server release 6.6 (Santiago)
CUDA Release: CUDA Driver Version / Runtime Version 6.5 / 6.5
CUDA Capability Major/Minor version number: 2.0
Compilation (for a single combined file, as edited above):
/usr/local/cuda/bin/nvcc -ccbin=/usr/bin -O3 -arch=sm_20 -m64 -lineinfo -I/usr/local/cuda/include/ -I/usr/local/cuda/samples/common/inc/ -dc -o obj/test.cu.o src/test.cu
/usr/local/cuda/bin/nvcc -ccbin=/usr/bin obj/test.cu.o -lm -L/usr/local/cuda/lib64 -lcuda -lcudart -lstdc++ -O3 -arch=sm_20 -m64 -lineinfo -dlink -o dlink.o
cc -Xlinker -rpath /usr/local/cuda/lib64 obj/test.cu.o dlink.o -lm -L/usr/local/cuda/lib64 -lcuda -lcudart -lstdc++ -O3 -std=c99 -mtune=native -o gputest
NVCC info:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2014 NVIDIA Corporation
Built on Thu_Jul_17_21:41:27_CDT_2014
Cuda compilation tools, release 6.5, V6.5.12
Driver info:
Driver Version: 340.65