Memory errors when writing to local variable in kernel

Hi,

I’m trying to implement a path tracer in CUDA, and I’m seeing very strange memory errors in the CUDA debugger/memcheck. It’s a bidirectional path tracer, so there are lots of device functions, and the one causing the errors is the ray-triangle intersection. I’m using the vector functions defined in helper_math.h from the CUDA samples.
The errors are not always the same; here is the code along with some examples:

inline __device__ float intersect(const Triangle& triangle, const Ray& r, float& u, float& v, bool& face)
  {
    //Find vectors for two edges sharing A
    float3 e1 = triangle.B.position - triangle.A.position;
    float3 e2 = triangle.C.position - triangle.A.position;
    //Begin calculating determinant - also used to calculate u parameter
    float3 P = cross(r.d, e2);
    //if determinant is near zero, ray lies in plane of triangle or ray is parallel to plane of triangle
    float det = dot(e1, P);
    if (det > -EPSILON && det < EPSILON)
      return -1.0f;

    //calculate distance from A to ray origin
    float3 T = r.o - triangle.A.position;

    //Calculate u parameter and test bound
    u = dot(T, P) / det;
    //The intersection lies outside of the triangle
    if (u < EPSILON || u > 1.0f - EPSILON)
      return -1.0f;

    //Prepare to test v parameter
    float3 Q = cross(T, e1);

    //Calculate V parameter and test bound
    v = dot(r.d, Q) / det;
    //The intersection lies outside of the triangle
    if (v < EPSILON || u + v > 1.0f)
      return -1.0f;

    float t = dot(e2, Q) / det;

    if (t > EPSILON) { //ray intersection
      face = det > 0.0f;
      return t;
    }

    // No hit, no win
    return -1.0f;
  }

EXAMPLE #1:

This one happens in the function “float3 operator-(float3 a, float3 b)” which is called from intersect.

CUDA Memory Checker detected 32 threads caused an access violation:
Launch Parameters
CUcontext = 23f7553c750
CUstream = 23f77b8eb70
CUmodule = 23f0313dcd0
CUfunction = 23f0325dfa0
FunctionName = _ZN12CuBDPTKernel16evalContributionEy
GridId = 40
gridDim = {32,64,1}
blockDim = {8,4,1}
sharedSize = 256
Parameters:
surface = 6
Parameters (raw):
0x00000006 0x00000000
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source

23f7bfff158 4 adr ld g 0 0 {0,0,0} {0,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 1 {0,0,0} {1,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 2 {0,0,0} {2,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 3 {0,0,0} {3,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 4 {0,0,0} {4,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 5 {0,0,0} {5,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 6 {0,0,0} {6,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 7 {0,0,0} {7,0,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 8 {0,0,0} {0,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 9 {0,0,0} {1,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 10 {0,0,0} {2,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 11 {0,0,0} {3,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 12 {0,0,0} {4,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 13 {0,0,0} {5,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 14 {0,0,0} {6,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 15 {0,0,0} {7,1,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 16 {0,0,0} {0,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 17 {0,0,0} {1,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 18 {0,0,0} {2,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 19 {0,0,0} {3,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 20 {0,0,0} {4,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 21 {0,0,0} {5,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 22 {0,0,0} {6,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 23 {0,0,0} {7,2,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 24 {0,0,0} {0,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 25 {0,0,0} {1,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 26 {0,0,0} {2,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 27 {0,0,0} {3,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 28 {0,0,0} {4,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 29 {0,0,0} {5,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 30 {0,0,0} {6,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581
23f7bfff158 4 adr ld g 0 31 {0,0,0} {7,3,0} Zmi6float3S+000778 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:581

Summary of access violations:
c:\projects\ilmjru\gpugi\common\inc\helper_math.h(581): error MemoryChecker: #misaligned=0 #invalidAddress=32

Memory Checker detected 32 access violations.
error = access violation on load (global memory)
gridid = 40
blockIdx = {0,0,0}
threadIdx = {0,0,0}
address = 0x23f7bfff158
accessSize = 4

EXAMPLE #2:

This one is happening in shared memory, and I don’t even use shared memory, so I’m guessing that some local variables are placed there because there is no more register space. The violation happens in the function “void* memset(void *dest, int c, size_t n)”, which is not in my code; I don’t even know why it’s being called.

CUDA Memory Checker detected 31 threads caused an access violation:
Launch Parameters
CUcontext = 23f7553c750
CUstream = 23f77b8eb70
CUmodule = 23f0313dcd0
CUfunction = 23f0325dfa0
FunctionName = _ZN12CuBDPTKernel16evalContributionEy
GridId = 40
gridDim = {32,64,1}
blockDim = {8,4,1}
sharedSize = 256
Parameters:
surface = 6
Parameters (raw):
0x00000006 0x00000000
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source

23f00fff185 1 adr st s 18 1 {18,0,0} {1,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 2 {18,0,0} {2,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 3 {18,0,0} {3,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 4 {18,0,0} {4,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 5 {18,0,0} {5,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 6 {18,0,0} {6,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 7 {18,0,0} {7,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 8 {18,0,0} {0,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 9 {18,0,0} {1,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 10 {18,0,0} {2,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 11 {18,0,0} {3,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 12 {18,0,0} {4,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 13 {18,0,0} {5,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 14 {18,0,0} {6,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 15 {18,0,0} {7,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 16 {18,0,0} {0,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 17 {18,0,0} {1,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 18 {18,0,0} {2,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 19 {18,0,0} {3,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 20 {18,0,0} {4,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 21 {18,0,0} {5,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 22 {18,0,0} {6,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 23 {18,0,0} {7,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 24 {18,0,0} {0,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 25 {18,0,0} {1,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 26 {18,0,0} {2,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 27 {18,0,0} {3,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 28 {18,0,0} {4,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 29 {18,0,0} {5,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 30 {18,0,0} {6,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f00fff185 1 adr st s 18 31 {18,0,0} {7,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422

Summary of access violations:
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp(422): error MemoryChecker: #misaligned=0 #invalidAddress=32

Memory Checker detected 31 access violations.
error = access violation on store (shared memory)
gridid = 40
blockIdx = {18,0,0}
threadIdx = {1,0,0}
address = 0x23f00fff185
accessSize = 1

EXAMPLE #3:

Similar to the first example. It’s happening in the function “float dot(float3 a, float3 b)”, which is called from intersect. Again, it’s shared memory, which I don’t use.

CUDA Memory Checker detected 31 threads caused an access violation:
Launch Parameters
CUcontext = 23f7553c750
CUstream = 23f77b8eb70
CUmodule = 23f0313dcd0
CUfunction = 23f0325dfa0
FunctionName = _ZN12CuBDPTKernel16evalContributionEy
GridId = 40
gridDim = {32,64,1}
blockDim = {8,4,1}
sharedSize = 256
Parameters:
surface = 6
Parameters (raw):
0x00000006 0x00000000
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source

23f00fff160 4 adr st s 38 1 {6,1,0} {1,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 2 {6,1,0} {2,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 3 {6,1,0} {3,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 4 {6,1,0} {4,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 5 {6,1,0} {5,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 6 {6,1,0} {6,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 7 {6,1,0} {7,0,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 8 {6,1,0} {0,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 9 {6,1,0} {1,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 10 {6,1,0} {2,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 11 {6,1,0} {3,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 12 {6,1,0} {4,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 13 {6,1,0} {5,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 14 {6,1,0} {6,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 15 {6,1,0} {7,1,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 16 {6,1,0} {0,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 17 {6,1,0} {1,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 18 {6,1,0} {2,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 19 {6,1,0} {3,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 20 {6,1,0} {4,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 21 {6,1,0} {5,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 22 {6,1,0} {6,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 23 {6,1,0} {7,2,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 24 {6,1,0} {0,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 25 {6,1,0} {1,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 26 {6,1,0} {2,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 27 {6,1,0} {3,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 28 {6,1,0} {4,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 29 {6,1,0} {5,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 30 {6,1,0} {6,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248
23f00fff160 4 adr st s 38 31 {6,1,0} {7,3,0} Z3dot6float3S+000470 c:\projects\ilmjru\gpugi\common\inc\helper_math.h:1248

Summary of access violations:
c:\projects\ilmjru\gpugi\common\inc\helper_math.h(1248): error MemoryChecker: #misaligned=0 #invalidAddress=32

Memory Checker detected 31 access violations.
error = access violation on store (shared memory)
gridid = 40
blockIdx = {6,1,0}
threadIdx = {1,0,0}
address = 0x23f00fff160
accessSize = 4

EXAMPLE #4:

This one is a misaligned store in global memory, caused by the memset function mentioned above.

CUDA Memory Checker detected 32 threads caused an access violation:
Launch Parameters
CUcontext = 23f7553c750
CUstream = 23f77b8eb70
CUmodule = 23f0313dcd0
CUfunction = 23f0325dfa0
FunctionName = _ZN12CuBDPTKernel16evalContributionEy
GridId = 40
gridDim = {32,64,1}
blockDim = {8,4,1}
sharedSize = 256
Parameters:
surface = 6
Parameters (raw):
0x00000006 0x00000000
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source

23f7bfff181 1 mis st g 48 0 {16,1,0} {0,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 1 {16,1,0} {1,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 2 {16,1,0} {2,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 3 {16,1,0} {3,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 4 {16,1,0} {4,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 5 {16,1,0} {5,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 6 {16,1,0} {6,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 7 {16,1,0} {7,0,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 8 {16,1,0} {0,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 9 {16,1,0} {1,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 10 {16,1,0} {2,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 11 {16,1,0} {3,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 12 {16,1,0} {4,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 13 {16,1,0} {5,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 14 {16,1,0} {6,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 15 {16,1,0} {7,1,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 16 {16,1,0} {0,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 17 {16,1,0} {1,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 18 {16,1,0} {2,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 19 {16,1,0} {3,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 20 {16,1,0} {4,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 21 {16,1,0} {5,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 22 {16,1,0} {6,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 23 {16,1,0} {7,2,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 24 {16,1,0} {0,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 25 {16,1,0} {1,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 26 {16,1,0} {2,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 27 {16,1,0} {3,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 28 {16,1,0} {4,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 29 {16,1,0} {5,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 30 {16,1,0} {6,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422
23f7bfff181 1 mis st g 48 31 {16,1,0} {7,3,0} _ZN10CuGeometry9intersectERK8TriangleRK3RayRfS6_Rb+000d30 c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp:422

Summary of access violations:
c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp(422): error MemoryChecker: #misaligned=32 #invalidAddress=0

Memory Checker detected 32 access violations.
error = misaligned store (global memory)
gridid = 40
blockIdx = {16,1,0}
threadIdx = {0,0,0}
address = 0x23f7bfff181
accessSize = 1

There are a bunch of other variants, but it’s all happening in the intersect function, always with local variables. The structs I use (Triangle, etc.) are all defined with 16-byte alignment (“alignas(16)”).
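For reference, they look roughly like this (a reconstructed sketch; the real structs have a few more members):

struct alignas(16) Vertex
{
  float3 position;
  float3 normal;
  float2 texcoord;
};

struct alignas(16) Triangle
{
  Vertex A, B, C;
  // ... plus texture, emission, refl, etc.
};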

This is a school project and the deadline is very close, so I’ve been working on this non-stop for two days now, but I have no idea why this happens. Could you guys help me out, please?

Thanks in advance,
David

It is exceedingly likely that the root cause is outside the function shown, and that the actual failure occurs in this code because ‘intersect’ is being passed bad pointers to data. So I would suggest looking “upstream” of this code (that is, higher up in the call chain).

I’m debugging the code, and all parameters passed to intersect have valid values.

Also, the only thing passed that’s not a local variable somewhere upstream is the Triangle instance, but the debugger shows that it has a valid value, just like the other parameters.

Work backwards from the source code line that triggers the out-of-bounds memory access. You didn’t show where that is in the code you posted, or did I overlook it?

It’s happening at different lines all the time.

Examples:

float3 P = cross(r.d, e2);
float det = dot(e1, P);
float3 e1 = triangle.B.position - triangle.A.position;

Here’s the function calling intersect:

__device__ CuKDNode* nodes;
  __device__ size_t lastNodeIdx;
  __device__  Triangle* triangles;

  inline __device__ Fragment hitNode(unsigned int idx, const Ray& ray)
  {
    if (idx > lastNodeIdx) {
      return NO_HIT;
    }
    const CuKDNode& node = nodes[idx];
    if (!node.valid) {
      return NO_HIT;
    }
    if (CuGeometry::hitBBox(node.from, node.to, ray)) {
      if (node.leaf) {
        float t = FLOAT_MAX, u, v, tempT, tempU, tempV;
        bool face, tempFace;
        Triangle* triangle{ nullptr };
        Triangle tri;
        for (unsigned int i = node.begin; i < node.end; ++i) {
          tri = triangles[i];
          tempT = CuGeometry::intersect(tri, ray, tempU, tempV, tempFace);
          if (tempT > 0.0f && tempT < t) {
            t = tempT;
            u = tempU;
            v = tempV;
            face = tempFace;
            triangle = &triangles[i];
          }
        }
        if (!triangle) {
          return NO_HIT;
        }
        return Fragment{ t, 
          u * triangle->B.position + v * triangle->C.position + (1.0f - u - v) * triangle->A.position,
          normalize(u * triangle->B.normal + v * triangle->C.normal + (1.0f - u - v) * triangle->A.normal),
          u * triangle->B.texcoord + v * triangle->C.texcoord + (1.0f - u - v) * triangle->A.texcoord,
          face, 
          triangle->texture,
          triangle->emission,
          CuGeometry::area(*triangle),
          triangle->refl };
      }
      else {
        Fragment leftFragment = hitNode(2 * idx + 1, ray);
        Fragment rightFragment = hitNode(2 * idx + 2, ray);
        return leftFragment.t < rightFragment.t ? leftFragment : rightFragment;
      }
    }
    return NO_HIT;
  }

It seems like you are passing in invalid pointers or you’re accessing them in a bad way.

Also, it looks like your function is impure in general. I don’t know your code, but seeing an impure version of an intersection routine feels very, very off.

It does seem like I’m passing an invalid pointer, and that’s bugging me, because (as you can see in the code above) every parameter passed to the intersect function is local to the function calling it (hitNode).

Is it still possible that they point to a wrong address?

Here is the code that allocates the arrays in global memory:

class CuKDTree {
[...]

  void uploadToDevice() {
    checkCudaErrors(cudaMalloc(&m_nodesPtr, m_nodes.size() * sizeof(CuKDNode)));
    checkCudaErrors(cudaMemcpy(m_nodesPtr, m_nodes.data(), m_nodes.size() * sizeof(CuKDNode), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpyToSymbol(CuKDTreeDevice::nodes, &m_nodesPtr, sizeof(CuKDNode*), 0, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc(&m_trianglesPtr, m_triangles.size() * sizeof(Triangle)));
    checkCudaErrors(cudaMemcpy(m_trianglesPtr, m_triangles.data(), m_triangles.size() * sizeof(Triangle), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpyToSymbol(CuKDTreeDevice::triangles, &m_trianglesPtr, sizeof(Triangle*), 0, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpyToSymbol(CuKDTreeDevice::lastNodeIdx, &m_lastNodeIdx, sizeof(m_lastNodeIdx), 0, cudaMemcpyHostToDevice));
  }

private:
  std::vector<CuKDNode> m_nodes;
  std::vector<Triangle> m_triangles;
  CuKDNode* m_nodesPtr{ nullptr };
  size_t m_lastNodeIdx;
  Triangle* m_trianglesPtr{ nullptr };
};

Is there anything wrong with it?

My best guess is that this

tri = triangles[i];

is picking up out-of-bounds data. Have you checked the loop’s iteration bounds against the allocation size?
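A quick way to check that is a device-side assert right before the read. A sketch (the triangleCount symbol is hypothetical; you would upload it alongside the array):

#include <assert.h>  // device-side assert is supported on compute capability 2.0+

__device__ size_t triangleCount;  // hypothetical: set via cudaMemcpyToSymbol at upload time

// inside the leaf loop of hitNode:
for (unsigned int i = node.begin; i < node.end; ++i) {
  assert(i < triangleCount);  // traps in the debugger if the node indexes past the array
  tri = triangles[i];
  // ...
}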

Use more Thrust. Write more pure functions. Prefer thrust::host_vector over std::vector because Thrust can give you pinned host memory for faster GPU transfers.
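For example (a sketch: host_vector actually uses the standard allocator by default, so pinning has to be requested explicitly; the experimental pinned allocator below ships with CUDA 8-era Thrust):

#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

// Page-locked host staging buffer; transfers to/from it can be faster and asynchronous.
typedef thrust::system::cuda::experimental::pinned_allocator<float4> PinnedAlloc;
thrust::host_vector<float4, PinnedAlloc> staging(1024);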

I’m stepping over the loop that iterates over the triangles, and I noticed something very strange:
the value of “tri” only changes on every second iteration, specifically when the index “i” is odd.
I can’t think of any explanation for this.

“Have you checked the loop’s iteration bounds against the allocation size?”

Sorry, but can you please explain what you mean by that?

Sorry, please disregard my last comment. I was only checking the value of the first vertex of the triangle, and that changes on every second iteration because neighboring triangles share the first vertex.

OK, I removed all the global device pointers and put everything in thrust::device_vectors. I get the device pointers from these and pass them down through every device function that uses them (so they are pure functions, if we don’t count the constant globals).
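Roughly like this (a sketch; traceKernel and its parameter list are just illustrative):

__global__ void traceKernel(const CuKDNode* nodes, size_t lastNodeIdx,
                            const Triangle* triangles, size_t triangleCount)
{
  // ... same hitNode/intersect logic, but using the pointers passed in ...
}

// host side:
thrust::device_vector<CuKDNode> d_nodes(m_nodes);
thrust::device_vector<Triangle> d_triangles(m_triangles);
traceKernel<<<grid, block>>>(thrust::raw_pointer_cast(d_nodes.data()),
                             m_lastNodeIdx,
                             thrust::raw_pointer_cast(d_triangles.data()),
                             d_triangles.size());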

Unfortunately the issue still remains. :-(

So, one thing that might help here is to start writing tests. Another thing that somewhat concerns me is the global device pointers. There’s a shocking amount of mutation and reliance on global state in your code, it seems, and things are breaking. This isn’t me ranting about the boons of functional programming, but I will say this: break your code down into small, testable chunks. Cut as much reliance on global variables as possible. Avoid unnecessary mutation like the plague.

device_vectors also won’t solve access problems. The only problem they solve is memory management itself (freeing, copying, that kind of thing).

Again, write small, composable functions that you can unit test; then you can test how all of these functions integrate, and so on.

Note: global constants are usually fine, as long as they aren’t managed resources like heap allocations. If you have a global int as a config value, that’s fine. If you have a global pointer? That’s kind of a red flag, imo.

Are you suggesting that I should write unit tests that run on the GPU?

The code is tested on the CPU side (I use the same code there and it works fine).

Also, if I should avoid pointers that point to arrays in global memory, where should I store my stuff, and how should my functions access it?

By stuff I mainly mean triangles and KD-tree nodes. I could try hacking them into surface objects somehow, which does not seem like a good idea, but nothing else comes to mind. One way or another, all the threads have to read these concurrently.

I think I know what you mean. Even though the code works on the CPU, I have to test on the GPU, so tomorrow I’ll start writing global functions that test the path-tracing functions separately.

Thanks for the help, people!
Any more help would be much appreciated until this is solved. :-)

I used to use device-side asserts, but then I got some better advice on GPU testing from the ArrayFire team. If you’re running something on the GPU, you expect some sort of after-effect, and that is what you test from the host side. If a GPU routine should sum two vectors, run the kernel, then do a device-to-host copy and check that the result is what you expect.
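A minimal sketch of that idea (the addVectors kernel is hypothetical, plain runtime API):

#include <cassert>
#include <vector>
#include <cuda_runtime.h>

__global__ void addVectors(const float* a, const float* b, float* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = a[i] + b[i];
}

void testAddVectors()
{
  const int n = 256;
  std::vector<float> a(n, 1.0f), b(n, 2.0f), out(n);
  float *dA, *dB, *dOut;
  cudaMalloc(&dA, n * sizeof(float));
  cudaMalloc(&dB, n * sizeof(float));
  cudaMalloc(&dOut, n * sizeof(float));
  cudaMemcpy(dA, a.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, b.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  addVectors<<<(n + 127) / 128, 128>>>(dA, dB, dOut, n);
  // the after-effect: copy the result back and verify it on the host
  cudaMemcpy(out.data(), dOut, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) assert(out[i] == 3.0f);
  cudaFree(dA); cudaFree(dB); cudaFree(dOut);
}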

If you have host and device routines that don’t depend on pointers, you can just test the host versions and that’s fine. Otherwise, you’ll have to do what I mentioned above.

As for pointers, I would recommend that you think about the lifetimes of your allocations and what your intentions are. How long something should live, and what should happen when it dies, is a cornerstone of C++. So whenever I hear the phrase “global pointer”, all I can think is, “Oh Jesus Christ, we haven’t worked out what the lifetimes are supposed to look like.”

This is not an absolute, and you’re free to ignore me, but I will say this: I’ve found that code which is easy to test and well-tested is usually a lot better than code that is neither.

As for testing, I used to use simple assert statements, but now I’ve switched to a framework called Catch, which is a nice header-only library.
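For example (Catch v1.x is a single header; intersectHost and the two make* helpers are hypothetical host-side mirrors of your device code):

#define CATCH_CONFIG_MAIN  // ask Catch to generate main() (do this in exactly one .cpp)
#include "catch.hpp"

TEST_CASE("ray through the centroid hits the triangle", "[geometry]")
{
  float u, v;
  bool face;
  float t = intersectHost(makeUnitTriangle(), makeRayThroughCentroid(), u, v, face);
  REQUIRE(t > 0.0f);
  REQUIRE(u + v <= 1.0f);
}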

Also, buzzwords and evangelism aside, CUDA benefits from a lot of the ideas that functional programming advocates like to promote.

For one, cutting reliance on global state. This is just good design advice in general; minimizing reliance on globals is a good rule of thumb.

Purity is another. If you can, const-qualify freaking everything under the sun. If you can avoid mutating a reference, do so! Prefer returning actual values instead of mutating in-place. This is not useful for pointers (obviously) but this kind of design will simplify other things in your codebase.
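Applied to the intersect routine above, that could look like this (a sketch with a hypothetical Hit struct; same math, but the result comes back by value instead of through reference parameters):

struct Hit { float t; float u; float v; bool face; };  // t < 0.0f means "no hit"

inline __device__ Hit intersect(const Triangle& triangle, const Ray& r)
{
  const Hit miss{ -1.0f, 0.0f, 0.0f, false };
  float3 e1 = triangle.B.position - triangle.A.position;
  float3 e2 = triangle.C.position - triangle.A.position;
  float3 P = cross(r.d, e2);
  float det = dot(e1, P);
  if (det > -EPSILON && det < EPSILON) return miss;
  float3 T = r.o - triangle.A.position;
  float u = dot(T, P) / det;
  if (u < EPSILON || u > 1.0f - EPSILON) return miss;
  float3 Q = cross(T, e1);
  float v = dot(r.d, Q) / det;
  if (v < EPSILON || u + v > 1.0f) return miss;
  float t = dot(e2, Q) / det;
  return t > EPSILON ? Hit{ t, u, v, det > 0.0f } : miss;
}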

Higher-order functions are another. A higher-order function is a function that takes other functions as arguments. The Thrust library uses this extensively in its transform, reduce, etc. algorithms. This is a design pattern that fits the GPU well.
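For instance (a sketch; Scale is just an example functor):

#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct Scale
{
  float s;
  __host__ __device__ float operator()(float x) const { return s * x; }
};

void doubleAll()
{
  thrust::device_vector<float> in(1024, 1.0f), out(1024);
  // transform is the higher-order function; the functor is the function argument
  thrust::transform(in.begin(), in.end(), out.begin(), Scale{ 2.0f });
}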

I write geometric code as well. It’s very simple to write small, pure functions that deal with things like point-tetrahedron intersection or point-circumsphere tests. By modularizing and testing these things in isolation, I have to worry a lot less when I’m coding up my main triangulation routine.

None of what I’m saying should be taken as an absolute, though; treat it as a goal. Do what you need to do, but try to avoid patterns that can potentially produce incorrect code.

Update:

I still haven’t managed to fix the issue, but I wrote a small app that reproduces the same thing for me, not just with my struct but with the built-in float4 as well:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "helper_functions.h"    // includes cuda.h and cuda_runtime_api.h
#include "helper_cuda.h"         // helper functions for CUDA error check

#include <iostream>
#include <random>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#define COUNT1 20000
#define COUNT2 10000
#define WIDTH   1024
#define HEIGHT  1024
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

float randF()
{
  static std::default_random_engine generator;
  static std::uniform_real_distribution<float> distr;
  return distr(generator);
}

__device__ float4* float4s;
__device__ float4* float4scopy;

__global__ void test1()
{
  for (int i = 0; i < COUNT1; ++i) {
    float4scopy[i] = float4s[i];
  }
}

struct __align__(16) Vertex
{
  float3 position;
  float3 normal;
  float2 texcoord;
};

__device__ Vertex* triangleVertices;
__device__ Vertex* triangleVerticesCopy;

__global__ void test2()
{
  for (int i = 0; i < COUNT2; ++i) {
    triangleVerticesCopy[i] = triangleVertices[i];
  }
}

int main()
{
  std::cout << "sizeof(Vertex): " << sizeof(Vertex) << std::endl;

  checkCudaErrors(cudaSetDevice(gpuGetMaxGflopsDeviceId()));

  {
    thrust::host_vector<float4> float4sHost;
    for (int i = 0; i < COUNT1; ++i) {
      float4sHost.push_back(float4{ randF(), randF(), randF(), randF() });
    }
    thrust::device_vector<float4> float4sDevice(float4sHost);
    float4* float4sDevPtr = float4sDevice.data().get();
    checkCudaErrors(cudaMemcpyToSymbol(float4s, &float4sDevPtr, sizeof(float4sDevPtr), 0, cudaMemcpyHostToDevice));

    thrust::device_vector<float4> float4sDeviceCopy;
    float4sDeviceCopy.resize(COUNT1);
    float4* float4sCopyDevPtr = float4sDeviceCopy.data().get();
    checkCudaErrors(cudaMemcpyToSymbol(float4scopy, &float4sCopyDevPtr, sizeof(float4sCopyDevPtr), 0, cudaMemcpyHostToDevice));


    dim3 block{ BLOCK_DIM_X, BLOCK_DIM_Y, 1 };
    dim3 grid{ WIDTH / BLOCK_DIM_X, HEIGHT / BLOCK_DIM_Y, 1 };

    test1<<<grid, block>>>();

    checkCudaErrors(cudaStreamSynchronize(0));
  }

  {
    thrust::host_vector<Vertex> verticesHost;
    for (int i = 0; i < COUNT2; ++i) {
      verticesHost.push_back(
        Vertex{
        float3{ randF(), randF(), randF() },
        float3{ randF(), randF(), randF() },
        float2{ randF(), randF() }
      });
    }
    thrust::device_vector<Vertex> verticesDevice(verticesHost);
    Vertex* verticesDevPtr = verticesDevice.data().get();
    checkCudaErrors(cudaMemcpyToSymbol(triangleVertices, &verticesDevPtr, sizeof(verticesDevPtr), 0, cudaMemcpyHostToDevice));

    thrust::device_vector<Vertex> verticesDeviceCopy;
    verticesDeviceCopy.resize(COUNT2);
    Vertex* verticesCopyDevPtr = verticesDeviceCopy.data().get();
    checkCudaErrors(cudaMemcpyToSymbol(triangleVerticesCopy, &verticesCopyDevPtr, sizeof(verticesCopyDevPtr), 0, cudaMemcpyHostToDevice));


    dim3 block{ BLOCK_DIM_X, BLOCK_DIM_Y, 1 };
    dim3 grid{ WIDTH / BLOCK_DIM_X, HEIGHT / BLOCK_DIM_Y, 1 };

    test2<<<grid, block>>>();

    checkCudaErrors(cudaStreamSynchronize(0));
  }

  checkCudaErrors(cudaDeviceReset());
  return 0;
}

I’m starting to think that there’s a bug in the driver or my GPU is faulty.
I have an MSI GTX 1080 Gaming X; maybe the driver is still a bit buggy for Pascal GPUs. Does anyone know anything about that?

I ran your code on a Titan X Pascal, on Ubuntu 14.04.1, with CUDA 8.0.44 and driver 367.44.

cuda-memcheck reported no errors.

With the Windows memory checker (or cuda-memcheck), your kernels will run quite a bit slower; that is normal/expected.

For long-running kernels on Windows, the usual questions around the WDDM TDR mechanism apply. For this particular code, in my case, the kernel execution time goes from less than 1 second to over 20 seconds. On Windows, that would be enough to trigger the TDR mechanism.

Normally when the TDR mechanism triggers, you get some visible indication, such as a screen flash/repaint and a message in the system tray about the display driver having stopped responding…

You might also want to update to the latest r375-branch driver for that GPU. I am assuming you are using CUDA 8.0.