Strange Warp Misaligned Address problem

I am getting a strange error when attempting to run on the GPU. I am using separate .cu files and compiling with the following flags:
nvcc --gpu-architecture=compute_20 --gpu-code=sm_20 -c --relocatable-device-code=true *.cu
nvcc --gpu-architecture=compute_20 --gpu-code=sm_20 --device-link *.o --output-file link.o
g++ -c *.cpp
g++ -o /home/ben/Executables/LCDPLD/GPULCDPLD *.o -L/usr/local/cuda-5.0/lib -lcuda -lcudart -lcurand

Any variables that you find undefined are in constant memory in cudafunctions.h.
The code seems to have a problem with attempting to write Tempdist2 back to global memory ggrid_r (found by compiling and commenting out lines of code)

#include "cudafunctions.h"

__global__ void GPULCD(float* ggrid_r, ATOMS* gExtraAtoms)
{
	int tidx = blockIdx.x*blockDim.x+threadIdx.x;
	int tidy = blockIdx.y*blockDim.y+threadIdx.y;
	int tidz = blockIdx.z*blockDim.z+threadIdx.z;
    int position = tidz*gGridPoints.x*gGridPoints.y+tidy*gGridPoints.x+tidx;
	int i;
	float posx,posy,posz;
    float TempX,TempY,TempZ;
float FracX,FracY,FracZ;
	float TempDist, TempDist2;
    if(tidx<gGridPoints.x && tidy<gGridPoints.y && tidz<gGridPoints.z)
    {
        posx=tidx*gGridSize.x/gGridPoints.x+gGridShift.x;
        posy=tidy*gGridSize.y/gGridPoints.y+gGridShift.y;
        posz=tidz*gGridSize.z/gGridPoints.z+gGridShift.z;
        TempDist2=5000;
        for (i=0; i<(gAtomSize-gExtraAtomSize); i++)
        {
            TempX=posx-gAtoms[i].CartPosition.x;
            TempY=posy-gAtoms[i].CartPosition.y;
            TempZ=posz-gAtoms[i].CartPosition.z;
FracX=gInverseUnitCellVectors.ax*TempX+gInverseUnitCellVectors.bx*TempY+gInverseUnitCellVectors.cx*TempZ;
FracY=gInverseUnitCellVectors.ay*TempX+gInverseUnitCellVectors.by*TempY+gInverseUnitCellVectors.cy*TempZ;            FracZ=gInverseUnitCellVectors.az*TempX+gInverseUnitCellVectors.bz*TempY+gInverseUnitCellVectors.cz*TempZ;
            FracX-=(float)rint(FracX);
            FracY-=(float)rint(FracY);
            FracZ-=(float)rint(FracZ);
            TempX=gUnitCellVectors.ax*FracX+gUnitCellVectors.bx*FracY+gUnitCellVectors.cx*FracZ;
            TempY=gUnitCellVectors.ay*FracX+gUnitCellVectors.by*FracY+gUnitCellVectors.cy*FracZ;
            TempZ=gUnitCellVectors.az*FracX+gUnitCellVectors.bz*FracY+gUnitCellVectors.cz*FracZ;
            TempDist=sqrt((pow(TempX,2)+pow(TempY,2)+pow(TempZ,2)))-gAtoms[i].Radius;
            if (TempDist2>TempDist)
                TempDist2=TempDist;
        }
        ggrid_r[position]=TempDist2;
	}
}

This is the error from cuda-gdb:
[Launch of CUDA Kernel 42 (GPULCD<<<(1,2,3),(8,8,8)>>>) on Device 0]

Program received signal CUDA_EXCEPTION_6, Warp Misaligned Address.
[Switching focus to CUDA kernel 42, grid 3, block (0,0,0), thread (0,4,1), device 0, sm 2, warp 3, lane 0]
0x000000000830d4a8 in GPULCD(float*, ATOMS*)<<<(1,2,3),(8,8,8)>>> ()

Hi chemnerd. Could you try running your application with cuda-memcheck ? Cuda-memcheck is a tool that can precisely detect misaligned and out of bounds memory accesses. You can do this by issuing

cuda-memcheck GPULCDPLD

. You can also enable cuda-memcheck inside cuda-gdb by issuing

set cuda memcheck on

before running the application.

Here are the results from running cuda-memcheck GPULCDPLD

Error at cudacode.cu:175
========= Misaligned Shared or Local Address
========= at 0x00000330 in GPULCD(float*, ATOMS*)
========= by thread (0,4,6) in block (0,0,1)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x34b) [0x55d0b]
========= Host Frame:/usr/local/cuda-5.0/lib/libcudart.so.5.0 [0x8f6a]

========= Program hit error 4 on CUDA API call to cudaDeviceSynchronize
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/libcuda.so [0x24e129]
========= Host Frame:/usr/local/cuda-5.0/lib/libcudart.so.5.0 (cudaDeviceSynchronize + 0x214) [0x27e24]

========= Program hit error 4 on CUDA API call to cudaMemcpy
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/libcuda.so [0x24e129]
========= Host Frame:/usr/local/cuda-5.0/lib/libcudart.so.5.0 (cudaMemcpy + 0x2bc) [0x3772c]
========= Host Frame:[0x5800000]

========= ERROR SUMMARY: 3 errors

Thanks for the help!

Doesn’t seem to like the way I am accessing the constant memory struct gUnitCellVectors: Given is the code for declaring it and what kind of struct it is:

__constant__ ATOMS gAtoms[1500];
__constant__ POINT gUnitCellSize;
__constant__ VECTOR3 gUnitCellVectors;
__constant__ VECTOR3 gInverseUnitCellVectors;
__constant__ POINT gGridSize;
__constant__ POINT gGridShift;
__constant__ INT3 gGridPoints;
__constant__ INT3 gNumberUnitCells;
__constant__ int gAtomSize;
__constant__ int gExtraAtomSize;

...

    ///////////////Copy the constants from memory to GPU memory for use throughout the code////////////////
    CUDA_CALL(cudaMemcpyToSymbol(gGridSize,&GlobalGrid.GridSize, sizeof(POINT)));                                   //
    CUDA_CALL(cudaMemcpyToSymbol(gGridShift,&GlobalGrid.GridShift, sizeof(POINT)));                                 //
    CUDA_CALL(cudaMemcpyToSymbol(gGridPoints,&GlobalGrid.NumberOfGridPoints,sizeof(INT3)));                         //
	CUDA_CALL(cudaMemcpyToSymbol(gAtomSize,&AtomSize,sizeof(int)));                                               //
	CUDA_CALL(cudaMemcpyToSymbol(gExtraAtomSize,&ExtraAtomSize,sizeof(int)));                                     //                                              //
    CUDA_CALL(cudaMemcpyToSymbol(gUnitCellSize,&CurrentFramework.UnitCellSize, sizeof(POINT)));                     //
    CUDA_CALL(cudaMemcpyToSymbol(gUnitCellVectors,&CurrentFramework.UnitCellVectors, sizeof(VECTOR3)));             //
    CUDA_CALL(cudaMemcpyToSymbol(gInverseUnitCellVectors,&CurrentFramework.InverseUnitCellVectors,sizeof(VECTOR3)));//
    CUDA_CALL(cudaMemcpyToSymbol(gNumberUnitCells,&CurrentFramework.NumberUnitCells,sizeof(INT3)));                 //
    CUDA_CALL(cudaMemcpyToSymbol(gAtoms,TempAtoms,sizeof(ATOMS)*ConstantAtomSize));                                 //
    CUDA_CALL(cudaMemcpyToSymbol(gSurfaceArea,&Zero,sizeof(float)));                                                  //
    ///////////////////////////////////////////////////////////////////////////////////////////////////////

The struct:

struct VECTOR3
{
    float ax;
    float ay;
    float az;
    float bx;
    float by;
    float bz;
    float cx;
    float cy;
    float cz;

    void PrintUnitCellSize()
    {
        cout<<"ax = "<<ax<<endl;
        cout<<"ay = "<<ay<<endl;
        cout<<"az = "<<az<<endl;
        cout<<"bx = "<<bx<<endl;
        cout<<"by = "<<by<<endl;
        cout<<"bz = "<<bz<<endl;
        cout<<"cx = "<<cx<<endl;
        cout<<"cy = "<<cy<<endl;
        cout<<"cz = "<<cz<<endl;
    }
};