Nested Unions - Warp Out-Of-Bound-Access

Dear All,

I am stuck with a weird problem (does not happen if I execute the same code on CPU):

When executing the following code, I get the following error in GDB:

"in xGenome::CopyFromGlobal (this=0xffffffff01000000, __xThreadInfo={data = {x = 0, y = 0, z = 0, w = 0}}, __g_ucGenomes=0x200400000 “”)

at kernel.cu:100"

(I marked line 100 in the code).

I am worried that the members of xGenome (which uses nested unions) are somehow not initialised in memory.

#include <stdio.h>

// Feature switch; it is not referenced anywhere in the code shown here.
#define m_fit_SAFE_MEMORY_MAPPING

// Number of bytes held by one xGenome (array size and copy-loop bound).
#define mAlignedByteLengthGenome 8

// Divisor used to split a flat thread id into bank id and warp id; also the
// number of xGenome slots in an xGenomeSet.
#define mWarpSize 32

// Factor applied to threadIdx.y when flattening the thread id; also used by
// xThreadInfo::GlobId. Presumably must match the launch's blockDim.x - TODO confirm.
#define m_fit_THREAD_DIM_X 32

// Second factor of the threads-per-block product in xThreadInfo::GlobId.
// Presumably must match the launch's blockDim.y - TODO confirm.
#define m_fit_THREAD_DIM_Y 1

// Factor applied to blockIdx.y when flattening the block id.
// Presumably must match the launch's gridDim.x - TODO confirm.
#define m_fit_BLOCK_DIM_X 1

// Per-thread index bundle, computed once in the constructor and passed by
// value to the copy routines.
struct xThreadInfo {
        // Packed indices as filled in by the constructor:
        //   x = bank id, y = warp id, z = flat thread id, w = flat block id.
        ushort4 data;

        // Fills `data`; see the out-of-line definition for how each field is derived.
        __device__ xThreadInfo(unsigned short __usThreadIdX,
                        unsigned short __usThreadIdY,
                        unsigned short __usBlockIdX,
                        unsigned short __usBlockIdY);

        // Accessors; their definitions are not part of the code shown here.
        __device__ unsigned short WarpId();
        __device__ unsigned short BankId();
        __device__ unsigned short FlatThreadId();
        __device__ unsigned short FlatBlockId();

        // Offset of this thread's element for elements of __usTypeLength bytes.
        __device__ unsigned short GlobId(unsigned short __usTypeLength);

        // Debug hook; its definition is not part of the code shown here.
        __device__ void __DEBUG_CALL();
};

// Derives all four indices from the built-in threadIdx / blockIdx variables.
// NOTE(review): the four parameters are accepted but never read here.
__device__ xThreadInfo::xThreadInfo(unsigned short __usThreadIdX,
                unsigned short __usThreadIdY, unsigned short __usBlockIdX,
                unsigned short __usBlockIdY) {
        // Flat thread id inside the block (row-major over x, then y).
        const unsigned short flatThread = threadIdx.y * m_fit_THREAD_DIM_X + threadIdx.x;
        // Position inside the warp-sized group ("bank id" in this code).
        const unsigned short bank = flatThread % mWarpSize;

        data.z = flatThread;
        data.x = bank;
        // Index of the warp-sized group the thread belongs to.
        data.y = (flatThread - bank) / mWarpSize;
        // Flat block id inside the grid.
        data.w = blockIdx.y * m_fit_BLOCK_DIM_X + blockIdx.x;
}

// Returns (flat block id * threads per block + flat thread id) * __usTypeLength,
// i.e. the offset of this thread's element when each element occupies
// __usTypeLength units.
// NOTE(review): the result is narrowed to unsigned short by the return type,
// so any offset of 65536 or more wraps around.
__device__ unsigned short xThreadInfo::GlobId(unsigned short __usTypeLength) {
        const int threadsPerBlock = m_fit_THREAD_DIM_X * m_fit_THREAD_DIM_Y;
        const int globalThread = data.w * threadsPerBlock + data.z;
        return globalThread * __usTypeLength;
}

// One genome: a fixed-size run of raw bytes.
struct xGenome {
        // Storage wrapped in a union; only the flat byte view is declared here.
        union {
                unsigned char one_d[mAlignedByteLengthGenome];
        } data;

        // Fills `data` from the byte buffer __g_ucGenomeSet, at the offset that
        // __xThreadInfo yields for elements of sizeof(xGenome) bytes.
        // (The stray `__device__` that used to follow this declaration qualified
        // nothing and has been removed.)
        __device__
        void CopyFromGlobal(xThreadInfo __xThreadInfo,
                        unsigned char *__g_ucGenomeSet);
};

// A fixed-size collection of mWarpSize genomes.
struct xGenomeSet {
        // Storage wrapped in a union; only the array-of-genomes view is declared here.
        union {
                xGenome multi_d[mWarpSize];
        } data;

        // Forwards to xGenome::CopyFromGlobal on the slot selected by
        // __xThreadInfo.WarpId().
        // (`__device__` was previously written twice on this declaration; one
        // qualifier is sufficient.)
        __device__
        void CopyFromGlobal(xThreadInfo __xThreadInfo,
                        unsigned char *__g_ucGenomeSet);
};

// Picks the genome slot indexed by the caller's WarpId() and lets that slot
// copy its bytes out of __g_ucGenomes.
// NOTE(review): WarpId() is not defined in the visible code. If it returns the
// warp index computed in the constructor, every thread of a warp selects the
// same slot and writes it concurrently - confirm this is intended.
__device__ void xGenomeSet::CopyFromGlobal(xThreadInfo __xThreadInfo,
                unsigned char *__g_ucGenomes) {
        xGenome &slot = data.multi_d[__xThreadInfo.WarpId()];
        slot.CopyFromGlobal(__xThreadInfo, __g_ucGenomes);
}

// Copies mAlignedByteLengthGenome bytes from __g_ucGenomes into this genome.
// The source offset is __xThreadInfo.GlobId(sizeof(xGenome)), so each thread
// reads the genome-sized slice that belongs to its global thread index.
// No bounds check is performed on __g_ucGenomes; the caller must supply a
// buffer that covers the computed offset.
__device__ void xGenome::CopyFromGlobal(xThreadInfo __xThreadInfo,
                unsigned char *__g_ucGenomes) {
        // The offset does not depend on the loop index, so compute it once
        // instead of once per copied byte.
        const unsigned short base = __xThreadInfo.GlobId(sizeof(xGenome));

        for (int i = 0; i < mAlignedByteLengthGenome; i += 1) { //LINE 100!!!
                this->data.one_d[i] = __g_ucGenomes[base + i];
        }
}

This is the executed kernel:

// Kernel under test: every thread builds its xThreadInfo and asks the block's
// shared xGenomeSet to load bytes from `dest`.
// `dest` is only read here, and nothing is written back to global memory.
// The index math assumes the launch dimensions match the m_fit_* macros.
__global__ void multiply_them(unsigned char *dest) //This is the kernel we execute
{
    // One genome set per block, living in shared memory.
    __shared__ xGenomeSet blockGenomes;

    // Index bundle for the calling thread.
    xThreadInfo self(threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y);

    blockGenomes.CopyFromGlobal(self, dest);
}

So that seems to mean that `this` is not a valid memory address, but I do not understand why that is.

I am very frustrated at the moment as my code contains a lot of nested unions.

Please help.

Thanks and regards

Christian

OK, I found what the problem was:
printf must not be called from `__device__` functions (only from `__global__` ones).

The error message generated is completely misleading - seems to be a common problem with cuda-gdb.
Regards
Christian