CUDA Runtime API error 74: misaligned address

Hello,

I am working on shared memory and the program send me that error.
Here is my program:

////////////////////////////////////////////////////////////////////////////////////
    /// Local copy of the Image +size of the kernel and the kernel coordinate
    ///
    __shared__ u_int8_t LocalImage[(BLOCK_SIZE_X+kernelSize_X)*(BLOCK_SIZE_Y+kernelSize_Y)];
    __shared__ int16_t LocalKernel[kernelSize_X*kernelSize_Y];



    int x = blockIdx.x*blockDim.x;
    int y = blockIdx.y*blockDim.y;

    if( (blockIdx.x*blockDim.x +threadIdx.x)  >= w  || (blockIdx.y*blockDim.y +threadIdx.y)  >= h)
    {
        return;
    }

    for(int i=threadIdx.x;i<BLOCK_SIZE_X+kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<BLOCK_SIZE_Y+kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            LocalImage[i+j*(BLOCK_SIZE_X+kernelSize_X)] = (u8_ImageIn)[x+i + (y+j)*w];
        }
    }


    for(int i=threadIdx.x;i<kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            int ptrk =  i + j*kernelSize_X;
            LocalKernel[ptrk] = u8_Kernel[ptrk];
        }
    }


    syncthreads();


    ////////////////////////////////////////////////////////////

    int iMidX = (kernelSize_X-1)/2;
    int iMidY = (kernelSize_Y-1)/2;

    int xglobal = x+threadIdx.x;
    int yglobal = y+threadIdx.y;
    int xglobal_offset = x+threadIdx.x+iMidX;
    int yglobal_offset = y+threadIdx.y+iMidY;


    if( xglobal < iMidX ||  yglobal < iMidY)
    {
        // Change the output pixel to 0
        u8_ImageOut[xglobal+yglobal*w] =  0;
    }

    if( xglobal >= w-kernelSize_X/2 ||  yglobal >= h-kernelSize_Y/2)
    {
        // Change the output pixel to 0
        u8_ImageOut[xglobal+yglobal*w] =  0;
        return;
    }

    if( xglobal > w-kernelSize_X ||  yglobal > h-kernelSize_Y)
    {
        return;
    }


    int ptrl = threadIdx.x+iMidX + (threadIdx.y+iMidY) *(BLOCK_SIZE_X+kernelSize_X);
    if(LocalImage[ptrl] == 0)
    {
        u8_ImageOut[xglobal_offset+yglobal_offset*w] =  0;
        return;
    }



    u_int32_t valMin = 255;
    u_int32_t tmpAND;
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    int ptrl_Kernel = 0;//threadIdx.x + (threadIdx.y) *(kernelSize_X);

//        printf("ptrl_Image %d %d %d  \n",threadIdx.x,threadIdx.y,ptrl_Image);
    u_int32_t *ptLocalImage     = (u_int32_t *)(&LocalImage[ptrl_Image]);
    u_int32_t *ptLocalKernel    = (u_int32_t *)(&LocalImage[ptrl_Kernel]);



    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i+=4)
        {
            tmpAND = *ptLocalImage & *ptLocalKernel;
            valMin = min(valMin,tmpAND);
        }
        ptrl_Image+=(BLOCK_SIZE_X+kernelSize_X);
//        / ptLocal = (u_int32_t *)(&LocalImage[ptrl2]);
    }

The error seems to come from these two lines:

u_int32_t *ptLocalImage     = (u_int32_t *)(&LocalImage[ptrl_Image]);
    u_int32_t *ptLocalKernel    = (u_int32_t *)(&LocalImage[ptrl_Kernel]);

Can someone explain me why?

Because u_int8_t and int16_t arrays like LocalImage and LocalKernel are not usually aligned at 4 byte boundaries.

You can try forcing the appropriate alignment. But even then ptrl_Image and ptrl_Kernel must be multiples of 4 for this to work.

Christian

Thank you for the answer.

Even the shared memory had to be aligned? And what about the simple local array?
If ptLocalKernel is a local array should it be align?

Yes, all memory accesses need alignment, regardless of memory space.

Thank you.

So I modify my kernel like this:

__shared__ u_int8_t LocalImage[(BLOCK_SIZE_X+kernelSize_X)*(BLOCK_SIZE_Y+kernelSize_Y)];
    __shared__ u_int8_t LocalKernel[kernelSize_X][kernelSize_Y];


    int x = blockIdx.x*blockDim.x;
    int y = blockIdx.y*blockDim.y;

    if( (blockIdx.x*blockDim.x +threadIdx.x)  >= w  || (blockIdx.y*blockDim.y +threadIdx.y)  >= h)
    {
        return;
    }

    for(int i=threadIdx.x;i<BLOCK_SIZE_X+kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<BLOCK_SIZE_Y+kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            LocalImage[i+j*(BLOCK_SIZE_X+kernelSize_X)] = (u8_ImageIn)[x+i + (y+j)*w];
        }
    }


    for(int i=threadIdx.x;i<kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            int ptrk =  i + j*kernelSize_X;
            LocalKernel[i][j] = u8_Kernel[ptrk];
        }
    }


    syncthreads();


    ////////////////////////////////////////////////////////////

    int iMidX = (kernelSize_X-1)/2;
    int iMidY = (kernelSize_Y-1)/2;

    int xglobal = x+threadIdx.x;
    int yglobal = y+threadIdx.y;
    int xglobal_offset = x+threadIdx.x+iMidX;
    int yglobal_offset = y+threadIdx.y+iMidY;

    int ptrl = threadIdx.x+iMidX + (threadIdx.y+iMidY) *(BLOCK_SIZE_X+kernelSize_X);
    if(LocalImage[ptrl] == 255)
    {
        u8_ImageOut[xglobal_offset+yglobal_offset*w] =  125;
        return;
    }


    u_int8_t LocalALignedMemory[kernelSize_X][kernelSize_Y];
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    u_int8_t *ptLocalImage     = (u_int8_t *)(&LocalImage[ptrl_Image]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i++)
        {
            LocalALignedMemory[i][j]  = *ptLocalImage;
            ptLocalImage++;
        }
        ptrl_Image += (BLOCK_SIZE_X+kernelSize_X);
        ptLocalImage = (u_int8_t *)(&LocalImage[ptrl_Image]);
    }


    u_int32_t valMin = 0;
    u_int32_t tmpAND;
    u_int32_t tmpImage;
    u_int32_t tmpKernel;

    ptrl_Image  = 0;
    u_int32_t *pt32LocalAlignedImage     = (u_int32_t *)(&LocalALignedMemory[0][0]);
    u_int32_t *pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i+=4)
        {

            tmpImage    = *(u_int32_t *)(&LocalALignedMemory[i][j]);
            tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));
            tmpAND = tmpImage | tmpKernel;
            valMin = max(valMin,tmpAND);
        }
}

The problem send me the same misalign error on that line:

tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));

How could this be possible? I made the memory aligned.

You haven’t made it aligned. You attempted to “align” the i index, but based on C storage patterns you have to align the j index. Even that is problematic since there is no guarantee (AFAIK) that LocalKernel is aligned to a 32-bit boundary to begin with.

Anyway, if you are still getting the error, now or in the future, it means you are doing a misaligned access. If you get the error, the presence of the error indicates that you haven’t properly aligned things.

How can I properly align it according i and j?

I’m not going to rewrite your kernel for you.

One possible approach would be to start out with LocalKernel defined as a uint_32 type. Then, every time you write to it, write 32 bits. Every time you read from it, you are reading 32 bits.

Problem solved. Obviously this requires a bunch of changes to the rest of your kernel. I’m not going to try and identify what all that should be.

I’m sure there are other approaches as well.

It is not what I am asking. I just would like to know if there is a way to force a array to be align in cuda?

Try the accepted answer on stackoverflow. It defines an alignment macro that works in all compiler environments (on host compilers as well).

http://stackoverflow.com/questions/12778949/cuda-memory-alignment

Christian

You can force an array declaration to be aligned in CUDA, certainly. But if you then go generating byte-level indexing into the array, things can still break.

The issue here is not how to align an array. That’s trivial. The issue is aligned access.

What you need to do is ensure your access mechanisms (index generation) will create aligned indicies into that array.

That will require rewriting of your code, not just ensuring that the base data declaration is aligned.

As you adviced me I wrote à 32 bits version of m’y local copy:

u_int32_t LocalALignedMemory[kernelSize_X][kernelSize_Y];
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    u_int8_t *ptLocalImage     = (u_int8_t *)(&LocalImage[ptrl_Image]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        u_int8_t *u__localptr = (u_int8_t *)&(LocalALignedMemory[j][0]);
        for(int i=0;i<kernelSize_X;i++)
        {
//            (u_int8_t*)(&(LocalALignedMemory[0][j]))+i)  = *ptLocalImage;
            u__localptr[i] = *ptLocalImage;
            ptLocalImage++;
        }
        ptrl_Image += (BLOCK_SIZE_X+kernelSize_X);
        ptLocalImage = (u_int8_t *)(&LocalImage[ptrl_Image]);
    }


    u_int32_t valMin = 255;
    u_int32_t tmpAND;
    u_int32_t tmpImage;
    u_int32_t tmpKernel;

//    ptrl_Image  = 0;
//    u_int32_t *pt32LocalAlignedImage     = (u_int32_t *)(&LocalALignedMemory[0][0]);
//    u_int32_t *pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);


    for(int j=0;j<kernelSize_Y;j++)
    {
        int i = 0;
        int ip =4;
        for(;(i)<kernelSize_X;)
        {
            if(blockIdx.x == 0 && blockIdx.y ==0 &&threadIdx.x ==0 && threadIdx.y==0)
                printf("j %d i %d adress %d %d %d \n",j,i,(&LocalALignedMemory[j][i]),(&LocalALignedMemory[j][i])- (&LocalALignedMemory[j][i+1]),(&LocalALignedMemory[j+1][i])- (&LocalALignedMemory[j][i]));
            //tmpImage = *LocalImage <<24 | *(LocalImage+1)<<16| *(LocalImage+2)<<8| *(LocalImage+3);
            tmpImage    = *(u_int32_t *)(&LocalALignedMemory[i][j]);//*pt32LocalAlignedImage;
//            tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));//*pt32LocalKernel;//*ptLocalKernel <<24 | *(ptLocalKernel+1)<<16| *(ptLocalKernel+2)<<8| *(ptLocalKernel+3);
            tmpAND = tmpImage;
            valMin = min(valMin,tmpAND);
//            pt32LocalAlignedImage++;
//            pt32LocalKernel++;
            i+=4;
            ip+=4;
        }
//        pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);
//        pt32LocalAlignedImage = (u_int32_t *)(&LocalALignedMemory[0][0]);

//        i-=4;
        for(;(i+2)<kernelSize_X;i+=2)
        {
            tmpImage    = (0x00000000) | *(u_int16_t *)(&LocalALignedMemory[i][j]);//*pt32LocalAlignedImage;
            tmpAND = tmpImage ;
            valMin = min(valMin,tmpAND);
        }

//        i-=2;
        for(;i<kernelSize_X;i++)
        {
            tmpImage    = (0x00000000) | LocalALignedMemory[i][j];//*pt32LocalAlignedImage;
            tmpAND = tmpImage ;
            valMin = min(valMin,tmpAND);
        }

    }

There is no misa lignes error but the result is not correct.