CUDA Runtime API error 74: misaligned address

xav12358 · November 10, 2016, 9:47am

Hello,

I am working on shared memory and the program send me that error.
Here is my program:

////////////////////////////////////////////////////////////////////////////////////
    /// Local copy of the Image +size of the kernel and the kernel coordinate
    ///
    __shared__ u_int8_t LocalImage[(BLOCK_SIZE_X+kernelSize_X)*(BLOCK_SIZE_Y+kernelSize_Y)];
    __shared__ int16_t LocalKernel[kernelSize_X*kernelSize_Y];



    int x = blockIdx.x*blockDim.x;
    int y = blockIdx.y*blockDim.y;

    if( (blockIdx.x*blockDim.x +threadIdx.x)  >= w  || (blockIdx.y*blockDim.y +threadIdx.y)  >= h)
    {
        return;
    }

    for(int i=threadIdx.x;i<BLOCK_SIZE_X+kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<BLOCK_SIZE_Y+kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            LocalImage[i+j*(BLOCK_SIZE_X+kernelSize_X)] = (u8_ImageIn)[x+i + (y+j)*w];
        }
    }


    for(int i=threadIdx.x;i<kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            int ptrk =  i + j*kernelSize_X;
            LocalKernel[ptrk] = u8_Kernel[ptrk];
        }
    }


    syncthreads();


    ////////////////////////////////////////////////////////////

    int iMidX = (kernelSize_X-1)/2;
    int iMidY = (kernelSize_Y-1)/2;

    int xglobal = x+threadIdx.x;
    int yglobal = y+threadIdx.y;
    int xglobal_offset = x+threadIdx.x+iMidX;
    int yglobal_offset = y+threadIdx.y+iMidY;


    if( xglobal < iMidX ||  yglobal < iMidY)
    {
        // Change the output pixel to 0
        u8_ImageOut[xglobal+yglobal*w] =  0;
    }

    if( xglobal >= w-kernelSize_X/2 ||  yglobal >= h-kernelSize_Y/2)
    {
        // Change the output pixel to 0
        u8_ImageOut[xglobal+yglobal*w] =  0;
        return;
    }

    if( xglobal > w-kernelSize_X ||  yglobal > h-kernelSize_Y)
    {
        return;
    }


    int ptrl = threadIdx.x+iMidX + (threadIdx.y+iMidY) *(BLOCK_SIZE_X+kernelSize_X);
    if(LocalImage[ptrl] == 0)
    {
        u8_ImageOut[xglobal_offset+yglobal_offset*w] =  0;
        return;
    }



    u_int32_t valMin = 255;
    u_int32_t tmpAND;
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    int ptrl_Kernel = 0;//threadIdx.x + (threadIdx.y) *(kernelSize_X);

//        printf("ptrl_Image %d %d %d  \n",threadIdx.x,threadIdx.y,ptrl_Image);
    u_int32_t *ptLocalImage     = (u_int32_t *)(&LocalImage[ptrl_Image]);
    u_int32_t *ptLocalKernel    = (u_int32_t *)(&LocalImage[ptrl_Kernel]);



    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i+=4)
        {
            tmpAND = *ptLocalImage & *ptLocalKernel;
            valMin = min(valMin,tmpAND);
        }
        ptrl_Image+=(BLOCK_SIZE_X+kernelSize_X);
//        / ptLocal = (u_int32_t *)(&LocalImage[ptrl2]);
    }

The error seems to come from these two lines:

u_int32_t *ptLocalImage     = (u_int32_t *)(&LocalImage[ptrl_Image]);
    u_int32_t *ptLocalKernel    = (u_int32_t *)(&LocalImage[ptrl_Kernel]);

Can someone explain me why?

cbuchner1 · November 10, 2016, 10:07am

Because u_int8_t and int16_t arrays like LocalImage and LocalKernel are not usually aligned at 4 byte boundaries.

You can try forcing the appropriate alignment. But even then ptrl_Image and ptrl_Kernel must be multiples of 4 for this to work.

Christian

xav12358 · November 10, 2016, 10:14am

Thank you for the answer.

Even the shared memory had to be aligned? And what about the simple local array?
If ptLocalKernel is a local array should it be align?

tera · November 10, 2016, 12:12pm

Yes, all memory accesses need alignment, regardless of memory space.

xav12358 · November 10, 2016, 1:25pm

Thank you.

So I modify my kernel like this:

__shared__ u_int8_t LocalImage[(BLOCK_SIZE_X+kernelSize_X)*(BLOCK_SIZE_Y+kernelSize_Y)];
    __shared__ u_int8_t LocalKernel[kernelSize_X][kernelSize_Y];


    int x = blockIdx.x*blockDim.x;
    int y = blockIdx.y*blockDim.y;

    if( (blockIdx.x*blockDim.x +threadIdx.x)  >= w  || (blockIdx.y*blockDim.y +threadIdx.y)  >= h)
    {
        return;
    }

    for(int i=threadIdx.x;i<BLOCK_SIZE_X+kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<BLOCK_SIZE_Y+kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            LocalImage[i+j*(BLOCK_SIZE_X+kernelSize_X)] = (u8_ImageIn)[x+i + (y+j)*w];
        }
    }


    for(int i=threadIdx.x;i<kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            int ptrk =  i + j*kernelSize_X;
            LocalKernel[i][j] = u8_Kernel[ptrk];
        }
    }


    syncthreads();


    ////////////////////////////////////////////////////////////

    int iMidX = (kernelSize_X-1)/2;
    int iMidY = (kernelSize_Y-1)/2;

    int xglobal = x+threadIdx.x;
    int yglobal = y+threadIdx.y;
    int xglobal_offset = x+threadIdx.x+iMidX;
    int yglobal_offset = y+threadIdx.y+iMidY;

    int ptrl = threadIdx.x+iMidX + (threadIdx.y+iMidY) *(BLOCK_SIZE_X+kernelSize_X);
    if(LocalImage[ptrl] == 255)
    {
        u8_ImageOut[xglobal_offset+yglobal_offset*w] =  125;
        return;
    }


    u_int8_t LocalALignedMemory[kernelSize_X][kernelSize_Y];
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    u_int8_t *ptLocalImage     = (u_int8_t *)(&LocalImage[ptrl_Image]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i++)
        {
            LocalALignedMemory[i][j]  = *ptLocalImage;
            ptLocalImage++;
        }
        ptrl_Image += (BLOCK_SIZE_X+kernelSize_X);
        ptLocalImage = (u_int8_t *)(&LocalImage[ptrl_Image]);
    }


    u_int32_t valMin = 0;
    u_int32_t tmpAND;
    u_int32_t tmpImage;
    u_int32_t tmpKernel;

    ptrl_Image  = 0;
    u_int32_t *pt32LocalAlignedImage     = (u_int32_t *)(&LocalALignedMemory[0][0]);
    u_int32_t *pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i+=4)
        {

            tmpImage    = *(u_int32_t *)(&LocalALignedMemory[i][j]);
            tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));
            tmpAND = tmpImage | tmpKernel;
            valMin = max(valMin,tmpAND);
        }
}

The problem send me the same misalign error on that line:

tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));

How could this be possible? I made the memory aligned.

Robert_Crovella · November 10, 2016, 2:51pm

You haven’t made it aligned. You attempted to “align” the i index, but based on C storage patterns you have to align the j index. Even that is problematic since there is no guarantee (AFAIK) that LocalKernel is aligned to a 32-bit boundary to begin with.

Anyway, if you are still getting the error, now or in the future, it means you are doing a misaligned access. If you get the error, the presence of the error indicates that you haven’t properly aligned things.

xav12358 · November 10, 2016, 3:23pm

How can I properly align it according i and j?

Robert_Crovella · November 10, 2016, 3:32pm

I’m not going to rewrite your kernel for you.

One possible approach would be to start out with LocalKernel defined as a uint_32 type. Then, every time you write to it, write 32 bits. Every time you read from it, you are reading 32 bits.

Problem solved. Obviously this requires a bunch of changes to the rest of your kernel. I’m not going to try and identify what all that should be.

I’m sure there are other approaches as well.

xav12358 · November 10, 2016, 4:11pm

It is not what I am asking. I just would like to know if there is a way to force a array to be align in cuda?

cbuchner1 · November 10, 2016, 4:32pm

Try the accepted answer on stackoverflow. It defines an alignment macro that works in all compiler environments (on host compilers as well).

http://stackoverflow.com/questions/12778949/cuda-memory-alignment

Christian

Robert_Crovella · November 10, 2016, 5:37pm

You can force an array declaration to be aligned in CUDA, certainly. But if you then go generating byte-level indexing into the array, things can still break.

The issue here is not how to align an array. That’s trivial. The issue is aligned access.

What you need to do is ensure your access mechanisms (index generation) will create aligned indicies into that array.

That will require rewriting of your code, not just ensuring that the base data declaration is aligned.

xav12358 · November 10, 2016, 6:05pm

As you adviced me I wrote à 32 bits version of m’y local copy:

u_int32_t LocalALignedMemory[kernelSize_X][kernelSize_Y];
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    u_int8_t *ptLocalImage     = (u_int8_t *)(&LocalImage[ptrl_Image]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        u_int8_t *u__localptr = (u_int8_t *)&(LocalALignedMemory[j][0]);
        for(int i=0;i<kernelSize_X;i++)
        {
//            (u_int8_t*)(&(LocalALignedMemory[0][j]))+i)  = *ptLocalImage;
            u__localptr[i] = *ptLocalImage;
            ptLocalImage++;
        }
        ptrl_Image += (BLOCK_SIZE_X+kernelSize_X);
        ptLocalImage = (u_int8_t *)(&LocalImage[ptrl_Image]);
    }


    u_int32_t valMin = 255;
    u_int32_t tmpAND;
    u_int32_t tmpImage;
    u_int32_t tmpKernel;

//    ptrl_Image  = 0;
//    u_int32_t *pt32LocalAlignedImage     = (u_int32_t *)(&LocalALignedMemory[0][0]);
//    u_int32_t *pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);


    for(int j=0;j<kernelSize_Y;j++)
    {
        int i = 0;
        int ip =4;
        for(;(i)<kernelSize_X;)
        {
            if(blockIdx.x == 0 && blockIdx.y ==0 &&threadIdx.x ==0 && threadIdx.y==0)
                printf("j %d i %d adress %d %d %d \n",j,i,(&LocalALignedMemory[j][i]),(&LocalALignedMemory[j][i])- (&LocalALignedMemory[j][i+1]),(&LocalALignedMemory[j+1][i])- (&LocalALignedMemory[j][i]));
            //tmpImage = *LocalImage <<24 | *(LocalImage+1)<<16| *(LocalImage+2)<<8| *(LocalImage+3);
            tmpImage    = *(u_int32_t *)(&LocalALignedMemory[i][j]);//*pt32LocalAlignedImage;
//            tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));//*pt32LocalKernel;//*ptLocalKernel <<24 | *(ptLocalKernel+1)<<16| *(ptLocalKernel+2)<<8| *(ptLocalKernel+3);
            tmpAND = tmpImage;
            valMin = min(valMin,tmpAND);
//            pt32LocalAlignedImage++;
//            pt32LocalKernel++;
            i+=4;
            ip+=4;
        }
//        pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);
//        pt32LocalAlignedImage = (u_int32_t *)(&LocalALignedMemory[0][0]);

//        i-=4;
        for(;(i+2)<kernelSize_X;i+=2)
        {
            tmpImage    = (0x00000000) | *(u_int16_t *)(&LocalALignedMemory[i][j]);//*pt32LocalAlignedImage;
            tmpAND = tmpImage ;
            valMin = min(valMin,tmpAND);
        }

//        i-=2;
        for(;i<kernelSize_X;i++)
        {
            tmpImage    = (0x00000000) | LocalALignedMemory[i][j];//*pt32LocalAlignedImage;
            tmpAND = tmpImage ;
            valMin = min(valMin,tmpAND);
        }

    }

There is no misa lignes error but the result is not correct.

Topic		Replies	Views
how to solve error of misaligned address? CUDA Programming and Performance	1	18640	October 11, 2018
Misaligned address of union CUDA Programming and Performance	12	2029	October 16, 2020
When occur these errors in CUDA CUDA Programming and Performance	2	491	September 25, 2018
shared memory alignment CUDA Programming and Performance	1	4748	May 17, 2010
Error: Unaligned memory accesses not supported aligned memory accesses CUDA Programming and Performance	2	3672	January 11, 2010
Help on Error: Unaligned Memory access not supported CUDA Programming and Performance	1	2652	December 26, 2009
Memcheck reports misaligned atomic at address divisible by 0x1000 !!?? CUDA-MEMCHECK	0	1409	January 13, 2017
misalignment of shared memory? CUDA Programming and Performance	3	4196	January 25, 2013
What is a Warp Misaligned Address error? kernel debugging CUDA Programming and Performance	1	22282	March 3, 2011
CL_INVALID_COMMAND_QUEUE error due to local memory byte alignment CUDA Programming and Performance	13	2360	July 29, 2021

CUDA Runtime API error 74: misaligned address

Related topics