Tearing when processing image with cuda kernel, not present when processing with cpu

Hello,

I’m doing a depacking operation on raw image data with my custom cuda kernel and I’m experiencing a sort of tearing.

I have some raw image, if I’m doing the operation with CPU

 for (int i = 0; i < width*height/16; i++)
    {

        int src_i = i*20;
        int dst_i = i*16*2;
        
       //sixteen pixels of two bytes each are in twenty bytes
        dst_buffer[dst_i] = (src_buffer[src_i] & 0b00111111) << 2 | src_buffer[src_i+16] & 0b00000011 ;
        dst_buffer[dst_i+1] = (src_buffer[src_i] &  0b11000000) >> 6;

        dst_buffer[dst_i+2] = (src_buffer[src_i+1] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b00001100) >> 2;
        dst_buffer[dst_i+3] = (src_buffer[src_i+1] &  0b11000000) >> 6;

        dst_buffer[dst_i+4] = (src_buffer[src_i+2] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b00110000) >> 4;
        dst_buffer[dst_i+5] = (src_buffer[src_i+2] &  0b11000000) >> 6;

        dst_buffer[dst_i+6] = (src_buffer[src_i+3] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b11000000) >> 6;
        dst_buffer[dst_i+7] = (src_buffer[src_i+3] &  0b11000000) >> 6;

        dst_buffer[dst_i+8] = (src_buffer[src_i+4] & 0b00111111) << 2 | src_buffer[src_i+17] & 0b00000011;
        dst_buffer[dst_i+9] = (src_buffer[src_i+4] &  0b11000000) >> 6;

        dst_buffer[dst_i+10] = (src_buffer[src_i+5] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b00001100) >> 2;
        dst_buffer[dst_i+11] = (src_buffer[src_i+5] &  0b11000000) >> 6;

        dst_buffer[dst_i+12] = (src_buffer[src_i+6] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b00110000) >> 4;
        dst_buffer[dst_i+13] = (src_buffer[src_i+6] &  0b11000000) >> 6;

        dst_buffer[dst_i+14] = (src_buffer[src_i+7] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b11000000) >> 6;
        dst_buffer[dst_i+15] = (src_buffer[src_i+7] &  0b11000000) >> 6;

        dst_buffer[dst_i+16] = (src_buffer[src_i+8] & 0b00111111) << 2 | src_buffer[src_i+18] & 0b00000011;
        dst_buffer[dst_i+17] = (src_buffer[src_i+8] &  0b11000000) >> 6;

        dst_buffer[dst_i+18] = (src_buffer[src_i+9] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b00001100) >> 2;
        dst_buffer[dst_i+19] = (src_buffer[src_i+9] &  0b11000000) >> 6;

        dst_buffer[dst_i+20] = (src_buffer[src_i+10] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b00110000) << 4;
        dst_buffer[dst_i+21] = (src_buffer[src_i+10] &  0b11000000) >> 6;

        dst_buffer[dst_i+22] = (src_buffer[src_i+11] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b11000000) >> 6;
        dst_buffer[dst_i+23] = (src_buffer[src_i+11] &  0b11000000) >> 6;

        dst_buffer[dst_i+24] = (src_buffer[src_i+12] & 0b00111111) << 2 | src_buffer[src_i+19] & 0b00000011;
        dst_buffer[dst_i+25] = (src_buffer[src_i+12] &  0b11000000) >> 6;

        dst_buffer[dst_i+26] = (src_buffer[src_i+13] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b00001100) >> 2;
        dst_buffer[dst_i+27] = (src_buffer[src_i+13] &  0b11000000) >> 6;

        dst_buffer[dst_i+28] = (src_buffer[src_i+14] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b00110000) >> 4;
        dst_buffer[dst_i+29] = (src_buffer[src_i+14] &  0b11000000) >> 6;

        dst_buffer[dst_i+30] = (src_buffer[src_i+15] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b11000000) >> 6;
        dst_buffer[dst_i+31] = (src_buffer[src_i+15] &  0b11000000) >> 6;
    }

I get the following, correct output:

but If I’m doing it with the GPU:


// Host wrapper for Depack10bitGrouping160Kernel: launches one thread per
// 20-byte group (16 pixels each) and blocks until the kernel has finished.
// Parameters:
//   src_buffer    - packed input, (width*height/16) groups * 20 bytes; must be
//                   device-accessible (e.g. cudaMalloc, or cudaMallocManaged
//                   with the default cudaMemAttachGlobal flag)
//   dst_buffer    - unpacked output, width*height * 2 bytes; device-accessible
//   width, height - image dimensions; width*height is assumed to be a
//                   multiple of 16 — TODO confirm with caller
// NOTE(review): launch/sync return codes are not checked here; callers that
// need diagnostics should inspect cudaGetLastError() after this returns.
void Depack10bitGrouping160(uint8_t* src_buffer, uint8_t* dst_buffer, int width, int height)
{
    int cycle_length = width * height / 16;  // number of 20-byte groups
    const int num_threads = 1024;
    // Ceiling division covers a non-divisible tail without always launching a
    // (mostly idle) extra block the way "cycle_length/num_threads + 1" did.
    int num_blocks = (cycle_length + num_threads - 1) / num_threads;

    Depack10bitGrouping160Kernel<<<num_blocks, num_threads, 0>>>(src_buffer, dst_buffer, cycle_length - 1);  // index_max = cycle_length-1

    cudaDeviceSynchronize();
}


// Depacks one 20-byte group (16 pixels of 10 bits each) per thread.
// Expected launch: 1-D grid/blocks supplying at least index_max+1 threads.
// Preconditions: src_buffer holds (index_max+1)*20 bytes, dst_buffer holds
// (index_max+1)*32 bytes, and both are device-accessible (e.g. cudaMalloc, or
// cudaMallocManaged with the default cudaMemAttachGlobal flag).
// Layout: group bytes 0..15 hold each pixel's upper 8 bits; bytes 16..19 hold
// the packed low 2-bit remainders (pixel j's remainder is in group byte
// 16 + j/4 at bit offset (j%4)*2).
__global__ void Depack10bitGrouping160Kernel (uint8_t* src_buffer, uint8_t* dst_buffer, int index_max)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index > index_max)   // tail guard: the grid may overshoot the data
        return;

    int src_i = index * 20;      // 20 packed source bytes per group
    int dst_i = index * 16 * 2;  // 16 pixels * 2 output bytes per group

    #pragma unroll
    for (int j = 0; j < 16; j++)
    {
        uint8_t upper8 = src_buffer[src_i + j];                   // pixel's top 8 bits
        uint8_t packed = src_buffer[src_i + 16 + j / 4];          // 4 pixels' low bits
        uint8_t low2   = (packed >> ((j % 4) * 2)) & 0b00000011;  // this pixel's low 2 bits

        // FIX: the original unrolled code used "<< 4" instead of ">> 4" for
        // pixel 10's remainder (the dst_i+20 store), an apparent typo that
        // zeroed those two bits; the uniform shift above restores the
        // intended pattern.
        dst_buffer[dst_i + 2 * j]     = (upper8 & 0b00111111) << 2 | low2;
        dst_buffer[dst_i + 2 * j + 1] = (upper8 & 0b11000000) >> 6;
    }
}

I get this tearing:

To me, this seems like some sort of synchronization or caching issue that I don’t understand. Could you please tell me what’s causing it?

Thank you,
xlucny

Full code:
SAMPLE_INV.zip (2.9 MB)
System:
NVIDIA Jetson Orin NX (16GB ram)
Jetpack 6.2, L4T 36.4.3
CUDA 12.6.68

Hi,

Could you check if the launched threads/blocks have covered all the data points?

Thanks.

It is sufficient.

However, if I add some extra statements to the end of the kernel, the result is suddenly OK:

        dst_buffer[dst_i+30] = (src_buffer[src_i+15] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b11000000) >> 6;
        dst_buffer[dst_i+31] = (src_buffer[src_i+15] &  0b11000000) >> 6;

 +       if (((dst_i/2) /2048)>1058)
 +       {
 +          int dummy = 0;
 +           printf("A\n");
 +       }

Do you know what would cause such a behaviour?

Hi,

We will try to reproduce this issue internally.
Will provide more info to you later.

Thanks.

1 Like

Hi,

Confirmed that we can reproduce the same issue in our environment.
We need to check this with our internal team and provide more info to you later.

Thanks.

Hi,

Please try the below modification:

diff --git a/main.cpp b/main.cpp
index 8ca4dea..371bd5a 100644
--- a/main.cpp
+++ b/main.cpp
@@ -21,10 +21,10 @@ int main()
 
     void* depacked_buffer, *raw_buffer;
 
-    ret = cudaMallocManaged (&raw_buffer, raw_buffer_length, cudaMemAttachHost);
+    ret = cudaMallocManaged (&raw_buffer, raw_buffer_length);//, cudaMemAttachHost);
     CHECK_CUDA_RET("Allocating unified memory to raw_buffer variable")
     
-    ret = cudaMallocManaged (&depacked_buffer, width*height*2, cudaMemAttachHost);
+    ret = cudaMallocManaged (&depacked_buffer, width*height*2);// cudaMemAttachHost);
     CHECK_CUDA_RET("Allocating unified memory to depacked_buffer variable")
 
     FILE *f;

Changing the cudaMallocManaged flag from cudaMemAttachHost to cudaMemAttachGlobal (the default value) fixes the issue.
You can find the corresponding doc for that flag below:
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gd228014f19cc0975ebe3e0dd2af6dd1b

Thanks.

I confirm that this solves the issue.

Is this an issue with the cudaMemAttachHost flag, or is it expected behaviour?

Hi,

Since the buffer is accessed by a CUDA kernel, you need to allocate it with cudaMemAttachGlobal so that it is accessible to the GPU; with cudaMemAttachHost, the allocation is initially accessible only from the host.

Thanks.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.