Hello,
I’m doing a depacking operation on raw image data with my custom cuda kernel and I’m experiencing a sort of tearing.
I have some raw image, if I’m doing the operation with CPU
for (int i = 0; i < width*height/16; i++)
{
int src_i = i*20;
int dst_i = i*16*2;
//sixteen pixels of two bytes each are in twenty bytes
dst_buffer[dst_i] = (src_buffer[src_i] & 0b00111111) << 2 | src_buffer[src_i+16] & 0b00000011 ;
dst_buffer[dst_i+1] = (src_buffer[src_i] & 0b11000000) >> 6;
dst_buffer[dst_i+2] = (src_buffer[src_i+1] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b00001100) >> 2;
dst_buffer[dst_i+3] = (src_buffer[src_i+1] & 0b11000000) >> 6;
dst_buffer[dst_i+4] = (src_buffer[src_i+2] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b00110000) >> 4;
dst_buffer[dst_i+5] = (src_buffer[src_i+2] & 0b11000000) >> 6;
dst_buffer[dst_i+6] = (src_buffer[src_i+3] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b11000000) >> 6;
dst_buffer[dst_i+7] = (src_buffer[src_i+3] & 0b11000000) >> 6;
dst_buffer[dst_i+8] = (src_buffer[src_i+4] & 0b00111111) << 2 | src_buffer[src_i+17] & 0b00000011;
dst_buffer[dst_i+9] = (src_buffer[src_i+4] & 0b11000000) >> 6;
dst_buffer[dst_i+10] = (src_buffer[src_i+5] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b00001100) >> 2;
dst_buffer[dst_i+11] = (src_buffer[src_i+5] & 0b11000000) >> 6;
dst_buffer[dst_i+12] = (src_buffer[src_i+6] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b00110000) >> 4;
dst_buffer[dst_i+13] = (src_buffer[src_i+6] & 0b11000000) >> 6;
dst_buffer[dst_i+14] = (src_buffer[src_i+7] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b11000000) >> 6;
dst_buffer[dst_i+15] = (src_buffer[src_i+7] & 0b11000000) >> 6;
dst_buffer[dst_i+16] = (src_buffer[src_i+8] & 0b00111111) << 2 | src_buffer[src_i+18] & 0b00000011;
dst_buffer[dst_i+17] = (src_buffer[src_i+8] & 0b11000000) >> 6;
dst_buffer[dst_i+18] = (src_buffer[src_i+9] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b00001100) >> 2;
dst_buffer[dst_i+19] = (src_buffer[src_i+9] & 0b11000000) >> 6;
dst_buffer[dst_i+20] = (src_buffer[src_i+10] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b00110000) << 4;
dst_buffer[dst_i+21] = (src_buffer[src_i+10] & 0b11000000) >> 6;
dst_buffer[dst_i+22] = (src_buffer[src_i+11] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b11000000) >> 6;
dst_buffer[dst_i+23] = (src_buffer[src_i+11] & 0b11000000) >> 6;
dst_buffer[dst_i+24] = (src_buffer[src_i+12] & 0b00111111) << 2 | src_buffer[src_i+19] & 0b00000011;
dst_buffer[dst_i+25] = (src_buffer[src_i+12] & 0b11000000) >> 6;
dst_buffer[dst_i+26] = (src_buffer[src_i+13] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b00001100) >> 2;
dst_buffer[dst_i+27] = (src_buffer[src_i+13] & 0b11000000) >> 6;
dst_buffer[dst_i+28] = (src_buffer[src_i+14] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b00110000) >> 4;
dst_buffer[dst_i+29] = (src_buffer[src_i+14] & 0b11000000) >> 6;
dst_buffer[dst_i+30] = (src_buffer[src_i+15] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b11000000) >> 6;
dst_buffer[dst_i+31] = (src_buffer[src_i+15] & 0b11000000) >> 6;
}
I get the following, correct output:
but If I’m doing it with the GPU:
void Depack10bitGrouping160(uint8_t* src_buffer, uint8_t* dst_buffer, int width, int height)
{
int cycle_length = (width)*(height)/16;
int num_threads = 1024;
int num_blocks = (cycle_length/num_threads)+1; //+1 in case cycle length isn't perfectly divisible
// by num_threads,
Depack10bitGrouping160Kernel<<<num_blocks,num_threads,0>>>(src_buffer,dst_buffer,cycle_length-1); //index_max = cycle_length-1
cudaDeviceSynchronize();
}
__global__ void Depack10bitGrouping160Kernel (uint8_t* src_buffer, uint8_t* dst_buffer, int index_max)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index > index_max)
return;
int src_i = index*20;
int dst_i = index*16*2;
//sixteen pixels of two bytes each are in twenty bytes
dst_buffer[dst_i] = (src_buffer[src_i] & 0b00111111) << 2 | src_buffer[src_i+16] & 0b00000011 ;
dst_buffer[dst_i+1] = (src_buffer[src_i] & 0b11000000) >> 6;
dst_buffer[dst_i+2] = (src_buffer[src_i+1] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b00001100) >> 2;
dst_buffer[dst_i+3] = (src_buffer[src_i+1] & 0b11000000) >> 6;
dst_buffer[dst_i+4] = (src_buffer[src_i+2] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b00110000) >> 4;
dst_buffer[dst_i+5] = (src_buffer[src_i+2] & 0b11000000) >> 6;
dst_buffer[dst_i+6] = (src_buffer[src_i+3] & 0b00111111) << 2 | (src_buffer[src_i+16] & 0b11000000) >> 6;
dst_buffer[dst_i+7] = (src_buffer[src_i+3] & 0b11000000) >> 6;
dst_buffer[dst_i+8] = (src_buffer[src_i+4] & 0b00111111) << 2 | src_buffer[src_i+17] & 0b00000011;
dst_buffer[dst_i+9] = (src_buffer[src_i+4] & 0b11000000) >> 6;
dst_buffer[dst_i+10] = (src_buffer[src_i+5] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b00001100) >> 2;
dst_buffer[dst_i+11] = (src_buffer[src_i+5] & 0b11000000) >> 6;
dst_buffer[dst_i+12] = (src_buffer[src_i+6] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b00110000) >> 4;
dst_buffer[dst_i+13] = (src_buffer[src_i+6] & 0b11000000) >> 6;
dst_buffer[dst_i+14] = (src_buffer[src_i+7] & 0b00111111) << 2 | (src_buffer[src_i+17] & 0b11000000) >> 6;
dst_buffer[dst_i+15] = (src_buffer[src_i+7] & 0b11000000) >> 6;
dst_buffer[dst_i+16] = (src_buffer[src_i+8] & 0b00111111) << 2 | src_buffer[src_i+18] & 0b00000011;
dst_buffer[dst_i+17] = (src_buffer[src_i+8] & 0b11000000) >> 6;
dst_buffer[dst_i+18] = (src_buffer[src_i+9] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b00001100) >> 2;
dst_buffer[dst_i+19] = (src_buffer[src_i+9] & 0b11000000) >> 6;
dst_buffer[dst_i+20] = (src_buffer[src_i+10] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b00110000) << 4;
dst_buffer[dst_i+21] = (src_buffer[src_i+10] & 0b11000000) >> 6;
dst_buffer[dst_i+22] = (src_buffer[src_i+11] & 0b00111111) << 2 | (src_buffer[src_i+18] & 0b11000000) >> 6;
dst_buffer[dst_i+23] = (src_buffer[src_i+11] & 0b11000000) >> 6;
dst_buffer[dst_i+24] = (src_buffer[src_i+12] & 0b00111111) << 2 | src_buffer[src_i+19] & 0b00000011;
dst_buffer[dst_i+25] = (src_buffer[src_i+12] & 0b11000000) >> 6;
dst_buffer[dst_i+26] = (src_buffer[src_i+13] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b00001100) >> 2;
dst_buffer[dst_i+27] = (src_buffer[src_i+13] & 0b11000000) >> 6;
dst_buffer[dst_i+28] = (src_buffer[src_i+14] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b00110000) >> 4;
dst_buffer[dst_i+29] = (src_buffer[src_i+14] & 0b11000000) >> 6;
dst_buffer[dst_i+30] = (src_buffer[src_i+15] & 0b00111111) << 2 | (src_buffer[src_i+19] & 0b11000000) >> 6;
dst_buffer[dst_i+31] = (src_buffer[src_i+15] & 0b11000000) >> 6;
}
I get this tearing:
To me, this seems like some sort of synchronization, caching issue that I don’t understand. Could you guys please tell me what’s causing it?
Thank you,
xlucny
Full code:
SAMPLE_INV.zip (2.9 MB)
System:
NVIDIA Jetson Orin NX (16GB ram)
Jetpack 6.2, L4T 36.4.3
CUDA 12.6.68