Hello guys, I feel like I’m missing something quite basic here. I have image data on which I’m doing a depacking operation (the details are not that important: every 3 source bytes expand into 4 destination bytes by splitting nibbles). I have a CPU version of the processing and a (seemingly) equivalent GPU version, but the two functions produce different results: the CPU output is correct and the GPU output is not.
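Since the code reads easier with the layout in mind, here is the per-group mapping both versions implement (my own summary of the code below, with the masks written in hex):

// for each group of 3 packed bytes -> 4 unpacked bytes:
//   dst[0] = src[0]                      (whole byte)
//   dst[1] = src[1] & 0x0F               (low nibble of src[1])
//   dst[2] = (src[1] & 0xF0) >> 4
//          | (src[2] & 0x0F) << 4        (high nibble of src[1], low nibble of src[2])
//   dst[3] = (src[2] & 0xF0) >> 4        (high nibble of src[2])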
#include <cstdint>
#include <cstdlib>
#include <iostream>

// GPU
__global__
void depacking_kernel(uint8_t *src, uint8_t *dst)
{
    // one thread per 3-byte group
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int src_it = index * 3;
    unsigned int dst_it = index * 4;
    dst[dst_it]     = src[src_it];                             // whole first byte
    dst[dst_it + 1] = src[src_it + 1] & 0b00001111;            // low nibble of src[1]
    dst[dst_it + 2] = ((src[src_it + 1] & 0b11110000) >> 4)    // high nibble of src[1]
                    | ((src[src_it + 2] & 0b00001111) << 4);   //  | low nibble of src[2]
    dst[dst_it + 3] = (src[src_it + 2] & 0b11110000) >> 4;     // high nibble of src[2]
}
void depack(uint8_t *srcGpu, uint8_t *dstGpu, unsigned int src_length)
{
    int num_threads = 64;
    // from every three bytes, four bytes are made
    int num_blocks = (src_length / 3) / num_threads;
    if (num_blocks > 65535)
    {
        std::cerr << "too many blocks";
        exit(-1);
    }
    depacking_kernel<<<num_blocks, num_threads>>>(srcGpu, dstGpu);
    cudaDeviceSynchronize();
}
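Side note: I’m aware depack() doesn’t check for errors; a checked variant would look roughly like this (same launch logic, just with reporting added, in case that helps anyone reproduce):

void depack_checked(uint8_t *srcGpu, uint8_t *dstGpu, unsigned int src_length)
{
    int num_threads = 64;
    int num_blocks = (src_length / 3) / num_threads;
    if (num_blocks > 65535)
    {
        std::cerr << "too many blocks";
        exit(-1);
    }
    depacking_kernel<<<num_blocks, num_threads>>>(srcGpu, dstGpu);

    cudaError_t err = cudaGetLastError();            // catches launch-configuration errors
    if (err != cudaSuccess)
        std::cerr << "launch failed: " << cudaGetErrorString(err) << "\n";

    err = cudaDeviceSynchronize();                   // catches errors during execution
    if (err != cudaSuccess)
        std::cerr << "kernel failed: " << cudaGetErrorString(err) << "\n";
}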
// CPU
void depack_cpu_kernel(uint8_t *src, uint8_t *dst, int index);   // defined below

void depack_cpu(uint8_t *src, unsigned int src_datalen, uint8_t *dst, unsigned int dst_datalen)
{
    // dst_datalen is unused here; dst is assumed large enough
    for (unsigned int it = 0; it < src_datalen / 3; it++)
    {
        depack_cpu_kernel(src, dst, it);
    }
}
void depack_cpu_kernel(uint8_t *src, uint8_t *dst, int index)
{
    unsigned int src_it = index * 3;
    unsigned int dst_it = index * 4;
    dst[dst_it]     = src[src_it];                             // whole first byte
    dst[dst_it + 1] = src[src_it + 1] & 0b00001111;            // low nibble of src[1]
    dst[dst_it + 2] = ((src[src_it + 1] & 0b11110000) >> 4)    // high nibble of src[1]
                    | ((src[src_it + 2] & 0b00001111) << 4);   //  | low nibble of src[2]
    dst[dst_it + 3] = (src[src_it + 2] & 0b11110000) >> 4;     // high nibble of src[2]
}
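For completeness, this is roughly the harness I use to compare the two paths (a trimmed sketch with a placeholder size and test pattern, not the exact code from the attached zip):

#include <cstring>
#include <vector>

int main()
{
    const unsigned int src_length = 3 * 64 * 1024;        // placeholder size (multiple of 3)
    const unsigned int dst_length = (src_length / 3) * 4; // 3 packed bytes -> 4 unpacked

    std::vector<uint8_t> src(src_length), dst_cpu(dst_length), dst_gpu(dst_length);
    for (unsigned int i = 0; i < src_length; i++)
        src[i] = static_cast<uint8_t>(i);                 // arbitrary test pattern

    // CPU path
    depack_cpu(src.data(), src_length, dst_cpu.data(), dst_length);

    // GPU path
    uint8_t *srcGpu = nullptr, *dstGpu = nullptr;
    cudaMalloc((void **)&srcGpu, src_length);
    cudaMalloc((void **)&dstGpu, dst_length);
    cudaMemcpy(srcGpu, src.data(), src_length, cudaMemcpyHostToDevice);
    depack(srcGpu, dstGpu, src_length);
    cudaMemcpy(dst_gpu.data(), dstGpu, dst_length, cudaMemcpyDeviceToHost);
    cudaFree(srcGpu);
    cudaFree(dstGpu);

    std::cout << (std::memcmp(dst_cpu.data(), dst_gpu.data(), dst_length) == 0
                      ? "outputs match\n"
                      : "outputs differ\n");
    return 0;
}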
I’m at a loss as to why this is happening.
full_code.zip (2.6 MB)