BGRP to BGR24 by cuda failed. What's the problem?

I get the error frames by code followed, and I don’t know why.
I have very a little knowledge about CUDA and don’t know what’s the next.

void bgrp_to_bgr24(unsigned char* dev_pBGRP, unsigned char* dev_pBGR24, int width, int height) {
time_t t = time(0);
std::cout << "before " << asctime(localtime(&t)) << std::endl;
bgrp_to_bgr<<<dim3((width + 31) / 32, (height + 15) / 16), dim3(32, 16)>>>((float*)dev_pBGRP, dev_pBGR24, width, height);
t = time(0);
std::cout << "after " << asctime(localtime(&t)) << std::endl;
}

global static void bgrp_to_bgr(float* dev_pBGRP, unsigned char* dev_pBGR24, int width, int height) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
const int resolution = width * height;
for (int i = x; i < x + 32; ++i) {
for (int j = y; j < y + 16; ++j) {
int pixel_idx = i * width + j;
if (pixel_idx > resolution) {
break;
}
int color_idx = pixel_idx % 3;
int color_offset = pixel_idx / 3;
//dev_pBGR24[pixel_idx] = min(255, __float2uint_rn(dev_pBGRP[pixel_idx * color_idx + color_offset]));
dev_pBGR24[pixel_idx] = (uint8_t)(dev_pBGRP);
}
}
}

the error frame is full of black pixels which can be shown by ffplay.
and there are some 0x01 bytes in the picture when I look into the hex code of the frame, but most of the bytes are zero.

deepglint@192.168.6.106:~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | awk -F: ‘!a[$2]++’
00022b00: 0000 0100 0001 0000 0100 0001 0000 0100 …
00022b10: 0001 0000 0100 0001 0000 0100 0001 0000 …
00022b20: 0100 0001 0000 0100 0001 0000 0100 0001 …
00022f40: 0100 0001 0000 0100 0001 0000 0100 0000 …
00195910: 0000 0000 0000 0100 0000 0000 0000 0000 …
00195940: 0000 0000 0000 0000 0000 0000 0100 0000 …
001ab800: 0001 0000 0100 0000 0000 0000 0000 0000 …

deepglint@192.168.6.106:~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | wc -l
40024

deepglint@192.168.6.106:~$ xxd csoutput-200 | wc -l
388800

all these bytes are repeated over hundreds of thousands time.

deepglint@192.168.6.106:~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | awk -F: ‘!a[$2]++’
00022b00: 0000 0100 0001 0000 0100 0001 0000 0100 …
00022b10: 0001 0000 0100 0001 0000 0100 0001 0000 …
00022b20: 0100 0001 0000 0100 0001 0000 0100 0001 …
00022f40: 0100 0001 0000 0100 0001 0000 0100 0000 …
00195910: 0000 0000 0000 0100 0000 0000 0000 0000 …
00195940: 0000 0000 0000 0000 0000 0000 0100 0000 …
001ab800: 0001 0000 0100 0000 0000 0000 0000 0000 …

deepglint@192.168.6.106:~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | wc -l
40024

deepglint@192.168.6.106:~$ xxd csoutput-200 | wc -l
388800

all these bytes are repeated over hundreds of thousands time.

Hi,
This is a demo which can convert planar bgr float buffer to bgr24.
global void float_to_char(
float *src, unsigned char *dst,
int height, int width, int batch_size)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;

if (x >= height * width)
    return;

for(int j = 0; j < batch_size; j++) {
    int offset = height * width * 3;

    //b
    *(dst + j * offset + x * 3 + 0) =
        (unsigned char)*(src + j * offset + height * width * 0 + x);
    //g
    *(dst + j * offset + x * 3 + 1) =
        (unsigned char)*(src + j * offset + height * width * 1 + x);
    //r
    *(dst + j * offset + x * 3 + 2) =
        (unsigned char)*(src + j * offset + height * width * 2 + x);
}

}

void float_planar_to_char(
float *src, unsigned char *dst,
int height, int width,
int batch_size)
{
float_to_char<<<(height * width-1) / 1024 + 1, 1024, 0, NULL>>>
(src, dst, height, width, batch_size);
}

Thank you, and I also have found a similar resolution which can be found by searching for keywords transpose in CUDA samples.

deepglint@192.168.6.106:/mnt/cuda_sample$ ls
0_Simple 1_Utilities 2_Graphics 3_Imaging 4_Finance 5_Simulations 6_Advanced 7_CUDALibraries bin common EULA.txt Makefile
deepglint@192.168.6.106:/mnt/cuda_sample$ find . | grep transpose
./bin/x86_64/linux/release/transpose
./6_Advanced/transpose
./6_Advanced/transpose/transpose.cu
./6_Advanced/transpose/Makefile
./6_Advanced/transpose/transpose.old
./6_Advanced/transpose/NsightEclipse.xml
./6_Advanced/transpose/transpose.o
./6_Advanced/transpose/transpose
./6_Advanced/transpose/doc
./6_Advanced/transpose/doc/MatrixTranspose.pdf
./6_Advanced/transpose/readme.txt

and my code is too followed for anyone who is here later:

#define TILE_DIM 3
#define BLOCK_ROWS 3

global void transposeCoalesced(float* idata, unsigned char* odata, int width, int height)
{
shared unsigned char tile[TILE_DIM][TILE_DIM];

int resolution = width * height;
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;

xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;

for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
	int idata_idx = index_in + i * width;
	if (idata_idx < resolution) {
    	tile[threadIdx.y+i][threadIdx.x] = idata[idata_idx];
	}
}

__syncthreads();

for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
	int odata_idx = index_out + i * height;
	if (odata_idx < resolution) {
    	odata[odata_idx] = tile[threadIdx.x][threadIdx.y+i];
	}
}

}

void bgrp_to_bgr24(unsigned char* dev_pBGRP, unsigned char* dev_pBGR24, int channel, int width, int height) {
transposeCoalesced<<<dim3((widthheight + TILE_DIM - 1) / TILE_DIM, TILE_DIM / TILE_DIM), dim3(TILE_DIM, BLOCK_ROWS)>>>((float)dev_pBGRP, dev_pBGR24, width*height, channel);
}