I get the error frames by code followed, and I don’t know why.
I have very a little knowledge about CUDA and don’t know what’s the next.
void bgrp_to_bgr24(unsigned char* dev_pBGRP, unsigned char* dev_pBGR24, int width, int height) {
time_t t = time(0);
std::cout << "before " << asctime(localtime(&t)) << std::endl;
bgrp_to_bgr<<<dim3((width + 31) / 32, (height + 15) / 16), dim3(32, 16)>>>((float*)dev_pBGRP, dev_pBGR24, width, height);
t = time(0);
std::cout << "after " << asctime(localtime(&t)) << std::endl;
}
global static void bgrp_to_bgr(float* dev_pBGRP, unsigned char* dev_pBGR24, int width, int height) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
const int resolution = width * height;
for (int i = x; i < x + 32; ++i) {
for (int j = y; j < y + 16; ++j) {
int pixel_idx = i * width + j;
if (pixel_idx > resolution) {
break;
}
int color_idx = pixel_idx % 3;
int color_offset = pixel_idx / 3;
//dev_pBGR24[pixel_idx] = min(255, __float2uint_rn(dev_pBGRP[pixel_idx * color_idx + color_offset]));
dev_pBGR24[pixel_idx] = (uint8_t)(dev_pBGRP);
}
}
}
the error frame is full of black pixels which can be shown by ffplay.
and there are some 0x01 bytes in the picture when I look into the hex code of the frame, but most of the bytes are zero.
deepglint@192.168.6.106 :~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | awk -F: ‘!a[$2]++’
00022b00: 0000 0100 0001 0000 0100 0001 0000 0100 …
00022b10: 0001 0000 0100 0001 0000 0100 0001 0000 …
00022b20: 0100 0001 0000 0100 0001 0000 0100 0001 …
00022f40: 0100 0001 0000 0100 0001 0000 0100 0000 …
00195910: 0000 0000 0000 0100 0000 0000 0000 0000 …
00195940: 0000 0000 0000 0000 0000 0000 0100 0000 …
001ab800: 0001 0000 0100 0000 0000 0000 0000 0000 …
deepglint@192.168.6.106 :~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | wc -l
40024
deepglint@192.168.6.106 :~$ xxd csoutput-200 | wc -l
388800
all these bytes are repeated over hundreds of thousands time.
deepglint@192.168.6.106 :~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | awk -F: ‘!a[$2]++’
00022b00: 0000 0100 0001 0000 0100 0001 0000 0100 …
00022b10: 0001 0000 0100 0001 0000 0100 0001 0000 …
00022b20: 0100 0001 0000 0100 0001 0000 0100 0001 …
00022f40: 0100 0001 0000 0100 0001 0000 0100 0000 …
00195910: 0000 0000 0000 0100 0000 0000 0000 0000 …
00195940: 0000 0000 0000 0000 0000 0000 0100 0000 …
001ab800: 0001 0000 0100 0000 0000 0000 0000 0000 …
deepglint@192.168.6.106 :~$ xxd csoutput-200 | grep -v ‘0000 0000 0000 0000 0000 0000 0000 0000’ | wc -l
40024
deepglint@192.168.6.106 :~$ xxd csoutput-200 | wc -l
388800
all these bytes are repeated over hundreds of thousands time.
leif
January 3, 2018, 9:13am
#3
Hi,
This is a demo which can convert planar bgr float buffer to bgr24.
global void float_to_char(
float *src, unsigned char *dst,
int height, int width, int batch_size)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
if (x >= height * width)
return;
for(int j = 0; j < batch_size; j++) {
int offset = height * width * 3;
//b
*(dst + j * offset + x * 3 + 0) =
(unsigned char)*(src + j * offset + height * width * 0 + x);
//g
*(dst + j * offset + x * 3 + 1) =
(unsigned char)*(src + j * offset + height * width * 1 + x);
//r
*(dst + j * offset + x * 3 + 2) =
(unsigned char)*(src + j * offset + height * width * 2 + x);
}
}
void float_planar_to_char(
float *src, unsigned char *dst,
int height, int width,
int batch_size)
{
float_to_char<<<(height * width-1) / 1024 + 1, 1024, 0, NULL>>>
(src, dst, height, width, batch_size);
}
Hi,
This is a demo which can convert planar bgr float buffer to bgr24.
global void float_to_char(
float *src, unsigned char *dst,
int height, int width, int batch_size)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
if (x >= height * width)
return;
for(int j = 0; j < batch_size; j++) {
int offset = height * width * 3;
//b
*(dst + j * offset + x * 3 + 0) =
(unsigned char)*(src + j * offset + height * width * 0 + x);
//g
*(dst + j * offset + x * 3 + 1) =
(unsigned char)*(src + j * offset + height * width * 1 + x);
//r
*(dst + j * offset + x * 3 + 2) =
(unsigned char)*(src + j * offset + height * width * 2 + x);
}
}
void float_planar_to_char(
float *src, unsigned char *dst,
int height, int width,
int batch_size)
{
float_to_char<<<(height * width-1) / 1024 + 1, 1024, 0, NULL>>>
(src, dst, height, width, batch_size);
}
Thank you, and I also have found a similar resolution which can be found by searching for keywords transpose in CUDA samples.
deepglint@192.168.6.106 :/mnt/cuda_sample$ ls
0_Simple 1_Utilities 2_Graphics 3_Imaging 4_Finance 5_Simulations 6_Advanced 7_CUDALibraries bin common EULA.txt Makefile
deepglint@192.168.6.106 :/mnt/cuda_sample$ find . | grep transpose
./bin/x86_64/linux/release/transpose
./6_Advanced/transpose
./6_Advanced/transpose/transpose.cu
./6_Advanced/transpose/Makefile
./6_Advanced/transpose/transpose.old
./6_Advanced/transpose/NsightEclipse.xml
./6_Advanced/transpose/transpose.o
./6_Advanced/transpose/transpose
./6_Advanced/transpose/doc
./6_Advanced/transpose/doc/MatrixTranspose.pdf
./6_Advanced/transpose/readme.txt
and my code is too followed for anyone who is here later:
#define TILE_DIM 3
#define BLOCK_ROWS 3
global void transposeCoalesced(float* idata, unsigned char* odata, int width, int height)
{
shared unsigned char tile[TILE_DIM][TILE_DIM];
int resolution = width * height;
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
int idata_idx = index_in + i * width;
if (idata_idx < resolution) {
tile[threadIdx.y+i][threadIdx.x] = idata[idata_idx];
}
}
__syncthreads();
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
int odata_idx = index_out + i * height;
if (odata_idx < resolution) {
odata[odata_idx] = tile[threadIdx.x][threadIdx.y+i];
}
}
}
void bgrp_to_bgr24(unsigned char* dev_pBGRP, unsigned char* dev_pBGR24, int channel, int width, int height) {
transposeCoalesced<<<dim3((widthheight + TILE_DIM - 1) / TILE_DIM, TILE_DIM / TILE_DIM), dim3(TILE_DIM, BLOCK_ROWS)>>>((float )dev_pBGRP, dev_pBGR24, width*height, channel);
}