I’m working on porting a video codec to CUDA, for a nice speedup. Today I ran into a strange problem while writing code to de-interleave YUYV images to separate Y, U and V channels.
I wrote a kernel such that the input
In: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
Would be unpacked using the YUYV pattern into
Y: 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
U: 1 5 9 13 17 21 25 29
V: 3 7 11 15 19 23 27 31
In the simulator it works fine but when run on the GPU I get this strange output instead
Y: 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
U: 1 5 9 13 17 21 25 29
V: 1 5 9 13 17 21 25 29
It seems pixel.d is equal to pixel.b suddenly. I have looked at this for hours and haven’t been able to see any problem, either this must be a CUDA bug or I’m really stupid. The code is below:
#include <cutil.h>
#include <cuda.h>
#include <algorithm>
#include <cassert>
#define SRCTYPE uint8_t
#define DSTTYPE uint8_t
typedef struct __align__(4) {
unsigned char a,b,c,d;
} alignedChars;
__global__ void
convert_u8_422_yuyv(uint8_t* dsty, uint8_t* dstu, uint8_t* dstv, uint8_t* _src, int width)
{
alignedChars* src = (alignedChars*)_src;
int offset = threadIdx.x;
alignedChars pixel = src[offset];
dsty[2*offset] = pixel.a;
dstu[offset] = pixel.b;
dsty[2*offset + 1] = pixel.c;
dstv[offset] = pixel.d;
}
int main( int argc, char** argv)
{
int width = 32;
unsigned int size = width;
/* Data on CPU */
SRCTYPE *data = (SRCTYPE*)malloc(size);
DSTTYPE *ddata = (DSTTYPE*)malloc(size);
for(int x=0; x<width; ++x)
data[x] = x;
printf("In: ");
for(int x=0; x<width; ++x)
printf("%4i ", (int)data[x]);
printf("\n");
/** Data on GPU */
SRCTYPE *d_data_src = NULL;
DSTTYPE *d_data_dst = NULL;
DSTTYPE *d_data_dst2 = NULL;
DSTTYPE *d_data_dst3 = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_data_src, size));
cudaMemset(d_data_src, 0, size);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_data_dst, size));
cudaMemset(d_data_dst, 0, size);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_data_dst2, size));
cudaMemset(d_data_dst2, 0, size);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_data_dst3, size));
cudaMemset(d_data_dst3, 0, size);
/** Copy to GPU */
CUDA_SAFE_CALL(cudaMemcpy(d_data_src, data, size, cudaMemcpyHostToDevice));
/** Invoke kernel */
dim3 block_size;
dim3 grid_size;
int shared_size;
block_size.x = width/4;
block_size.y = block_size.z = 1;
grid_size.x = grid_size.y = grid_size.z = 1;
shared_size = 0;
convert_u8_422_yuyv<<<grid_size, block_size, shared_size>>>(d_data_dst, d_data_dst2, d_data_dst3, d_data_src, width/4);
/** Copy back */
CUDA_SAFE_CALL(cudaMemcpy(ddata, d_data_dst, size, cudaMemcpyDeviceToHost));
printf("Y: ");
for(int x=0; x<width/2; ++x)
{
printf("%4i ", (int)ddata[x]);
}
printf("\n");
CUDA_SAFE_CALL(cudaMemcpy(ddata, d_data_dst2, size, cudaMemcpyDeviceToHost));
printf("U: ");
for(int x=0; x<width/4; ++x)
{
printf("%4i ", (int)ddata[x]);
}
printf("\n");
CUDA_SAFE_CALL(cudaMemcpy(ddata, d_data_dst3, size, cudaMemcpyDeviceToHost));
printf("V: ");
for(int x=0; x<width/4; ++x)
{
printf("%4i ", (int)ddata[x]);
}
printf("\n");
free(data);
cudaFree(d_data_src);
cudaFree(d_data_dst);
cudaFree(d_data_dst2);
}
As you can see the kernel is extremely simple (I stripped everything that had to do with 2D or large images). Still it has this strange behaviour. I have looked at the generated PTX code but was unable to find anything wrong either.