I have modified the CUDA sample code “12_camera_v4l2_cuda” so that it calculates the means of a 32x32 grid of rectangles (each rectangle measures 128 pixels wide by 64 pixels high). The source image is 4096x2048 pixels. I want the CPU to be able to read the 32x32=1024 means that are calculated for each of the sub images.
Unfortunately I get a “bus error” when I attempt to print out the mean for the first of the rectangles.
Here is my code. I am using unified memory for the
struct rectangleStats
{
float mean;
// These fields are the rectangle offset. i.e. 0-31.
uint8_t rect_x;
uint8_t rect_y;
};
// Total of 1024 threads for an image 4096 x 2048.
__global__ void findSdMeanKernel(int* pDevPtr, rectangleStats* stats)
{
// Each row is 4096 bytes long, 1 byte per luminance pixel.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int start_row = row * 64;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int start_col = col * 128;
uint8_t x_offset, y_offset;
uint32_t pixel;
float mean;
uint32_t sum_pixels = 0;
for(x_offset = 0; x_offset < 128; x_offset++)
{
for(y_offset = 0; y_offset < 64; y_offset++)
{
pixel = ((char *)pDevPtr)[(start_row + y_offset) * 4096 + start_col + x_offset];
sum_pixels += pixel;
}
}
mean = (float)sum_pixels/(128.0 * 64.0);
stats[row * BOX_W + col].mean = mean;
stats[row * BOX_W + col].rect_x = col;
stats[row * BOX_W + col].rect_y = row;
return;
}
int findSdMean(CUdeviceptr pDevPtr)
{
dim3 threadsPerBlock(BOX_W, BOX_H);
dim3 blocks(1,1);
rectangleStats *stats;
cudaMallocManaged(&stats, 32 * 32 * sizeof(rectangleStats));
findSdMeanKernel<<<blocks, threadsPerBlock>>>((int *)pDevPtr, stats);
// The line below cause the "bus error".
printf("Mean at x, y %d, %d is %f\n", stats[0].rect_x, stats[0].rect_y, stats[0].mean);
cudaFree(stats);
return 0;
}
Please can someone help me to debug my code?
(As a side note, I also find the code to run very slowly as soon as I make use of unified memory. If I don’t allocate and pass any unified memory to the kernel, the whole thing runs much quicker.)