I have modified the CUDA sample code “12_camera_v4l2_cuda” so that it calculates the means of a 32x32 grid of rectangles (each rectangle measures 128 pixels wide by 64 pixels high). The source image is 4096x2048 pixels. I want the CPU that calls the CUDA code to be able to read the 32x32=1024 means that are calculated for each of the sub rectangles.
Unfortunately I experience 2 problems:
- I get a “bus error” when I attempt to print out the mean for the first of the rectangles.
- The code runs slow if I access the unified memory in my CUDA function. That is to say, if I just keep the algorithm local I get 30fps but storing the mean in the passed-in unified memory gives me 12fps.
Here is my code. I am using unified memory for the stats array.
struct rectangleStats
{
float mean;
// These fields are the rectangle offset. i.e. 0-31.
uint8_t rect_x;
uint8_t rect_y;
};
// Total of 1024 threads for an image 4096 x 2048.
__global__ void findSdMeanKernel(int* pDevPtr, rectangleStats* stats)
{
// Each row is 4096 bytes long, 1 byte per luminance pixel.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int start_row = row * 64;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int start_col = col * 128;
uint8_t x_offset, y_offset;
uint32_t pixel;
float mean;
uint32_t sum_pixels = 0;
for(x_offset = 0; x_offset < 128; x_offset++)
{
for(y_offset = 0; y_offset < 64; y_offset++)
{
pixel = ((char *)pDevPtr)[(start_row + y_offset) * 4096 + start_col + x_offset];
sum_pixels += pixel;
}
}
mean = (float)sum_pixels/(128.0 * 64.0);
stats[row * BOX_W + col].mean = mean;
stats[row * BOX_W + col].rect_x = col;
stats[row * BOX_W + col].rect_y = row;
return;
}
int findSdMean(CUdeviceptr pDevPtr)
{
dim3 threadsPerBlock(BOX_W, BOX_H);
dim3 blocks(1,1);
rectangleStats *stats;
cudaMallocManaged(&stats, 32 * 32 * sizeof(rectangleStats));
findSdMeanKernel<<<blocks, threadsPerBlock>>>((int *)pDevPtr, stats);
// The line below cause the "bus error".
printf("Mean at x, y %d, %d is %f\n", stats[0].rect_x, stats[0].rect_y, stats[0].mean);
cudaFree(stats);
return 0;
}
I have discovered that the malloc/free calls do not slow things down so it seems to be the fact I access the unified memory that slows things down.
Any ideas for the cause of these 2 issues? Thanks.