I’m writing a kernel to turn an image into grayscale. In my main function, I load a JPG image, reorganize its content so that instead of rgbrgbrgb...rgb
, it’s rrr...rggg...gbbb...b
, load it into the GPU RAM, run the kernel, load the result into CPU RAM, organize the result back into its original format, and write it out. I am trying to use vectorized memory access in my kernel, however, it is not working.
Inside the NCU app, when I profile this kernel, I get the following error:
Code inside main function
// GPU SoA Improved //
{
int width, height, channels;
unsigned char *image = stbi_load("image.jpg", &width, &height, &channels, 0);
printf("width: %d, height: %d, channels: %d\n", width, height, channels);
unsigned char *reorganizedImage = (unsigned char *)malloc(width * height * channels * sizeof(unsigned char));
for (int counter = 0; counter < width * height; counter++)
{
reorganizedImage[0 * width * height + counter] = image[counter * channels];
reorganizedImage[1 * width * height + counter] = image[counter * channels + 1];
reorganizedImage[2 * width * height + counter] = image[counter * channels + 2];
}
unsigned char *h_image_out;
unsigned char *d_image_out;
cudaHostAlloc((void **)&h_image_out, width * height * channels, cudaHostAllocDefault);
memcpy(h_image_out, reorganizedImage, width * height * channels);
cudaMalloc((void **)&d_image_out, width * height * channels);
cudaMemcpy(d_image_out, h_image_out, width * height * channels, cudaMemcpyHostToDevice);
double start = tick();
dim3 blockDimensions = {512};
dim3 gridDimensions = {(width * height + blockDimensions.x - 1) / (blockDimensions.x * 4)};
applyGrayscaleFilterSoAImproved<<<gridDimensions, blockDimensions>>>((uchar4 *)d_image_out, width, height);
cudaDeviceSynchronize();
double end = tick();
printf("Time taken by GPU SoA Improved: %f\n", end - start);
cudaMemcpy(h_image_out, d_image_out, width * height * channels, cudaMemcpyDeviceToHost);
memcpy(reorganizedImage, h_image_out, width * height * channels);
for (int counter = 0; counter < width * height; counter++)
{
image[counter * channels] = reorganizedImage[0 * width * height + counter];
image[counter * channels + 1] = reorganizedImage[1 * width * height + counter];
image[counter * channels + 2] = reorganizedImage[2 * width * height + counter];
}
stbi_write_jpg("image_out_SoAImproved.jpg", width, height, channels, image, 100);
free(reorganizedImage);
cudaFree(d_image_out);
cudaFreeHost(h_image_out);
stbi_image_free(image);
}
////
Kernel code
__global__ void applyGrayscaleFilterSoAImproved(uchar4 *image, int width, int height)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < width * height)
{
uchar4 red = image[0 * width * height + index];
uchar4 green = image[1 * width * height + index];
uchar4 blue = image[2 * width * height + index];
uchar4 gray = make_uchar4(
(red.x + green.x + blue.x) / 3,
(red.y + green.y + blue.y) / 3,
(red.z + green.z + blue.z) / 3,
(red.w + green.w + blue.w) / 3
);
image[0 * width * height + index] = gray;
image[1 * width * height + index] = gray;
image[2 * width * height + index] = gray;
}
}