How to apply a ROI image fast?

Hello all. I am new to CUDA programming, and I would like to accelerate a simple kernel I wrote that applies an ROI to an image. Right now, in Nsight, this kernel takes 800 ms on average, which is a lot for what it does.
My algorithm is simple: I read an input image with 4 channels (RGBA) and copy to an output image every pixel that is set to one in a third image, the ROI image; all other output pixels are set to zero. The code I wrote is the following:

__global__ void applyROI(unsigned char *inputImage, int width, int height, int components, int step_input, unsigned char *outputImage, int step_output, unsigned char *roi, int step_roi) {
    if (threadIdx.x + (blockDim.x * blockIdx.x) >= width) return;
    int global_idx = threadIdx.x + (blockDim.x * blockIdx.x);
    int global_idy = threadIdx.y + (blockDim.y * blockIdx.y);
    // step_* are row pitches in bytes; each thread handles one 32-bit RGBA pixel
    int roiIndex = global_idx + (step_roi * global_idy);
    int inputIndex = (global_idx + (step_input / 4 * global_idy));
    int outputIndex = (global_idx + (step_output / 4 * global_idy));
    if (roi[roiIndex] != 0) {
        int *out = (int *)outputImage;
        int *in = (int *)inputImage;
        out[outputIndex] = in[inputIndex];
    } else {
        int *out = (int *)outputImage;
        out[outputIndex] = 0;
    }
}

What am I doing wrong? How can I accelerate this method?
Thanks in advance,
Martin

Are you building a debug project? If so, try building a release project.

What GPU are you running on? Is this on Windows or Linux? What is the size of the image?

It is a release build. The GPU is a Quadro P4000, on Windows. The image is 3640x2160.

You might need to provide complete code so we can diagnose what is going on. Your kernel should take about 800 microseconds, not 800 milliseconds. Here is a fully worked test case around the kernel you have shown:

$ cat t297.cu
#include <cstdio>


__global__ void applyROI(unsigned char * inputImage, int width, int height, int components, int step_input, unsigned char *outputImage, int step_output, unsigned char * roi, int step_roi) {
  if (threadIdx.x + (blockDim.x * blockIdx.x) >= width) return;
  int global_idx = threadIdx.x + (blockDim.x * blockIdx.x);
  int global_idy = threadIdx.y + (blockDim.y * blockIdx.y);
  int roiIndex = global_idx + (step_roi * global_idy);
  int inputIndex = (global_idx + (step_input / 4 * global_idy));
  int outputIndex = (global_idx + (step_output / 4 * global_idy));
  if (roi[roiIndex] != 0) {
    int *out = (int*)outputImage;
    int *in = (int*)inputImage;
    out[outputIndex] = in[inputIndex];
    } else {
    int *out = (int*)outputImage;
    out[outputIndex] = 0;
    }
}

int main(){
  int x = 3640;
  int y = 2160;
  int nc = 4;
  unsigned char *idata, *odata, *roi;
  cudaMalloc(&idata, x*y*nc);
  cudaMalloc(&odata, x*y*nc);
  cudaMalloc(&roi, x*y);
  cudaMemset(idata, 8, x*y*nc);
  cudaMemset(odata, 0, x*y*nc);
  cudaMemset(roi, 1, x*y);
  dim3 block(16,16);
  dim3 grid((x+block.x-1)/block.x, y/block.y);
  applyROI<<<grid , block >>>(idata, x, y, nc, nc*x, odata, nc*x, roi, x);
  unsigned char *result = new unsigned char[x*y*nc];
  cudaMemcpy(result, odata, x*y*nc, cudaMemcpyDeviceToHost);
  for (int i = 0; i < x*y*nc; i++) if (result[i] != 8){printf("mismatch at %d, was %d, should be %d\n", i, (int)(result[i]), 8); return -1;}
  return 0;
}

$ nvcc t297.cu -o t297
$ cuda-memcheck ./t297
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t297
==14027== NVPROF is profiling process 14027, command: ./t297
==14027== Profiling application: ./t297
==14027== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.97%  17.077ms         1  17.077ms  17.077ms  17.077ms  [CUDA memcpy DtoH]
                    1.00%  173.25us         1  173.25us  173.25us  173.25us  applyROI(unsigned char*, int, int, int, int, unsigned char*, int, unsigned char*, int)
                    0.02%  3.6800us         3  1.2260us  1.0560us  1.5680us  [CUDA memset]
      API calls:   93.05%  342.57ms         3  114.19ms  354.85us  341.80ms  cudaMalloc
                    4.83%  17.793ms         1  17.793ms  17.793ms  17.793ms  cudaMemcpy
                    1.41%  5.2034ms       384  13.550us     349ns  574.74us  cuDeviceGetAttribute
                    0.51%  1.8905ms         4  472.64us  279.29us  750.10us  cuDeviceTotalMem
                    0.13%  466.85us         4  116.71us  105.22us  138.24us  cuDeviceGetName
                    0.04%  152.62us         3  50.872us  28.787us  88.425us  cudaMemset
                    0.01%  33.038us         1  33.038us  33.038us  33.038us  cudaLaunchKernel
                    0.01%  24.609us         4  6.1520us  4.1290us  9.2150us  cuDeviceGetPCIBusId
                    0.00%  9.4270us         8  1.1780us     497ns  2.4920us  cuDeviceGet
                    0.00%  6.3100us         3  2.1030us     343ns  3.6770us  cuDeviceGetCount
                    0.00%  2.6990us         4     674ns     633ns     713ns  cuDeviceGetUuid
$

The profiler reports a kernel duration of less than 200 us on my Tesla P100. Your Quadro P4000 will be slower than that, but should not be more than 4x slower. (If, when you wrote 800 ms, you actually meant 800 microseconds, then your results may be plausible.)

Perhaps you should try running and profiling my complete code on your system. If you still need help, provide a complete code as I have done, along with the complete command line you use to compile it and the profiler output.

When you ran the CUDA profiler on this code, what bottleneck(s) did it point out? Have you tried to address those? Watch out for low global memory efficiency due to a disadvantageous access pattern.

Processing data one byte at a time is usually not the best way to handle data on a GPU. You state that your input data is RGBA, so presumably you are dealing with 32-bit pixels? If so, I would suggest reading and processing the data as uchar4.

It was 800 microseconds, sorry. The code is straightforward and attached below. When I run the profiler, the bottleneck is, unsurprisingly, memory dependency, at 92.58% in the issue stall reasons graph. My question was whether there is a way to accelerate this; I still find 800 microseconds too much for such a simple task. The image size is actually 7680x2160; I stated it incorrectly earlier. Perhaps, instead of receiving an input image and selectively copying it into the output through a kernel, I could copy the image to the output via cudaMemcpy (device to device) and then set the pixels outside the ROI to zero. Would that be faster?

Thanks!!

__global__ void applyROI(unsigned char * inputImage, int width, int height, int components, int step_input, unsigned char *outputImage, int step_output, unsigned char * roi, int step_roi) {
    if (threadIdx.x + (blockDim.x * blockIdx.x) >= width) return;
    int global_idx = threadIdx.x + (blockDim.x * blockIdx.x);
    int global_idy = threadIdx.y + (blockDim.y * blockIdx.y);
    int roiIndex = global_idx + (step_roi * global_idy);
    int inputIndex, outputIndex;
    inputIndex = (global_idx + (step_input / 4 * global_idy));
    outputIndex = (global_idx + (step_output / 4 * global_idy));
    if (roi[roiIndex] != 0) {
        uchar4 *out = (uchar4*)outputImage;
        uchar4 *in = (uchar4*)inputImage;
        out[outputIndex] = in[inputIndex];
    } else {
        uchar4 *out = (uchar4*)outputImage;
        out[outputIndex] = make_uchar4(0, 0, 0, 0);
    }
}
void applyROI(unsigned char * inputImage, int width, int height, int components, int step_input, unsigned char *outputImage, int step_output, unsigned char * roi, int step_roi, cudaStream_t stream) {
    int blockSize = 256;
    dim3 grid((width + blockSize - 1) / blockSize, height, 1);
    dim3 threads(blockSize, 1, 1);
    applyROI <<<grid, threads, 0, stream >>> (inputImage, width, height, components, step_input, outputImage, step_output, roi, step_roi);
    CUDA_CHECK_LAST_ERROR;
}

What time would you consider adequate?

You are moving around quite a bit of data. Based on the memory bandwidth of the Quadro P4000, how close is your code to the bandwidth limit? The GPU specification lists a theoretical bandwidth of 243 GB/sec, but I guess the maximum practically achievable bandwidth is more like 200 GB/sec (you could measure it with the “zcopy” code appended below).
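
For a rough sense of scale, here is a back-of-the-envelope estimate (the figures are assumptions on my part: a 7680x2160 RGBA input and output, a 1-byte-per-pixel ROI, and the 800 microsecond kernel time reported above):

double pixels = 7680.0 * 2160.0;
double bytes  = pixels * (4 /* read input */ + 4 /* write output */ + 1 /* read ROI */);
double gbps   = bytes / 800e-6 / 1e9;   // ~187 GB/sec

If that estimate holds, the kernel is already operating close to the ~200 GB/sec practical limit, so there is little room left for micro-optimization; only moving less data would help substantially.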

Why not try it, then ponder the results?
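
In case it is useful, here is a minimal sketch of the memcpy-then-zero variant you describe, reusing the parameter names from your wrapper (zeroOutsideROI and applyROIviaMemcpy are hypothetical names, not anything from your code):

__global__ void zeroOutsideROI(uchar4 *outputImage, int width, int height, int step_output,
                               const unsigned char *roi, int step_roi)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    if (x >= width || y >= height) return;
    // clear only the pixels whose ROI byte is zero
    if (roi[x + step_roi * y] == 0)
        outputImage[x + (step_output / 4) * y] = make_uchar4(0, 0, 0, 0);
}

void applyROIviaMemcpy(unsigned char *inputImage, int width, int height, int step_input,
                       unsigned char *outputImage, int step_output,
                       unsigned char *roi, int step_roi, cudaStream_t stream)
{
    // bulk-copy the whole image device to device, honoring the row pitches
    cudaMemcpy2DAsync(outputImage, step_output, inputImage, step_input,
                      width * 4, height, cudaMemcpyDeviceToDevice, stream);
    // then clear whatever falls outside the ROI
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    zeroOutsideROI<<<grid, block, 0, stream>>>((uchar4 *)outputImage, width, height,
                                               step_output, roi, step_roi);
}

Note, however, that this moves at least as much data as the single-pass kernel (the full copy plus a second pass over the ROI and the zeroed pixels), so if the single-pass kernel is already bandwidth-bound, it is unlikely to be faster; measuring both is the only way to know for sure.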

#include <stdlib.h>
#include <stdio.h>

#define ZCOPY_THREADS  128
#define ZCOPY_DEFLEN   100000000
#define ZCOPY_ITER     10           // as in STREAM benchmark

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif

__global__ void zcopy (const double2 * __restrict__ src, 
                       double2 * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = src[i];
    }
}    

struct zcopyOpts {
    int len;
};

static int processArgs (int argc, char *argv[], struct zcopyOpts *opts)
{
    int error = 0;
    memset (opts, 0, sizeof(*opts));
    while (argc) {
        if (*argv[0] == '-') {
            switch (*(argv[0]+1)) {
            case 'n':
                opts->len = atol(argv[0]+2);
                break;
            default:
                fprintf (stderr, "Unknown switch '%c%s'\n", '-', argv[0]+1);
                error++;
                break;
            }
        }
        argc--;
        argv++;
    }
    return error;
}

int main (int argc, char *argv[])
{
    double start, stop, elapsed, mintime;
    double2 *d_a, *d_b;
    int errors;
    struct zcopyOpts opts;

    errors = processArgs (argc, argv, &opts);
    if (errors) {
        return EXIT_FAILURE;
    }
    opts.len = (opts.len) ? opts.len : ZCOPY_DEFLEN;

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * opts.len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * opts.len));
    
    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * opts.len)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, sizeof(d_b[0]) * opts.len)); // NaN

    /* Compute execution configuration */
    dim3 dimBlock(ZCOPY_THREADS);
    int threadBlocks = (opts.len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    
    printf ("zcopy: operating on vectors of %d double2s (= %.3e bytes)\n", 
            opts.len, (double)sizeof(d_a[0]) * opts.len);
    printf ("zcopy: using %d threads per block, %d blocks\n", 
            dimBlock.x, dimGrid.x);

    mintime = fabs(log(0.0));
    for (int k = 0; k < ZCOPY_ITER; k++) {
        start = second();
        zcopy<<<dimGrid,dimBlock>>>(d_a, d_b, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("zcopy: mintime = %.3f msec  throughput = %.2f GB/sec\n",
            1.0e3 * mintime, (2.0e-9 * sizeof(d_a[0]) * opts.len) / mintime);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));

    return EXIT_SUCCESS;
}
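
Something along these lines should build and run it (the -O3 flag and file name are arbitrary choices; the -n switch overrides the default length of 100,000,000 double2 elements, i.e. about 1.6 GB per buffer):

$ nvcc -O3 -o zcopy zcopy.cu
$ ./zcopy
$ ./zcopy -n50000000

The reported throughput is the number to compare your kernel's effective bandwidth against.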