I just did a quick test, and it looks like subtracting two 1920x1080x8bit images should take 8 ms, including the memory copies. My test was on a busy system, so your results may be faster. I couldn’t allocate page-locked memory of the size required to hold the two input images and the output image. That’s only 6 MB, which surprised me a little, so the code below uses ordinary malloc’d host memory instead.
Note that I have not checked it for correctness; I just wanted a rough estimate of the performance:
#include <stdio.h>
#include "cuda.h"
#include "cutil.h"
/*
 * Extract one 8-bit lane from a 32-bit word.
 *
 * b   - the packed word.
 * num - lane number; 0 selects the least significant 8 bits, and each
 *       increment moves 8 bits towards the most significant end.
 *
 * Returns the selected lane as an unsigned char.
 */
__device__ unsigned char byte(unsigned int b, int num)
{
    const int shift = num * 8;                  // bit offset of the requested lane
    const unsigned int shifted = b >> shift;    // bring that lane down to bits 0..7
    return shifted & 0xFF;                      // discard everything above the lane
}
/*
 * Per-byte subtraction of two buffers of packed 8-bit values.
 *
 * Each thread reads one 32-bit word from a and from b, subtracts the four
 * 8-bit lanes independently, and writes the repacked word to result.
 * Every lane of the output is (a_lane - b_lane) modulo 256.
 *
 * a, b   - input buffers, read at a[index] / b[index].
 * result - output buffer, written at result[index].
 *
 * Launch layout: only the x dimension of the grid and block is used to form
 * the index. There is no bounds check, so the launch must supply exactly one
 * thread per 32-bit word in the buffers.
 */
__global__ void diff(unsigned int *a, unsigned int *b, unsigned int *result)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int a_local = a[index];
    unsigned int b_local = b[index];
    unsigned int result_local = 0;
    // Assemble the output from the most significant lane down to lane 0.
    for (int lane = 3; lane >= 0; lane--) {
        result_local <<= 8;
        // The two unsigned chars are promoted to int before subtracting, so
        // the difference is negative whenever the b lane is larger. Mask it
        // to 8 bits; otherwise the sign bits would be OR-ed over the lanes
        // already stored above this one.
        result_local |= (byte(a_local, lane) - byte(b_local, lane)) & 0xFF;
    }
    result[index] = result_local;
}
/*
 * Benchmark driver: times host-to-device copy of two inputs, one diff kernel
 * launch, and device-to-host copy of the result, averaged over a fixed number
 * of iterations. Prints the mean time per image in milliseconds.
 *
 * Returns 0 on success, 1 if host allocation fails or the launch
 * configuration would not cover the buffers exactly.
 */
int main()
{
    int elements = 1920 * 1080 / 4; // 4 bytes per int
    int malloc_size = sizeof(unsigned int) * elements;

    dim3 dimBlock(256);
    // The kernel has no bounds check, so the grid must cover the buffers
    // exactly; refuse to run if the truncating division would drop words.
    if (elements % dimBlock.x != 0) {
        fprintf(stderr, "element count %d is not a multiple of block size %u\n",
                elements, dimBlock.x);
        return 1;
    }
    dim3 dimGrid(elements/dimBlock.x);

    unsigned int *a_host = (unsigned int *) malloc(malloc_size);
    unsigned int *b_host = (unsigned int *) malloc(malloc_size);
    unsigned int *result_host = (unsigned int *) malloc(malloc_size);
    if (a_host == NULL || b_host == NULL || result_host == NULL) {
        fprintf(stderr, "host allocation of %d bytes failed\n", malloc_size);
        free(a_host);
        free(b_host);
        free(result_host);
        return 1;
    }

    // Give the inputs defined contents so the copies do not read
    // uninitialised memory. The values themselves do not affect the timing.
    for (int i = 0; i < elements; i++) {
        a_host[i] = 0x80808080u;
        b_host[i] = (unsigned int) i;
    }

    unsigned int *a_device, *b_device, *result_device;
    CUDA_SAFE_CALL(cudaMalloc((void**) &a_device, malloc_size));
    CUDA_SAFE_CALL(cudaMalloc((void**) &b_device, malloc_size));
    CUDA_SAFE_CALL(cudaMalloc((void**) &result_device, malloc_size));

    unsigned int timer;
    cutCreateTimer(&timer);
    cutStartTimer(timer);
    int iterations = 500;
    for (int i=0; i < iterations; i++) {
        CUDA_SAFE_CALL(cudaMemcpy(a_device, a_host, malloc_size, cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(b_device, b_host, malloc_size, cudaMemcpyHostToDevice));
        diff<<<dimGrid, dimBlock>>>(a_device, b_device, result_device);
        // A kernel launch returns no status itself; query it explicitly.
        CUDA_SAFE_CALL(cudaGetLastError());
        // This copy is blocking, so the kernel has finished by the time it
        // returns and the timer below covers the full round trip.
        CUDA_SAFE_CALL(cudaMemcpy(result_host, result_device, malloc_size,
                                  cudaMemcpyDeviceToHost));
    }
    cutStopTimer(timer);
    float total_time = cutGetTimerValue(timer);
    printf("time per image: %f milliseconds\n", total_time/iterations);

    // Release everything that was acquired above.
    cutDeleteTimer(timer);
    CUDA_SAFE_CALL(cudaFree(a_device));
    CUDA_SAFE_CALL(cudaFree(b_device));
    CUDA_SAFE_CALL(cudaFree(result_device));
    free(a_host);
    free(b_host);
    free(result_host);
    return 0;
}
This code uses the cutil library included with the SDK, so you’ll have to link with it.