This is code I've written as part of an image processing library; it does a simple addition of two images.
The kernel and host wrapper functions are below:
[codebox]
#pragma warning(disable : 4996) // suppress a known VS2005 warning; this was fixed in VS2008, where the original code was compiled
#include "CudaHostAPI.h"
#pragma message ("Note: Including CudaHostAPI.h")
#include "CudaHostDefs.h"
#pragma message ("Note: Including CudaHostDefs.h")
#include <cuda.h>
#pragma message ("Note: Including cuda.h")
// GPU function to Add two float images (this outputs the output in the input for faster accessing)
// ************************************************************
__global__ void AddComp(float* fpIn1Img, float* fpIn2Img)
{
// compute the linear offset (into the flattened image) of the single pixel that this thread will process
unsigned int uiOffset = __mul24(__mul24(__mul24(blockIdx.y, blockDim.y) + threadIdx.y , gridDim.x ) + blockIdx.x , blockDim.x) + threadIdx.x;
// compute for this thread
fpIn1Img[uiOffset] = fpIn1Img[uiOffset] + fpIn2Img[uiOffset];
return;
}
// GPU Host wrapper for Addition of two float image
// ************************************************************
int AddGPU(float* fpIn1Img, float* fpIn2Img, float* fpOutImg, size_t szWidth, size_t szHeight)
{
// locals to manage CUDA calls and setup thread parameters
cudaError_t cuErr1, cuErr2;
// set up block and grid dimensions
dim3 blockSz(BLOCKWIDTH,BLOCKHEIGHT);
// To cover the whole image with a grid of thread blocks,
// divide image width in pixels by the thread block width and set image height as grid height
dim3 gridSz(CUDA_DIVUP(szWidth, BLOCKWIDTH), CUDA_DIVUP(szHeight, BLOCKHEIGHT));
cuErr1 = cudaMemcpy(fpCudaIn1Img, fpIn1Img, szWidth * szHeight * sizeof(float), cudaMemcpyHostToDevice);
cuErr2 = cudaThreadSynchronize();
if (!((cuErr1 == CUDA_SUCCESS) && (cuErr2 == CUDA_SUCCESS))) {
return -11;
}
cuErr1 = cudaMemcpy(fpCudaIn2Img, fpIn2Img, szWidth * szHeight * sizeof(float), cudaMemcpyHostToDevice);
cuErr2 = cudaThreadSynchronize();
if (!((cuErr1 == CUDA_SUCCESS) && (cuErr2 == CUDA_SUCCESS))) {
return -11;
}
// Call the GPU kernel to do the core computations
AddComp <<<gridSz, blockSz>>>(fpCudaIn1Img, fpCudaIn2Img);
cuErr1 = cudaGetLastError();
// check for kernel launch errors
if (!(cuErr1 == CUDA_SUCCESS)) {
return -11;
}
if (fpOutImg) {
// the kernel writes its result in place into fpCudaIn1Img, so copy that buffer back
cuErr1 = cudaMemcpy(fpOutImg, fpCudaIn1Img, szWidth * szHeight * sizeof(float), cudaMemcpyDeviceToHost);
}
else {
cuErr1 = cudaMemcpy(fpIn1Img, fpCudaIn1Img, szWidth * szHeight * sizeof(float), cudaMemcpyDeviceToHost);
}
cuErr2 = cudaThreadSynchronize();
if (!((cuErr1==CUDA_SUCCESS)&&(cuErr2==CUDA_SUCCESS))) {
return -12; // return HandleError (-12, "AddGPU(): cudaMemcpy(fpOutImg)", NOMSG);
}
return (szWidth * szHeight);
}
[/codebox]
I've tried playing around with the block and grid dimensions, but to no avail. I also tried replacing some of the dimensions with loops and pointer advancement, but that came out slower as well. The code above is only about 1.5-2x faster than the naive C++ code (which is of course un-optimizable, barring going multi-core).
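To make it clearer what I mean by the loop version, the idea was something along these lines (only a sketch to illustrate it, not the exact code I benchmarked; the grid-stride form, the uiNumPixels parameter and the launch configuration in the comment are just for illustration):
[codebox]
// Sketch of the loop variant: each thread strides across the image by the
// total number of threads in the grid, so consecutive threads still read
// consecutive floats and coalescing is preserved.
__global__ void AddCompLooped(float* fpIn1Img, float* fpIn2Img, unsigned int uiNumPixels)
{
    unsigned int uiStride = gridDim.x * blockDim.x;
    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < uiNumPixels; i += uiStride)
        fpIn1Img[i] += fpIn2Img[i];
}
// launched e.g. as: AddCompLooped<<<64, 256>>>(fpCudaIn1Img, fpCudaIn2Img, (unsigned int)(szWidth * szHeight));
[/codebox]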
Oh yeah, I'm running this on a 32-core GeForce 9500 GT.
I've noticed that switching to loops instead of 2D block dimensions can hurt performance badly, even when I maintain 16-float coalescing (i.e. each half-warp reading 16 consecutive floats; the test images are 1024x1024). I'm left clueless, so can any of you veterans tell me whether there's any way to accelerate this code?
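In case it matters for diagnosing this, here is a sketch of how the kernel could be timed separately from the host-device copies with CUDA events (this isn't in the library code above), to see whether the memcpys are what's eating most of the time:
[codebox]
// Sketch only: time the kernel launch on its own with CUDA events.
cudaEvent_t evStart, evStop;
float fMs = 0.0f;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
AddComp<<<gridSz, blockSz>>>(fpCudaIn1Img, fpCudaIn2Img);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);

cudaEventElapsedTime(&fMs, evStart, evStop); // kernel time in milliseconds
cudaEventDestroy(evStart);
cudaEventDestroy(evStop);
// the same pattern around the cudaMemcpy calls would show how much of the
// total is spent on the PCIe transfers rather than in the kernel itself
[/codebox]
Would that be a sensible way to see how much of the total is just the two uploads and the download?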