I’m an x264 developer and I’m beginning the long and dreadful process of porting x264’s motion estimation functions to CUDA. To begin with, I need to get a SAD (sum of absolute differences) function working for 16x16 macroblocks.
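For anyone not familiar with it, the SAD of a macroblock is just the sum of |ref[i] - cur[i]| over its 256 pixels. A plain-C sketch of what the GPU version should eventually compute (simplified, not x264’s actual pixel_sad, and assuming the blocks are packed contiguously the same way I copy them to the GPU below) would be:

static unsigned int sad_16x16_c(const unsigned char *ref, const unsigned char *cur)
{
    unsigned int sum = 0;
    int i;
    /* one 16x16 block = 256 pixels, stored contiguously for this test */
    for (i = 0; i < 256; i++)
    {
        int d = ref[i] - cur[i];
        sum += (d < 0) ? -d : d;
    }
    return sum;
}

Here’s what I have on the CUDA side so far: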
CPP file:
void gpuSAD(unsigned char * auRef, unsigned char * auCur, unsigned int * auSAD, unsigned int uNumMacroblocks)
{
    CUdeviceptr dref;
    CUdeviceptr dmb;
    CUdeviceptr dSAD;
    unsigned int timer = 0;
    float timerval;

    cuMemAlloc(&dref, 256*uNumMacroblocks);
    cuMemAlloc(&dmb, 256*uNumMacroblocks);
    cuMemAlloc(&dSAD, 4*uNumMacroblocks);

    cuParamSeti(gpuSADfn, 0, dref);
    cuParamSeti(gpuSADfn, 4, dmb);
    cuParamSeti(gpuSADfn, 8, dSAD);
    cuParamSeti(gpuSADfn, 12, uNumMacroblocks);
    cuParamSetSize(gpuSADfn, 16);

    cutCreateTimer(&timer);

    printf("Copy %d ref blocks, and %d current blocks to GPU\n", uNumMacroblocks, uNumMacroblocks);
    cutStartTimer(timer);
    cuMemcpyHtoD(dref, auRef, 256*uNumMacroblocks);
    cuMemcpyHtoD(dmb, auCur, 256*uNumMacroblocks);
    timerval = cutGetTimerValue(timer);
    printf("Done in %f (ms)\n", timerval);
    printf("%f MB/sec\n", (256*(float)uNumMacroblocks*2/1024/1024*1000)/timerval);

    printf("Calculating %d SADs...\n", uNumMacroblocks);
    // warmup launch to remove overhead of 1st launch
    cuLaunch(gpuSADfn);
    cuCtxSynchronize();
    // time real launch
    cutResetTimer(timer);
    cuLaunch(gpuSADfn);
    cuCtxSynchronize();
    timerval = cutGetTimerValue(timer);
    printf("Done in %f (ms)\n", timerval);

    printf("Copy results to CPU\n");
    cutResetTimer(timer);
    cuMemcpyDtoH(auSAD, dSAD, 4*uNumMacroblocks);
    timerval = cutGetTimerValue(timer);
    printf("Done in %f (ms)\n", timerval);
    printf("%f MB/sec\n", (4*(float)uNumMacroblocks*1000)/timerval/1024/1024);

    CUT_SAFE_CALL(cutDeleteTimer(timer));
    CUDA_SAFE_CALL(cuMemFree(dref));
    CUDA_SAFE_CALL(cuMemFree(dmb));
    CUDA_SAFE_CALL(cuMemFree(dSAD));
}
CU file:
/* helper functions */
__device__ unsigned int singleSAD(unsigned char * ReferenceBlock, unsigned char * CurrentBlock, unsigned int height, unsigned int width)
{
    unsigned int i;
    unsigned int uSAD = 0;
    int a;
    for (i = 0; i < width*height; i++)
    {
        a = ReferenceBlock[i] - CurrentBlock[i];
        if (a < 0)
        {
            a = -a;
        }
        uSAD += a;
    }
    return uSAD;
}
extern "C"
__global__ void
gpuSAD( unsigned char* ReferenceBlocks, unsigned char* CurrentBlocks, unsigned int* Output, unsigned int NumberOfBlocks)
{
unsigned int i;
//unsigned int width = 16;
//unsigned int height = 16;
for(i = 0; i < NumberOfBlocks; i++)
{
Output[i] = 15;
//singleSAD( ReferenceBlocks+i*width*height,CurrentBlocks+i*width*height,width,height);
}
}
Note that all the kernel does right now is store 15 in every output slot, and even that doesn’t work: the output contains seemingly random values, as if there were a pointer problem somewhere. But all the code looks solid to me.
Am I doing something obviously wrong that I’m missing?
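One thing I haven’t done yet is check return codes; none of the cu* calls in my wrapper are error-checked. My next step is to wrap them in something like this (just a debugging sketch with a made-up macro name, not code from my tree) to see whether one of the driver API calls is silently failing:

/* hypothetical error-check wrapper for the driver API calls */
#define CU_CHECK(call) do { \
    CUresult e = (call); \
    if (e != CUDA_SUCCESS) \
        printf("CUDA error %d at %s:%d\n", (int)e, __FILE__, __LINE__); \
} while (0)

/* e.g. */
CU_CHECK(cuMemcpyHtoD(dref, auRef, 256*uNumMacroblocks));
CU_CHECK(cuLaunch(gpuSADfn));
CU_CHECK(cuCtxSynchronize());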