Help a newbie with CUDA development!

I’m an x264 developer and I’m beginning the long and dreadful process of porting x264’s motion estimation functions to CUDA. To begin with, I need to get a SAD (sum of absolute differences) function working. Here’s what I have now:

CPP file:

void gpuSAD(unsigned char * auRef, unsigned char * auCur, unsigned int * auSAD, unsigned int uNumMacroblocks)
{
    CUdeviceptr dref;
    CUdeviceptr dmb;
    CUdeviceptr dSAD;
    unsigned int timer = 0;
    float timerval;

    cuMemAlloc(&dref, 256*uNumMacroblocks);
    cuMemAlloc(&dmb,  256*uNumMacroblocks);
    cuMemAlloc(&dSAD, 4*uNumMacroblocks);

    cuParamSeti(gpuSADfn, 0, dref);
    cuParamSeti(gpuSADfn, 4, dmb);
    cuParamSeti(gpuSADfn, 8, dSAD);
    cuParamSeti(gpuSADfn, 12, uNumMacroblocks);
    cuParamSetSize(gpuSADfn, 16);

    cutCreateTimer(&timer);

    printf("Copy %d ref blocks, and %d current blocks to GPU\n", uNumMacroblocks, uNumMacroblocks);
    cutStartTimer(timer);
    cuMemcpyHtoD(dref, auRef, 256*uNumMacroblocks);
    cuMemcpyHtoD(dmb, auCur, 256*uNumMacroblocks);
    timerval = cutGetTimerValue(timer);
    printf("Done in %f (ms)\n", timerval);
    printf("%f MB/sec\n", (256*(float)uNumMacroblocks*2/1024/1024*1000)/timerval);

    printf("Calculating %d SADs...\n", uNumMacroblocks);
    // warmup launch to remove overhead of 1st launch
    cuLaunch(gpuSADfn);
    cuCtxSynchronize();

    // time real launch
    cutResetTimer(timer);
    cuLaunch(gpuSADfn);
    cuCtxSynchronize();
    timerval = cutGetTimerValue(timer);
    printf("Done in %f (ms)\n", timerval);

    printf("Copy results to CPU\n");
    cutResetTimer(timer);
    cuMemcpyDtoH(auSAD, dSAD, 4*uNumMacroblocks);
    timerval = cutGetTimerValue(timer);
    printf("Done in %f (ms)\n", timerval);
    printf("%f MB/sec\n", (4*(float)uNumMacroblocks*1000)/timerval/1024/1024);

    CUT_SAFE_CALL(cutDeleteTimer(timer));
    CUDA_SAFE_CALL(cuMemFree(dref));
    CUDA_SAFE_CALL(cuMemFree(dmb));
    CUDA_SAFE_CALL(cuMemFree(dSAD));
}

CU file:

/* helper functions */
__device__ unsigned int singleSAD(unsigned char * ReferenceBlock, unsigned char * CurrentBlock, unsigned int height, unsigned int width)
{
    unsigned int i;
    unsigned int uSAD = 0;
    int a;

    for (i = 0; i < width*height; i++)
    {
        a = ReferenceBlock[i] - CurrentBlock[i];
        if (a < 0)
        {
            a = -a;
        }
        uSAD += a;
    }
    return uSAD;
}

extern "C"
__global__ void
gpuSAD( unsigned char* ReferenceBlocks, unsigned char* CurrentBlocks, unsigned int* Output, unsigned int NumberOfBlocks)
{
    unsigned int i;
    //unsigned int width = 16;
    //unsigned int height = 16;

    for (i = 0; i < NumberOfBlocks; i++)
    {
        Output[i] = 15;
        //singleSAD( ReferenceBlocks+i*width*height, CurrentBlocks+i*width*height, width, height);
    }
}

Note that all it does is store 15 in the output, yet it doesn’t work! The output contains seemingly random values, as if there were a pointer problem… but all the code looks solid to me.

Am I doing something obviously wrong that I’m missing?

I’m not familiar with the driver API, but you launch your kernel with cuLaunch, which invokes the kernel on a 1x1 grid, that is, with only one block. The documentation says that the number of threads within this block must be specified by a previous call to cuFuncSetBlockShape, and you don’t call that function. I don’t know exactly how CUDA behaves in that situation, but you certainly shouldn’t rely on it; it’s quite possible your kernel never even starts.

First you need to call cuFuncSetBlockShape to define the number of threads per block, and you’ll probably want to use cuLaunchGrid to invoke the kernel with several blocks, which is where all of CUDA’s power comes from. Do you understand CUDA’s thread/block model? Have a look at the documentation and the examples in the SDK.
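For example, a minimal sketch (the 64-thread block shape here is just an illustrative choice, not something taken from your code):

cuFuncSetBlockShape(gpuSADfn, 64, 1, 1);     // e.g. 64 threads per block (illustrative)
cuLaunchGrid(gpuSADfn, uNumMacroblocks, 1);  // a 1-D grid with one block per macroblock
cuCtxSynchronize();                          // wait for the kernel to finish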

I know the point of CUDA, but I want to make it work on one block before I go for multiple blocks.

I tried setting cuFuncSetBlockShape but it didn’t help…

Have you tried checking error codes? Or at least reading the Programming Manual and looking at the matrixMulDrv SDK sample?

  1. cuInit() is missing in your code.
  2. cuLaunch() accepts a CUfunction, not a C function pointer. You obtain the CUfunction by loading the module with cuModuleLoad*() and calling cuModuleGetFunction() on that module.
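Roughly, the sequence being described looks like this (a sketch only, using the module and kernel names from this thread):

cuInit(0);                                    // must precede any other driver API call
CUdevice dev;    cuDeviceGet(&dev, 0);
CUcontext ctx;   cuCtxCreate(&ctx, 0, dev);   // a context is needed before loading modules
CUmodule mod;    cuModuleLoad(&mod, "gpuSAD.cubin");
CUfunction fn;   cuModuleGetFunction(&fn, mod, "gpuSAD");  // fn is what cuLaunch expects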

Sorry for neglecting to mention that; it is done elsewhere in the code and has been tested (no error codes returned).

I’ll post the rest of the code:

main.cpp:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <gpuSAD.h>

void cpuSAD(unsigned char * auRef, unsigned char * auCur, unsigned int * auSAD, unsigned int uNumMacroblocks)
{
    unsigned int i, m;
    unsigned int uSAD = 0;
    int a;

    for (m = 0; m < uNumMacroblocks; m++)
    {
        for (i = 0; i < 256; i++)
        {
            a = auRef[(m<<8) + i] - auCur[(m<<8) + i];
            if (a < 0)
            {
                a = -a;
            }
            uSAD += a;
        }
        auSAD[m] = uSAD;
        uSAD = 0;
    }
}

#define NUM_MACROBLOCKS 10 // 1080p

int main(int argc, char** argv)
{
    unsigned char *auRef;
    unsigned char *auCur;
    unsigned int *auGPUSAD;
    unsigned int *auCPUSAD;
    int i;
    int iRetVal;

    argc = argc;
    gpuInit(argv);

#if defined NON_PINNED
    auRef    = (unsigned char *)malloc(NUM_MACROBLOCKS*256);
    auCur    = (unsigned char *)malloc(NUM_MACROBLOCKS*256);
    auGPUSAD = (unsigned int *)malloc(NUM_MACROBLOCKS*4);
    auCPUSAD = (unsigned int *)malloc(NUM_MACROBLOCKS*4);
#else
    iRetVal = cuMemAllocHost((void **)&auRef,    NUM_MACROBLOCKS*256);
    iRetVal = cuMemAllocHost((void **)&auCur,    NUM_MACROBLOCKS*256);
    iRetVal = cuMemAllocHost((void **)&auGPUSAD, NUM_MACROBLOCKS*4);
    iRetVal = cuMemAllocHost((void **)&auCPUSAD, NUM_MACROBLOCKS*4);
#endif

    for (i = 0; i < NUM_MACROBLOCKS*256; i++)
    {
        auRef[i] = i;
        auCur[i] = 256 - (i % 256);
    }

    gpuSAD(auRef, auCur, auGPUSAD, NUM_MACROBLOCKS);
    printf("Testing results...\n");
    cpuSAD(auRef, auCur, auCPUSAD, NUM_MACROBLOCKS);

    for (i = 0; i < NUM_MACROBLOCKS; i++)
    {
        if (auGPUSAD[i] != auCPUSAD[i])
        {
            printf("ERROR CPU SAD (%d) doesn't match GPU SAD (%d) \n", auCPUSAD[i], auGPUSAD[i]);
        }
    }
    printf("DONE!\n");

#if defined NON_PINNED
    free(auRef);
    free(auCur);
    free(auGPUSAD);
    free(auCPUSAD);
#else
    cuMemFreeHost(auRef);
    cuMemFreeHost(auCur);
    cuMemFreeHost(auGPUSAD);
    cuMemFreeHost(auCPUSAD);
#endif
}

and the rest of gpuSAD.cpp:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cutil.h>

CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction gpuSADfn;

static CUresult
initCUDA(char* executablePath, CUfunction *gpuSAD)
{
    CUfunction cuFunction = 0;
    char* module_path;

    CUT_DEVICE_INIT_DRV(cuDevice);

    CUresult status = cuCtxCreate(&cuContext, 0, cuDevice);
    if (CUDA_SUCCESS != status)
    {
        printf("Failed to create context.  Error code: %d\n", status);
        goto Error;
    }

    module_path = cutFindFilePath("gpuSAD.cubin", executablePath);
    if (module_path == 0) {
        status = CUDA_ERROR_NOT_FOUND;
        printf("Failed to find module path.  Error code: %d\n", status);
        goto Error;
    }

    status = cuModuleLoad(&cuModule, module_path);
    cutFree(module_path);
    if (CUDA_SUCCESS != status) {
        printf("Failed to load CUDA module.  Error code: %d\n", status);
        goto Error;
    }

    status = cuModuleGetFunction(&cuFunction, cuModule, "gpuSAD");
    if (CUDA_SUCCESS != status)
    {
        printf("Failed to load gpuSAD.  Error code: %d\n", status);
        goto Error;
    }

    *gpuSAD = cuFunction;
    return CUDA_SUCCESS;

Error:
    cuCtxDetach(cuContext);
    return status;
}

void gpuInit(char **argv)
{
    if (initCUDA(argv[0], &gpuSADfn) != CUDA_SUCCESS)
    {
        printf("initCUDA failed.  Terminating.\n");
        exit(0);
    }
}

This is wrong: CUDA treats kernel pointer arguments as 64-bit, not 32-bit, so you need to advance the parameter offset by 8 bytes per pointer instead of 4, and the total parameter size becomes 28.
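A rough sketch of what the corrected parameter setup could look like, using cuParamSetv for the pointer arguments in the style of the matrixMulDrv sample (treat the details as illustrative rather than the exact fix):

int offset = 0;
void *pRef = (void *)(size_t)dref;
cuParamSetv(gpuSADfn, offset, &pRef, sizeof(pRef)); offset += sizeof(pRef);  // 8 bytes
void *pMb  = (void *)(size_t)dmb;
cuParamSetv(gpuSADfn, offset, &pMb,  sizeof(pMb));  offset += sizeof(pMb);   // 8 bytes
void *pSAD = (void *)(size_t)dSAD;
cuParamSetv(gpuSADfn, offset, &pSAD, sizeof(pSAD)); offset += sizeof(pSAD);  // 8 bytes
cuParamSeti(gpuSADfn, offset, uNumMacroblocks);     offset += 4;
cuParamSetSize(gpuSADfn, offset);                   // 3*8 + 4 = 28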

And it works! Thanks a bunch.

Now to try out the actual SAD function.

Edit: And that works too… now the next step is to optimize it (right now it’s, obviously, pathetically slow).

What’s a good way to take advantage of the SIMD aspect (on each individual chip) for a sum of absolute differences operation on 16x16, 8x16, 16x8, etc. blocks?

The biggest thing you need to do to get good performance is to coalesce the memory reads. Almost nothing else matters with such a simple kernel.

I believe CUDA 1.1 added some hardware intrinsics for computing SAD (at least I remember seeing them in the manual). You can try them out, but it is likely that the small amount of computation you do is completely hidden by the memory latency.

I also see that you are summing values up. Have a look at the reduction example in the SDK to see how to do this efficiently in parallel. Ideally, you should have one thread per array element in your source image to take maximum advantage of the parallelism. However, due to the coalescing constraints (only 32-, 64-, and 128-bit reads can be coalesced), you may need to handle 4 elements (one 32-bit read) per thread. You can try 64-bit and 128-bit reads, but in every test I’ve done, 32-bit coalesced reads always win.
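As a rough illustration of the reduction pattern (not the SDK code itself; the kernel and array names are made up, and it assumes one partial sum per thread with a power-of-two block size):

__global__ void reduceSums(unsigned int *partial, unsigned int *result)
{
    __shared__ unsigned int s[256];                 // assumes blockDim.x == 256
    unsigned int x = threadIdx.x;
    s[x] = partial[blockIdx.x * blockDim.x + x];
    __syncthreads();

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (x < stride)
            s[x] += s[x + stride];                  // each pass halves the active threads
        __syncthreads();
    }
    if (x == 0)
        result[blockIdx.x] = s[0];                  // one sum per block
}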

Hopefully that gives you some ideas. If you have any questions on my ramblings, just ask.

How do I coalesce the reads?

Here’s my updated code (note that I’m now using cuFuncSetBlockShape(gpuSADfn, 32, 1, 1) and cuLaunchGrid(gpuSADfn, uNumMacroblocks, 1)):

/* helper functions */
__device__ inline int abs(int x)
{
    return (x > 0) ? x : -x;
}

__device__ unsigned int singleSAD(unsigned char * ReferenceBlock, unsigned char * CurrentBlock, unsigned int height, unsigned int width)
{
    unsigned int i;
    unsigned int uSAD = 0;
    int a[8];

    for (i = 0; i < width*height; i += 8)
    {
        a[0] = abs(ReferenceBlock[i+0] - CurrentBlock[i+0]);
        a[1] = abs(ReferenceBlock[i+1] - CurrentBlock[i+1]);
        a[2] = abs(ReferenceBlock[i+2] - CurrentBlock[i+2]);
        a[3] = abs(ReferenceBlock[i+3] - CurrentBlock[i+3]);
        a[4] = abs(ReferenceBlock[i+4] - CurrentBlock[i+4]);
        a[5] = abs(ReferenceBlock[i+5] - CurrentBlock[i+5]);
        a[6] = abs(ReferenceBlock[i+6] - CurrentBlock[i+6]);
        a[7] = abs(ReferenceBlock[i+7] - CurrentBlock[i+7]);
        uSAD += a[0]; uSAD += a[1]; uSAD += a[2]; uSAD += a[3];
        uSAD += a[4]; uSAD += a[5]; uSAD += a[6]; uSAD += a[7];
    }
    return uSAD;
}

extern "C"
__global__ void
gpuSAD( unsigned char* ReferenceBlocks, unsigned char* CurrentBlocks, unsigned int* Output )
{
    unsigned int i;
    unsigned int width = 16;
    unsigned int height = 16;

    //for(i = threadIdx.x; i < threadIdx.x + blockDim.x; i++)
    //{
    i = threadIdx.x;
    Output[i] = singleSAD( ReferenceBlocks+i*width*height, CurrentBlocks+i*width*height, width, height);
    //}
}

The original code took 720ms for 8160 SADs (absurd), while this takes 72ms (still ridiculous). A reference binary I have takes 0.3 milliseconds or so, so I’m guessing memory latency is the main issue.

Am I right to unroll into those 8 copies of the same operation in order to get the compiler to SIMD them? As I said, I’m not exactly sure how CUDA’s SIMD works.

Unfortunately, global reads of unsigned char cannot be coalesced; the words being read must be at least 32 bits. There are two possible solutions: read the unsigned chars through a texture (a la sobelFilter), or stage the data into shared memory as unsigned ints and then read the data out of shared memory as bytes.

Staging into shared memory is likely to have better performance, provided you can conform to the coalescing constraints when reading the data as 32-bit words. Refer to the sobelFilter app in the SDK: the shared memory is declared as an unsized array of unsigned chars, LocalBlock:

extern __shared__ unsigned char LocalBlock[];

If you declare another variable:

extern __shared__ unsigned int LocalBlockUI[];

then LocalBlockUI aliases LocalBlock but can be addressed as an array of unsigned int.

Do the coalesced global reads writing into LocalBlockUI, and process the bytes from LocalBlock using the 4x unrolled structure of sobelFilter to avoid shared memory bank conflicts.

Needless to say, this method gets a lot trickier if the kernel needs any other data to reside in shared memory.
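Roughly, the staging step might look like this (a sketch only: stageBlock is a made-up helper name, the pointer cast is equivalent to the two extern declarations above, and the 64-words-per-block figure assumes 16x16-byte blocks with 64 threads per block):

extern __shared__ unsigned char LocalBlock[];

__device__ void stageBlock(const unsigned char *ReferenceBlocks)
{
    unsigned int *LocalBlockUI = (unsigned int *)LocalBlock;          // same storage, viewed as 32-bit words
    const unsigned int *src = (const unsigned int *)ReferenceBlocks;  // global memory read as 32-bit words

    // One 16x16 block is 64 words; with 64 threads per block each thread loads
    // exactly one word, and consecutive threads touch consecutive words,
    // which is what coalescing requires.
    LocalBlockUI[threadIdx.x] = src[blockIdx.x * 64 + threadIdx.x];
    __syncthreads();
    // ...now pick individual bytes out of LocalBlock[] without touching global memory.
}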

Check whether you’re improving coalescing by configuring the profiler to monitor “gld_incoherent” and “gld_coherent” events.

You definitely want the SAD intrinsics from CUDA 1.1. From the programming guide:

__[u]sad(x,y,z) (Sum of Absolute Difference) returns the sum of integer parameter z and the absolute value of the difference between integer parameters x and y.
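So the inner accumulation can collapse to one instruction per byte, something like this (refByte/curByte stand in for whatever byte extraction you use):

uSAD = __usad(refByte, curByte, uSAD);   // uSAD += |refByte - curByte|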

Done (32-bit reads used instead of 8-bit): another factor of 4 speedup or so, but still vastly slower than the reference binary I was given. Is the cache on each of the multiprocessors automatic, or do I have to explicitly put data into it?

Here’s my function now:

unsigned int *ReferenceBlockInt = (unsigned int *)ReferenceBlock;
unsigned int *CurrentBlockInt = (unsigned int *)CurrentBlock;
unsigned int i;
unsigned int uSAD = 0;

for (i = 0; i < width*height/4; i++)
{
    unsigned int ref = ReferenceBlockInt[i];
    unsigned int cur = CurrentBlockInt[i];
    uSAD = __usad(ref>>24, cur>>24, uSAD);
    ref = ref << 8; cur = cur << 8;
    uSAD = __usad(ref>>24, cur>>24, uSAD);
    ref = ref << 8; cur = cur << 8;
    uSAD = __usad(ref>>24, cur>>24, uSAD);
    ref = ref << 8; cur = cur << 8;
    uSAD = __usad(ref>>24, cur>>24, uSAD);
}
return uSAD;

What occupancy does your kernel get? What is the block shape? How large is your grid? It’s very important to make good use of the GPU’s parallelism.

Also, there is no automatic cache (except for constants). You need to ‘cache’ things yourself by storing them in shared memory.

I tried the following:

__shared__ unsigned char RBlock[256];
__shared__ unsigned char CBlock[256];

for (i = 0; i < 256;)
{
    unsigned int ref = ReferenceBlockInt[i>>2];
    unsigned int cur = CurrentBlockInt[i>>2];
    RBlock[i] = (unsigned char)(ref>>24); CBlock[i] = (unsigned char)(cur>>24);
    ref = ref << 8; cur = cur << 8; i++;
    RBlock[i] = (unsigned char)(ref>>24); CBlock[i] = (unsigned char)(cur>>24);
    ref = ref << 8; cur = cur << 8; i++;
    RBlock[i] = (unsigned char)(ref>>24); CBlock[i] = (unsigned char)(cur>>24);
    ref = ref << 8; cur = cur << 8; i++;
    RBlock[i] = (unsigned char)(ref>>24); CBlock[i] = (unsigned char)(cur>>24);
    i++;
}

But there was no performance boost at all from this preloading, which again suggests that no coalescing is going on. Syncing the threads changes nothing. It seems that regardless of where I put the reads, the same amount of reading has to be done, so there is no real performance gain.

There is something else that is confusing me greatly, also:

SobelFilter: extern __shared__ unsigned char LocalBlock[];

When I try to use extern, I get: “local and shared variables cannot have external linkage”

How is SobelFilter doing something that NVCC tells me is impossible?

The extern declaration only works for unsized arrays, maybe that is your problem.

Coalescing will also only occur if adjacent threads access adjacent memory. For loops should generally be initialized with a thread-index-derived value (e.g. threadIdx.x) and incremented by a value derived from the block and/or grid size (e.g. blockDim.x), as in the sketch below.
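Concretely, the loop shape being suggested is something like this (a sketch; wordsPerBlock is just an illustrative name for the number of 32-bit words each block stages):

// thread threadIdx.x starts at element threadIdx.x and strides by the block size,
// so within each iteration consecutive threads read consecutive words
for (unsigned int i = threadIdx.x; i < wordsPerBlock; i += blockDim.x)
    RBlock[i] = ReferenceBlockInt[blockIdx.x * wordsPerBlock + i];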

It will be easier if you modify an SDK sample that already does something along the lines of what you want to do. sobelFilter is a good place to start. The good news is your app doesn’t have overfetch to compute its results, so you should be able to tighten up the sobelFilter code quite a bit. If necessary, start with the simple sobelTex kernel as opposed to the optimized SobelShared kernel.

Heh, apparently the test cases I was given (which have now been rewritten) used input blocks that were all exactly the same, which hid the fact that my code was completely broken. I fixed it and I’m now down to 5.8 milliseconds… now time to implement the coalesced reads.

I can’t seem to get it to coalesce the reads or even correctly read the data when the reads are coalesced.

Here’s the new code; notice the loadSADs function. It’s built around a block width of 8 this time, just for testing purposes.

/* helper functions */
__shared__ unsigned int RBlock[64*8];
__shared__ unsigned int CBlock[64*8];

__device__ unsigned int singleSAD(unsigned int offset)
{
    unsigned int uSAD = 0;
    unsigned int i;

    for (i = offset*64; i < offset*64+64; i++)
    {
        unsigned int ref = RBlock[i];
        unsigned int cur = CBlock[i];
        uSAD = __usad(ref>>24, cur>>24, uSAD);
        ref = ref << 8; cur = cur << 8;
        uSAD = __usad(ref>>24, cur>>24, uSAD);
        ref = ref << 8; cur = cur << 8;
        uSAD = __usad(ref>>24, cur>>24, uSAD);
        ref = ref << 8; cur = cur << 8;
        uSAD = __usad(ref>>24, cur>>24, uSAD);
    }
    return uSAD;
}

__device__ void loadSADs(unsigned char* ReferenceBlocks, unsigned char* CurrentBlocks, unsigned int offset)
{
    unsigned int *ReferenceBlockInt = (unsigned int *)ReferenceBlocks;
    unsigned int *CurrentBlockInt = (unsigned int *)CurrentBlocks;
    unsigned int i, j;
    unsigned int start = (blockIdx.x - offset);
    unsigned int finish = (start + blockDim.x);
    start *= 64; finish *= 64;

    __syncthreads();
    for (i = start+offset, j = offset; i < finish; i += blockDim.x, j += blockDim.x)
        RBlock[j] = ReferenceBlockInt[i];
    for (i = start+offset, j = offset; i < finish; i += blockDim.x, j += blockDim.x)
        CBlock[j] = CurrentBlockInt[i];
    __syncthreads();
}

extern "C"
__global__ void
gpuSAD( unsigned char* ReferenceBlocks, unsigned char* CurrentBlocks, unsigned int* Output )
{
    unsigned int i, offset;
    i = blockIdx.x;
    offset = (blockIdx.x % blockDim.x);
    loadSADs(ReferenceBlocks, CurrentBlocks, offset);
    Output[i] = singleSAD(offset);
}

That original code has every thread load an entire reference block into shared memory: if your blockDim is (64,1), then 64 threads are each loading a whole block. You could have the function only do the load for (threadIdx.x == 0), but with a single thread loading all of RBlock and CBlock you won’t be able to get coalesced reads.

Instead, each thread should load 4, 8, or 16 bytes into shared memory (RBlock, CBlock). For coalesced reads, each warp (32 threads) will need to follow these memory addressing guidelines:

  • Thread N accesses BaseAddress + Size * N

  • Size = 4, 8, or 16 bytes of memory read or written per thread

  • BaseAddress must be aligned to 16 * Size

For this code sample, I assume the thread block dimensions are declared as dim3 threads(64,1).

I’m not sure the newer code below performs exactly the right memory addressing for the reads. Instead of using for loops, we prefer to have each thread in the thread block perform one read; that is what gets us the coalesced reads. With that change, it now takes 2 warps to read the 64 elements of one block into shared memory. The code might look like this:

__device__ void loadSADs(unsigned char* ReferenceBlocks, unsigned char* CurrentBlocks, unsigned int offset)
{
    unsigned int *ReferenceBlockInt = (unsigned int *)ReferenceBlocks;
    unsigned int *CurrentBlockInt = (unsigned int *)CurrentBlocks;
    unsigned int start = (blockIdx.x - offset);
    unsigned int finish = (start + blockDim.x);
    unsigned int i = start + offset;
    unsigned int j = offset;

    RBlock[j*blockDim.x + threadIdx.x] = ReferenceBlockInt[i*blockDim.x + threadIdx.x];
    CBlock[j*blockDim.x + threadIdx.x] = CurrentBlockInt[i*blockDim.x + threadIdx.x];
    __syncthreads();
}

For singleSAD, the for loop is removed: two warps now perform the first SAD pass, and then we iteratively reduce the uSAD values until only one final result remains.

__device__ unsigned int singleSAD(unsigned int offset)
{
    __shared__ unsigned int uSAD[64];
    unsigned int i = offset*blockDim.x;
    unsigned int x = threadIdx.x;
    unsigned int ref = RBlock[i + threadIdx.x];
    unsigned int cur = CBlock[i + threadIdx.x];

    // First initialize all elements to 0
    if (threadIdx.x < 64) {
        uSAD[threadIdx.x] = 0;
    }
    __syncthreads();

    // Each thread performs a SAD operation on the 4 bytes of its word
    uSAD[x] = __usad(ref>>24, cur>>24, uSAD[x]);
    ref = ref << 8; cur = cur << 8;
    uSAD[x] = __usad(ref>>24, cur>>24, uSAD[x]);
    ref = ref << 8; cur = cur << 8;
    uSAD[x] = __usad(ref>>24, cur>>24, uSAD[x]);
    ref = ref << 8; cur = cur << 8;
    uSAD[x] = __usad(ref>>24, cur>>24, uSAD[x]);

    // Now reduce iteratively to get one SAD result
    for (int j = blockDim.x/2; j > 0; j >>= 1)
    {
        __syncthreads();
        // Each iteration sums the upper half of the active range into the lower half;
        // only the lower half of the threads may write, or we read past the array
        if (x < j)
            uSAD[x] += uSAD[j + x];
    }
    __syncthreads();
    return uSAD[0];
}