Simple/1st CUDA program: Reverse bits in a byte. Why is it faster on the CPU?

So I wrote my first CUDA app. I’m not from a heavy math background, and three dimensions tend to confuse me. I’m more of a bit-stream man, so I wrote an app that reads bytes from stdin, reverses the bits in each byte (an “r8” in certain parlance), and writes the results to stdout.

The results of the program are correct. (Always a good starting point!) But this exercise has left me with two questions.

  1. Why is the CUDA implementation I’ve written slower than an iterative CPU-only implementation?

  2. Am I doing anything wrong in this app? In particular, am I missing some form of synchronization to make sure the copies and the kernel call happen in the correct order?

Code is attached in its entirety (with a .txt extension), but here are the important parts:

main: a 2 KB buffer (arbitrary size). Everything is one-dimensional, using as many threads per block as possible; with the 2 KB buffer, that means a single grid of 4 blocks with 512 threads in each block. Read into the host buffer, transfer to the device buffer, execute r8(), transfer the results back, write them out, and repeat.

int main(int argc, const char* argv[])
{
	char host_buf[2048] = { '\0' };
	void *dev_buf = NULL;
	size_t nb = 0;
	cudaError_t cu_error = cudaSuccess;
	int dev_no = 0;
	struct cudaDeviceProp props;
	dim3 dimBlock( sizeof(host_buf) );
	dim3 dimGrid(1);

	/* .. *SNIP* .. */

	// If our max threads/block is less than our buffer size,
	// adjust.
	if (props.maxThreadsPerBlock < dimBlock.x)
	{
		// Make a block as wide as we can.
		dimBlock.x = props.maxThreadsPerBlock;
		assert(dimBlock.x > 0);
	}

	// Allocate a buffer in device memory.
	cu_error = cudaMalloc(&dev_buf, sizeof(host_buf));

	/* .. *SNIP* .. */

	do
	{
		// Read some data in.
		size_t nb_written = 0;
		nb = fread(host_buf, 1, sizeof(host_buf), stdin);
		if (nb > 0)
		{
			// Copy that data to the card.
			cudaMemcpy(
				dev_buf,
				host_buf,
				nb,
				cudaMemcpyHostToDevice);

			// How many blocks is that?
			dimGrid.x  = nb + dimBlock.x - 1;
			dimGrid.x /= dimBlock.x;

			// Tell the card to process that data.
			r8<<<dimGrid, dimBlock>>>(dev_buf, nb);

			// Copy results back.
			cudaMemcpy(
				host_buf,
				dev_buf,
				nb,
				cudaMemcpyDeviceToHost);

			// Write the results to stdout.
			nb_written = fwrite(host_buf, 1, nb, stdout);

			/* .. *SNIP* .. */
		}
	} while (nb != 0);

	/* .. *SNIP* .. */

	return 0;
}
r8: Which index in the 1D array am I responsible for? Read the byte at that index from global device memory into a local variable (a register, I assume), use it to build a reversed version of the byte, write the result back to global device memory, and return.

__global__ void r8(void *buf_void, size_t len)
{
	// What's the index I'm in charge of?
	char x = 0;
	int c = 0;
	char b;
	char *buf = (char*)buf_void;

	// Compute the index.  Since we only use one dimension of
	// the grid, this makes life simpler.  All we care about
	// is the X dimension.  So it's the blockIdx * the width
	// of a block + the threadIdx.  All other dimensions fall
	// out.
	unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Don't run off the end: the last block may have more
	// threads than there are bytes left in the buffer.
	if (idx >= len)
		return;

	// Read the original byte.
	b = buf[idx];

	// Build reverse.
	for (c = 0; c < 8; ++c)
	{
		// If bit is set, set mirror bit.
		if ( b & (1 << c) )
			x |= ( 1 << (7-c) );
	}

	// Write back result.
	buf[idx] = x;
}

This app runs in about 0.225 seconds with CUDA. A CPU-only implementation that does the same reverse building runs in 0.002 seconds.

What am I doing stupid?
r8.cu.txt (2.98 KB)

Okay, I did a little more testing and I should qualify:

The previously mentioned times were measured using the nvcc-generated executable itself as the input, with -O3 as the only option.

I’m using a GeForce 8800 GTX.

Running the test on a 2 MB sample file I get 0.270 seconds for CUDA and 0.137 seconds for CPU.

On a 20 MB sample file I get 0.779 seconds for CUDA and 1.335 seconds for CPU.

On a 200 MB sample file I get 6.019 seconds for CUDA and 13.532 seconds for CPU.

Buffers are 2 KB for all tests.

So CUDA is faster once the device initialization is taken into account? Is there a ~0.200 second overhead to initializing CUDA?
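
One way I could check would be to time a do-nothing CUDA call before the main loop. A minimal sketch (my guess at how to isolate it; cudaFree(0) does nothing useful, but it forces the driver/context initialization to happen):

#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

int main(void)
{
	// Time the very first CUDA call, which pays the one-time
	// driver/context initialization cost.
	clock_t t0 = clock();
	cudaFree(0); // no-op, but forces initialization
	clock_t t1 = clock();
	printf("CUDA init: %.3f s\n", (double)(t1 - t0) / CLOCKS_PER_SEC);
	return 0;
}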

Several people have observed that coalesced memory reads involving 8-bit chars do not go as fast as 32 or 64 bit reads. Try modifying your program so each thread reads 32-bit ints from the buffer, splits them into 4 x 8 bit bytes, does the reversal, packs them back together and writes them out as 32-bit ints. I’ve had to do this when reading large lists of 16-bit shorts to get good performance.
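
For example, a sketch of that idea might look like the following (a minimal, untested sketch; rev8 and r8w are names I made up, and the buffer is assumed to be padded to a whole number of 32-bit words):

// Reverse the bits of one byte (same loop as the original r8).
__device__ unsigned char rev8(unsigned char b)
{
	unsigned char x = 0;
	for (int c = 0; c < 8; ++c)
		if (b & (1 << c))
			x |= 1 << (7 - c);
	return x;
}

// One thread per 32-bit word: coalesced read, per-byte reverse,
// coalesced write.  Each byte stays in its own lane, so byte
// order in the stream is preserved.
__global__ void r8w(unsigned int *buf, size_t words)
{
	unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= words)
		return;

	unsigned int w = buf[idx];

	// Split into 4 bytes, reverse each, pack them back together.
	unsigned int r = 0;
	for (int i = 0; i < 4; ++i)
		r |= (unsigned int)rev8((w >> (8 * i)) & 0xFF) << (8 * i);

	buf[idx] = r;
}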

It doesn’t even coalesce; if you read the manual carefully, you’ll see that only 32-, 64-, and 128-bit reads are coalesced.

So:

  • Read and write 32-bit quantities. Reading and writing this structure might help you do so:

struct __align__(4) u8_4 { // 32 bit
    uint8_t a, b, c, d;
};
  • Use something more intelligent to reverse the bits in your byte, as this implementation is very inefficient; at the least, unroll the loop, but there are some methods (AFAIK) which take far fewer operations. Google for it.

One more thing you should be aware of is that calculating something on the GPU is efficient if the math/IO ratio is high, i.e. if you do a lot of computation but not much memory reading/writing. As this ratio increases, the GPU becomes faster and faster compared to the CPU.

Also, the GPU really does not like divergent branching (which is your case here). You should find another way to reverse bits, preferably without loops and ifs. Consider something like this:

c = (( c & 0x55 ) << 1) | (( c & 0xAA ) >> 1 );
c = (( c & 0x33 ) << 2) | (( c & 0xCC ) >> 2 );
c = (( c & 0x0F ) << 4) | (( c & 0xF0 ) >> 4 );

This should be faster (although you may not see an actual performance gain, because memory reading/writing is really slow).
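
As a side note (my own combination of this with the 32-bit-read advice above, not something from the original post): the same three mask-and-shift steps also work on a whole 32-bit word at once, reversing the bits of all four bytes in parallel, since none of the shifts crosses a byte boundary:

__device__ unsigned int rev8x4(unsigned int w)
{
	// Swap adjacent bits, then bit pairs, then nibbles.
	// Each byte's bits stay inside that byte.
	w = ((w & 0x55555555u) << 1) | ((w & 0xAAAAAAAAu) >> 1);
	w = ((w & 0x33333333u) << 2) | ((w & 0xCCCCCCCCu) >> 2);
	w = ((w & 0x0F0F0F0Fu) << 4) | ((w & 0xF0F0F0F0u) >> 4);
	return w;
}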

Oh, I know there are faster ways to code the actual r8, but I was sticking with “readable” code before making such a simple operation faster.

unsigned char b; // reverse this byte
b = ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;

From http://graphics.stanford.edu/~seander/bith…eByteWith64Bits, this actually seems like it would be the fastest on CUDA (4 instructions; there’s a 3-instruction version, but it does a modulo, so I’m assuming it would be slower).

But that form of optimization will help the CPU version as well as the CUDA version. I was more worried/curious about the fact that the CPU version was beating the CUDA version by two orders of magnitude. My second post seems to indicate that, given enough data, something amortizes that away and makes CUDA faster. Perhaps some initialization cost?

Anyway, the 8-bit reads/writes being bad is more what I was looking for. I’ll go back and see if 32-bit reads/writes help.

With such a simple kernel you don’t even have to worry about how many instructions you are using. The ONLY thing that matters is getting the most memory bandwidth you can, and doing that means getting the reads and writes coalesced. For every memory read you perform, you can interleave dozens of integer/floating-point operations within the memory latency.

The memory access pattern should always be your first optimization. Coalesced vs non-coalesced can mean performance differences around a factor of 20.

Now, about your “slowness”:

  1. The CUDA driver has a significant initialization time on the first call.

  2. The kernel has a significant initialization time on the first call.
    To benchmark the kernel properly, initialize, call the kernel once, then call it again a few dozen (or hundred) times to build up a nice average, and report the average execution time (see the timing sketch after this list). Or use CUDA_PROFILE=1 and examine cuda_profile.log. The profiler (in CUDA 1.1) can also tell you whether or not your reads are uncoalesced.

  3. Transferring memory to/from the GPU is slow. Very, very slow. Unless a single computation takes tens of seconds to minutes, I would say ANY operation that requires copy to GPU, compute, then copy back will always be similar in speed to the CPU. The objective becomes to push the entire algorithm onto the GPU so that copies are only needed at the beginning and end of a long calculation.

  4. There is still a small overhead (5–100 µs, depending on grid size) for each kernel launch. For very fast-executing kernels, this overhead can dominate.

  5. There is an “amortization”, as you say, for small kernel executions on the GPU. If you run 1 block, it executes in time A. On a GTX, you can run (depending on the grid/block size and registers) up to 128 blocks concurrently, meaning executing 128 blocks will still take time A! Only when you add the 129th block does your kernel take time 2A, because the last block must execute after all the others finish (I’m assuming equal execution time for each block). But you can go all the way to 256 without increasing your execution time above 2A… (Note that 128 is an example and will change; it is really the number of multiprocessors * the number of concurrent blocks per multiprocessor.)

Oh, and one more:
6) There is another “amortization”, as you put it, for larger kernels, related to the number of FLOPs (or integer ops) that can be performed. It is best illustrated by an example: I have two kernels in my project that perform the same number of memory reads, but the second performs twice as many floating-point operations. They run in exactly the same time, yet the first achieves only 20 GFLOP/s while the second reaches 40 GFLOP/s. This is because they have the same memory bandwidth and are limited by it. I could easily create another kernel that performed even more FLOPs and still ran in the same time. 120 GFLOP/s is easily achievable without having to tweak code at all. You only need to start optimizing instructions when you want to push closer to 170 (or 340 if you perform lots of MADs).
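
For point 2, the benchmarking loop might look something like this (a minimal sketch; RUNS is a placeholder, the variables come from the original main(), and cudaThreadSynchronize() is needed because kernel launches are asynchronous):

// Warm up: the first launch pays one-time initialization costs.
r8<<<dimGrid, dimBlock>>>(dev_buf, nb);
cudaThreadSynchronize();

// Average over many launches for a stable number.
const int RUNS = 100;
clock_t t0 = clock();
for (int i = 0; i < RUNS; ++i)
	r8<<<dimGrid, dimBlock>>>(dev_buf, nb);
cudaThreadSynchronize(); // wait for all launches to finish
clock_t t1 = clock();
printf("average kernel time: %.6f s\n",
       (double)(t1 - t0) / CLOCKS_PER_SEC / RUNS);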

Ahhh… thank you for the information, MisterAnderson42. Very helpful.

Playing with the profiler now…

Okay, after playing with the profiler a little, I have a few questions. Perhaps I should start another thread, but since I’m using this program, I’ll stay here for now.

So I modified my program to do r8s on each byte in an unsigned. Running my little program against a 2,000,000 byte file, I get lines like these in the profiler output:

method=[ memcopy ] gputime=[ 3.392 ]
method=[ _Z11r8_unsignedPvm ] gputime=[ 19.072 ] cputime=[ 36.000 ] occupancy=[ 0.667 ] gld_incoherent=[ 0 ] gld_coherent=[ 32 ] divergent_branch=[ 0 ] instructions=[ 2556 ]
method=[ memcopy ] gputime=[ 3.328 ]
method=[ memcopy ] gputime=[ 3.264 ]
method=[ _Z11r8_unsignedPvm ] gputime=[ 18.112 ] cputime=[ 30.000 ] occupancy=[ 0.667 ] gld_incoherent=[ 0 ] gld_coherent=[ 32 ] divergent_branch=[ 0 ] instructions=[ 0 ]
method=[ memcopy ] gputime=[ 3.360 ]
method=[ memcopy ] gputime=[ 3.264 ]
method=[ _Z11r8_unsignedPvm ] gputime=[ 18.528 ] cputime=[ 31.000 ] occupancy=[ 0.667 ] gld_incoherent=[ 0 ] gld_coherent=[ 0 ] divergent_branch=[ 0 ] instructions=[ 0 ]
method=[ memcopy ] gputime=[ 3.328 ]
method=[ memcopy ] gputime=[ 3.264 ]
method=[ _Z11r8_unsignedPvm ] gputime=[ 18.272 ] cputime=[ 30.000 ] occupancy=[ 0.667 ] gld_incoherent=[ 0 ] gld_coherent=[ 0 ] divergent_branch=[ 0 ] instructions=[ 0 ]
method=[ memcopy ] gputime=[ 3.328 ]

My questions are:

  1. The memcopy lines only have gputime. This is NOT the total transfer time for those copies, correct?
  2. The first kernel call makes sense to me. But the second one doesn’t. Why are there 0 instructions? How can there be 32 coherent loads when there aren’t any instructions?
  3. Similar to #2, what would be causing the other two kernel calls with all zeros?

When instructions=0, it seems that cta_launched=0 as well…

This is incorrect. I found a line where gld_coherent=32, instructions=0, but cta_launched=1.

I’m still confused by instructions=0…