#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <unistd.h>
#include <assert.h>
#include <stdint.h>

// Writes the program's usage text to stderr.
static void print_usage(void);
// CUDA kernel: reverses the bit order of bytes in buf, in place, one
// thread per byte.  main() passes the number of bytes read as len.
__global__ void r8(void *buf, size_t len);


// Emit the usage/help text for r8 on stderr.  Takes no arguments and
// returns nothing; the caller decides whether to exit afterwards.
void print_usage(void)
{
	static const char usage_text[] =
		"Usage: r8\n"
		"\n"
		"r8 reads bytes from stdin reverses the bits in that byte\n"
		"before writing it out to stdout.\n"
		"\n";

	// The text contains no conversion specifiers, so a plain
	// string write produces the same bytes as a formatted one.
	fputs(usage_text, stderr);
}


// r8 - reverse the bit order of every byte in a device buffer, in place.
//
// Parameters:
//   buf_void  device pointer to the bytes to process.
//   len       number of valid bytes in the buffer.
//
// Launch layout: a one-dimensional grid of one-dimensional blocks,
// one thread per byte.  The grid may be rounded up past len; the
// surplus threads return without touching memory.  No shared memory
// is used.
__global__ void r8(void *buf_void, size_t len)
{
	// Work on unsigned bytes so the bit tests and the final store
	// do not depend on whether plain char is signed.
	unsigned char *buf = (unsigned char*)buf_void;
	unsigned char x = 0;
	unsigned char b;
	int c;

	// Compute the index.  Only the X dimension of the grid is used,
	// so it's blockIdx * the width of a block + threadIdx.  Widen
	// before multiplying so the product cannot wrap in 32 bits.
	size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	// The launch rounds up to a whole number of blocks, so threads
	// past the end of the data must not read or write the buffer.
	if (idx >= len)
		return;

	// Read the original byte.
	b = buf[idx];

	// Build reverse.
	for (c = 0; c < 8; ++c)
	{
		// If bit is set, set mirror bit.
		if ( b & (1u << c) )
			x |= (unsigned char)( 1u << (7-c) );
	}

	// Write back result.
	buf[idx] = x;
}


// main - filter stdin to stdout, reversing the bits of every byte on
// the GPU.
//
// Takes no command-line arguments; any argument prints the usage text
// and exits with status 1.  Input is processed in chunks of up to
// sizeof(host_buf) bytes: each chunk is copied to the device, run
// through the r8 kernel, copied back and written to stdout.
//
// Returns 0 on success.  Exits with status 1 on any CUDA error, on a
// read error from stdin, or on a short/failed write to stdout.
int main(int argc, const char* argv[])
{
	char host_buf[2048] = { '\0' };
	void *dev_buf = NULL;
	size_t nb = 0;
	cudaError_t cu_error = cudaSuccess;
	int dev_no = 0;
	struct cudaDeviceProp props;
	dim3 dimBlock( sizeof(host_buf) );
	dim3 dimGrid(1);

	(void)argv;

	memset(&props, 0, sizeof(props));

	if (argc != 1)
	{
		print_usage();
		exit(1);
	}

	cu_error = cudaGetDevice(&dev_no);
	if (cu_error != cudaSuccess)
	{
		fprintf(stderr,
			"Error getting the current CUDA device number.\n");
		exit(1);
	}

	cu_error = cudaGetDeviceProperties(&props, dev_no);
	if (cu_error != cudaSuccess)
	{
		fprintf(stderr,
			"Error getting current CUDA device properties.\n");
		exit(1);
	}

	// If our max threads/block is less than our buffer size
	// Adjust.  A non-positive limit would make the launch
	// impossible, so treat it like "smaller" and let the assert
	// below catch it.
	if (props.maxThreadsPerBlock <= 0 ||
	    (unsigned)props.maxThreadsPerBlock < dimBlock.x)
	{
		// Make a block as wide as we can.
		dimBlock.x = (unsigned)props.maxThreadsPerBlock;
		assert(props.maxThreadsPerBlock > 0);
		assert(dimBlock.x > 0);
	}

	// Allocate a buffer in device memory.
	cu_error = cudaMalloc(&dev_buf, sizeof(host_buf));
	if (cu_error != cudaSuccess)
	{
		fprintf(stderr,
			"Error allocating %zu bytes on the CUDA card.\n",
			sizeof(host_buf) );
		exit(1);
	}

	do
	{
		// Read some data in.
		size_t nb_written = 0;

		nb = fread(host_buf, 1, sizeof(host_buf), stdin);

		if (nb > 0)
		{
			// Copy that data to the card.
			cu_error = cudaMemcpy(
				dev_buf,
				host_buf,
				nb,
				cudaMemcpyHostToDevice);
			if (cu_error != cudaSuccess)
			{
				fprintf(stderr,
					"Error copying %zu byte%s to the CUDA card: %s\n",
					nb,
					( nb == 1 ? "" : "s" ),
					cudaGetErrorString(cu_error) );
				exit(1);
			}

			// How many blocks is that?  Round up so a partial
			// block at the tail is still covered.
			dimGrid.x = (unsigned)( (nb + dimBlock.x - 1) / dimBlock.x );

			// Tell the card to process that data.
			r8<<<dimGrid, dimBlock>>>(dev_buf, nb);

			// A launch does not return a status; a bad launch
			// configuration is only visible through
			// cudaGetLastError().
			cu_error = cudaGetLastError();
			if (cu_error != cudaSuccess)
			{
				fprintf(stderr,
					"Error launching the r8 kernel: %s\n",
					cudaGetErrorString(cu_error) );
				exit(1);
			}

			// Copy results back.  This blocking copy also waits
			// for the kernel, so a fault during kernel execution
			// is reported here.
			cu_error = cudaMemcpy(
				host_buf,
				dev_buf,
				nb,
				cudaMemcpyDeviceToHost);
			if (cu_error != cudaSuccess)
			{
				fprintf(stderr,
					"Error copying %zu byte%s from the CUDA card: %s\n",
					nb,
					( nb == 1 ? "" : "s" ),
					cudaGetErrorString(cu_error) );
				exit(1);
			}

			// Write the results to stdout.
			nb_written = fwrite(host_buf, 1, nb, stdout);
			if (nb_written != nb)
			{
				// %m is a glibc printf extension that prints
				// the message for the current errno.
				fprintf(stderr,
					"Error writing %zu byte%s to stdout: %m\n",
					nb,
					( nb == 1 ? "" : "s" ) );
				exit(1);
			}
		}
	} while (nb != 0);

	// fread() returns 0 for both end-of-file and error; tell them
	// apart so a failed read is not mistaken for a clean finish.
	if (ferror(stdin))
	{
		fprintf(stderr, "Error reading from stdin: %m\n");
		cudaFree(dev_buf);
		exit(1);
	}

	// Buffered output may only fail when it is finally flushed.
	if (fflush(stdout) != 0)
	{
		fprintf(stderr, "Error flushing stdout: %m\n");
		cudaFree(dev_buf);
		exit(1);
	}

	cudaFree(dev_buf);
	dev_buf = NULL;

	return 0;
}