/*
 * r8 - reverse the bit order of every byte read from stdin and write the
 * result to stdout, doing the per-byte work on a CUDA device.
 */

// NOTE(review): the header names were missing from this file as received
// (bare "#include" tokens). The list below is reconstructed from the symbols
// the file uses - confirm it against the original source.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda.h>
#include <cuda_runtime.h>

static void print_usage(void);
__global__ void r8(void *buf, size_t len);

/*
 * Abort the program with a diagnostic if a CUDA runtime call failed.
 *
 * status: the value returned by the CUDA runtime call.
 * what:   short description of the attempted operation, used in the message.
 */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "Error %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/* Print the command-line usage text to stderr. */
void print_usage(void)
{
    fprintf(stderr,
            "Usage: r8\n"
            "\n"
            "r8 reads bytes from stdin reverses the bits in that byte\n"
            "before writing it out to stdout.\n"
            "\n");
}

/* Return `b` with its eight bits mirrored (bit 0 <-> bit 7, 1 <-> 6, ...). */
static __device__ unsigned char reverse_bits8(unsigned char b)
{
    unsigned char mirrored = 0;

    for (int bit = 0; bit < 8; ++bit) {
        // If the bit is set, set its mirror bit.
        if (b & (1u << bit))
            mirrored |= (unsigned char)(1u << (7 - bit));
    }
    return mirrored;
}

/*
 * Kernel: reverse the bits of each of the first `len` bytes of `buf`,
 * in place. Each thread handles at most one byte.
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks; only the
 * x dimensions are used. The launch must supply at least `len` threads in
 * total. Surplus threads in the last block return without touching memory.
 * No shared memory is used.
 *
 * buf_void: device pointer to at least `len` bytes.
 * len:      number of valid bytes in the buffer.
 */
__global__ void r8(void *buf_void, size_t len)
{
    unsigned char *buf = (unsigned char *)buf_void;

    // Global index = block index * block width + thread index. Computed in
    // size_t so the multiplication cannot wrap in 32 bits.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so tail threads may have no
    // byte to process; they must not read or write past the valid data.
    if (idx >= len)
        return;

    buf[idx] = reverse_bits8(buf[idx]);
}

/*
 * Entry point. Takes no arguments. Streams stdin to stdout in chunks of up
 * to sizeof(host_buf) bytes, reversing the bits of each byte on the GPU.
 *
 * Returns 0 on success. Exits with status 1 on a usage error, any CUDA
 * failure or a write error; returns 1 on a read or flush error.
 */
int main(int argc, const char* argv[])
{
    char host_buf[2048] = { '\0' };
    void *dev_buf = NULL;
    size_t nb = 0;
    cudaError_t cu_error = cudaSuccess;
    int dev_no = 0;
    int status = 0;
    struct cudaDeviceProp props;
    dim3 dimBlock( sizeof(host_buf) );
    dim3 dimGrid(1);

    (void)argv;
    memset(&props, 0, sizeof(props));

    if (argc != 1) {
        print_usage();
        exit(1);
    }

    cu_error = cudaGetDevice(&dev_no);
    if (cu_error != cudaSuccess) {
        fprintf(stderr, "Error getting the current CUDA device number.\n");
        exit(1);
    }

    cu_error = cudaGetDeviceProperties(&props, dev_no);
    if (cu_error != cudaSuccess) {
        fprintf(stderr, "Error getting current CUDA device properties.\n");
        exit(1);
    }

    // A non-positive limit would make the block width zero and the grid
    // computation below divide by zero, so reject it even in NDEBUG builds.
    if (props.maxThreadsPerBlock <= 0) {
        fprintf(stderr, "CUDA device reports no usable threads per block.\n");
        exit(1);
    }

    // If the device's max threads/block is less than our buffer size,
    // make a block as wide as the device allows.
    if ((unsigned)props.maxThreadsPerBlock < dimBlock.x) {
        dimBlock.x = (unsigned)props.maxThreadsPerBlock;
    }
    assert(dimBlock.x > 0);

    // Allocate a buffer in device memory; it is reused for every chunk.
    cu_error = cudaMalloc(&dev_buf, sizeof(host_buf));
    if (cu_error != cudaSuccess) {
        fprintf(stderr, "Error allocating %zu bytes on the CUDA card.\n",
                sizeof(host_buf) );
        exit(1);
    }

    // Read, process and write one chunk at a time until fread returns 0.
    while ((nb = fread(host_buf, 1, sizeof(host_buf), stdin)) > 0) {
        size_t nb_written = 0;

        // Copy that data to the card.
        check_cuda(cudaMemcpy(dev_buf, host_buf, nb, cudaMemcpyHostToDevice),
                   "copying data to the CUDA card");

        // How many blocks is that? Round up so every byte gets a thread.
        dimGrid.x = (unsigned)((nb + dimBlock.x - 1) / dimBlock.x);

        // Tell the card to process that data. A launch returns no status
        // itself, so fetch launch-configuration errors explicitly.
        r8<<<dimGrid, dimBlock>>>(dev_buf, nb);
        check_cuda(cudaGetLastError(), "launching the r8 kernel");

        // Copy results back. This blocking copy waits for the kernel and
        // also reports any error raised while it ran.
        check_cuda(cudaMemcpy(host_buf, dev_buf, nb, cudaMemcpyDeviceToHost),
                   "copying results from the CUDA card");

        // Write the results to stdout.
        nb_written = fwrite(host_buf, 1, nb, stdout);
        if (nb_written != nb) {
            // NOTE: %m is a glibc extension that prints strerror(errno).
            fprintf(stderr, "Error writing %zu byte%s to stdout: %m\n",
                    nb, ( nb == 1 ? "" : "s" ) );
            exit(1);
        }
    }

    // fread returns 0 for both EOF and failure; tell them apart so a read
    // error is not reported as success.
    if (ferror(stdin)) {
        fprintf(stderr, "Error reading from stdin.\n");
        status = 1;
    }

    if (fflush(stdout) != 0) {
        fprintf(stderr, "Error flushing stdout.\n");
        status = 1;
    }

    check_cuda(cudaFree(dev_buf), "freeing the CUDA buffer");
    dev_buf = NULL;

    return status;
}