So I wrote my first CUDA app. I'm not from a heavy math background and three dimensions tend to confuse me. I'm more of a bit-stream man, so I wrote an app that reads bytes from stdin, reverses the bits in each byte ("r8" in certain parlance, so e.g. 0x01 becomes 0x80), and writes the results to stdout.
The results of the program are correct. (Always a good starting point!) But the exercise has left me with two questions:
- Why is the CUDA implementation I've written slower than an iterative CPU-only implementation?
- Am I doing anything wrong in this app? In particular, am I missing some form of synchronization to make sure the copies and the kernel call happen in the correct order? (The sketch just after the main() listing below shows the kind of explicit synchronization I have in mind.)
Code is attached in its entirety (with a .txt extension), but here are the important parts:
main: 2 KB buffer (arbitrary size). Everything is one-dimensional and uses as many threads per block as possible; with the 2 KB buffer that means a single grid of 4 blocks with 512 threads in each block (2048 / 512 = 4). Read into the host buffer, transfer to the device buffer, execute r8(), transfer the results back, write them out, repeat.
int main(int argc, const char* argv[])
{
    char host_buf[2048] = { '\0' };
    void *dev_buf = NULL;
    size_t nb = 0;
    cudaError_t cu_error = cudaSuccess;
    int dev_no = 0;
    struct cudaDeviceProp props;
    dim3 dimBlock( sizeof(host_buf) );
    dim3 dimGrid(1);
    /* .. *SNIP* .. */
    // If our max threads/block is less than our buffer size,
    // adjust.
    if (props.maxThreadsPerBlock < dimBlock.x)
    {
        // Make a block as wide as we can.
        dimBlock.x = props.maxThreadsPerBlock;
        assert(dimBlock.x > 0);
    }
    // Allocate a buffer in device memory.
    cu_error = cudaMalloc(&dev_buf, sizeof(host_buf));
    /* .. *SNIP* .. */
    do
    {
        // Read some data in.
        size_t nb_written = 0;
        nb = fread(host_buf, 1, sizeof(host_buf), stdin);
        if (nb > 0)
        {
            // Copy that data to the card.
            cudaMemcpy(
                dev_buf,
                host_buf,
                nb,
                cudaMemcpyHostToDevice);
            // How many blocks is that?
            dimGrid.x = nb + dimBlock.x - 1;
            dimGrid.x /= dimBlock.x;
            // Tell the card to process that data.
            r8<<<dimGrid, dimBlock>>>(dev_buf, nb);
            // Copy results back.
            cudaMemcpy(
                host_buf,
                dev_buf,
                nb,
                cudaMemcpyDeviceToHost);
            // Write the results to stdout.
            nb_written = fwrite(host_buf, 1, nb, stdout);
            /* .. *SNIP* .. */
        }
    } while (nb != 0);
    /* .. *SNIP* .. */
    return 0;
}
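In case it helps frame the second question, here is the kind of explicit synchronization and error checking I could imagine adding around the launch. This is only a sketch of what I mean (the names match the listing above); it is not in the attached code:
// Hypothetical variant of the launch site in the loop above, just to
// illustrate what I mean by "some form of synchronization".
r8<<<dimGrid, dimBlock>>>(dev_buf, nb);
cu_error = cudaGetLastError();       // did the launch itself fail?
assert(cu_error == cudaSuccess);
cu_error = cudaDeviceSynchronize();  // wait here until the kernel is done
assert(cu_error == cudaSuccess);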
r8: Figure out which index in the 1D array this thread is responsible for, read that byte from global device memory into a local variable (a register, I assume), use it to build a reversed version of the byte, write it back out to global device memory, and return.
__global__ void r8(void *buf_void, size_t len)
{
    // What's the index I'm in charge of?
    char x = 0;
    int c = 0;
    char b;
    char *buf = (char*)buf_void;
    // Compute the index. Since we only use one dimension of
    // the grid, this makes life simpler. All we care about
    // are the X dimensions. So it's the blockIdx * the width
    // of a block + the threadIdx. All other dimensions fall
    // out.
    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Read the original byte.
    b = buf[idx];
    // Build reverse.
    for (c = 0; c < 8; ++c)
    {
        // If bit is set, set mirror bit.
        if ( b & (1 << c) )
            x |= ( 1 << (7-c) );
    }
    // Write back result.
    buf[idx] = x;
}
This app runs in about 0.225 seconds under CUDA. A CPU-only implementation that builds the reversed bytes the same way (roughly the loop sketched below) runs in 0.002 seconds.
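For reference, the CPU-only version is essentially the loop below. I am reproducing it from memory rather than pasting the exact code I timed, and the name r8_cpu is just for this post:
// Rough sketch of the CPU-only comparison: the same shift-and-test
// reversal, applied to every byte of the buffer in a plain loop.
static void r8_cpu(char *buf, size_t len)
{
    size_t i;
    for (i = 0; i < len; ++i)
    {
        char b = buf[i];
        char x = 0;
        int c;
        for (c = 0; c < 8; ++c)
        {
            // If bit is set, set the mirror bit.
            if ( b & (1 << c) )
                x |= ( 1 << (7-c) );
        }
        buf[i] = x;
    }
}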
What am I doing stupid?
r8.cu.txt (2.98 KB)