Hi all,
I have been working on a GPU based cryptographic application performing litecoin mining. Litecoin is a craptographic currency, but unline Bitcoin it specifies the memory-hard “scrypt” algorithm for hashing.
In my implementation of scrypt, the host (CPU) performs a HMAC SHA256 message digest, and then the GPU takes over and runs 2048 rounds of memory-hard Salsa20/8 goodness. Afterwards the CPU again performs a HMAC SHA256 digest and looks at the result. The required CPU time is significantly shorter than the GPU time.
In order to get best performance I tried to use two steams, and all host computations are supposed to overlap with kernel execution. Ideally, kernel executions would be performed sequentially, with zero gaps.
What I am seeing in practice is that CPU and kernels never overlap, despite using the streams API to the best of my knowledge. This wastes about 30% of the peformance I expect from my GPUs.
I have slimmed down my code to include just some essentials (for example I removed the memory hard bits of the crypto algorithm). But I exactly reproduced the stream handling that is causing me so much grief. Would anyone have the time to look at my stream handling in host code?
This code uses the nvToolsExt.h header so that the emulated CPU workload (a Sleep of 5ms each time) shows up in the graphs. It never overlaps with any kernels, and as a result there are huge gaps between GPU work phases.
This is the pseudo-code for my stream handling loop:
Setup input for stream #0 (takes 5ms)
Copy input for stream #0 (async H2D copy)
Launch kernel for stream #0 (execution takes about 20ms)
FOREVER
Copy output from stream #0 (async D2H copy)
Setup input for stream #1 (takes 5ms)
Copy input for stream #1 (async H2D copy)
Launch kernel for stream #1 (execution takes about 20ms)
Synchronize with stream #0
Analyze output from stream #0 (takes 5ms)
Swap roles of stream #0 and #1
END LOOP
I am testing on Windows 7, both 32 and 64 bit versions exhibit the same behavior. CUDA toolkit and SDK is 5.0. Driver is version 314.22. The GPUs in the system are GTX 260, GTX 460, GTX 260.
This is the code. Compiles best on Windows, Linux may need minor adjustments (provide a Sleep() function)
////////////////////////////////////////////////////////////////////////////
/* Test project which was set up to try streams for overlapping
* host and device side computation.
* Device + Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdint.h>
#include <windows.h> // for Sleep(time in ms)
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
// this comes with GPU Computing SDK (CUDA 5.0)
#include <nvToolsExt.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// just some constants defining the work units per block, etc...
#define WU_PER_WARP 32
#define WARPS_PER_BLOCK 3
#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK)
#define WU_PER_LAUNCH (blocks*WU_PER_BLOCK)
#define RL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
__device__ void xor_salsa8(unsigned long *B, unsigned long *C)
{
uint32_t x0 = (B[ 0] ^= C[ 0]), x1 = (B[ 1] ^= C[ 1]), x2 = (B[ 2] ^= C[ 2]), x3 = (B[ 3] ^= C[ 3]);
uint32_t x4 = (B[ 4] ^= C[ 4]), x5 = (B[ 5] ^= C[ 5]), x6 = (B[ 6] ^= C[ 6]), x7 = (B[ 7] ^= C[ 7]);
uint32_t x8 = (B[ 8] ^= C[ 8]), x9 = (B[ 9] ^= C[ 9]), xa = (B[10] ^= C[10]), xb = (B[11] ^= C[11]);
uint32_t xc = (B[12] ^= C[12]), xd = (B[13] ^= C[13]), xe = (B[14] ^= C[14]), xf = (B[15] ^= C[15]);
for (int i = 0; i < 4; ++i) {
/* Operate on columns. */
x4 ^= RL(x0 + xc, 7); x9 ^= RL(x5 + x1, 7); xe ^= RL(xa + x6, 7); x3 ^= RL(xf + xb, 7);
x8 ^= RL(x4 + x0, 9); xd ^= RL(x9 + x5, 9); x2 ^= RL(xe + xa, 9); x7 ^= RL(x3 + xf, 9);
xc ^= RL(x8 + x4, 13); x1 ^= RL(xd + x9, 13); x6 ^= RL(x2 + xe, 13); xb ^= RL(x7 + x3, 13);
x0 ^= RL(xc + x8, 18); x5 ^= RL(x1 + xd, 18); xa ^= RL(x6 + x2, 18); xf ^= RL(xb + x7, 18);
/* Operate on rows. */
x1 ^= RL(x0 + x3, 7); x6 ^= RL(x5 + x4, 7); xb ^= RL(xa + x9, 7); xc ^= RL(xf + xe, 7);
x2 ^= RL(x1 + x0, 9); x7 ^= RL(x6 + x5, 9); x8 ^= RL(xb + xa, 9); xd ^= RL(xc + xf, 9);
x3 ^= RL(x2 + x1, 13); x4 ^= RL(x7 + x6, 13); x9 ^= RL(x8 + xb, 13); xe ^= RL(xd + xc, 13);
x0 ^= RL(x3 + x2, 18); x5 ^= RL(x4 + x7, 18); xa ^= RL(x9 + x8, 18); xf ^= RL(xe + xd, 18);
}
B[ 0] += x0; B[ 1] += x1; B[ 2] += x2; B[ 3] += x3; B[ 4] += x4; B[ 5] += x5; B[ 6] += x6; B[ 7] += x7;
B[ 8] += x8; B[ 9] += x9; B[10] += xa; B[11] += xb; B[12] += xc; B[13] += xd; B[14] += xe; B[15] += xf;
}
////////////////////////////////////////////////////////////////////////////////
//! Running the Salsa 20/8 core "loops" times per work unit.
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
salsaKernel(uint32_t*g_idata, uint32_t *g_odata, int loops)
{
__shared__ unsigned long X[WARPS_PER_BLOCK][WU_PER_WARP][32+1]; // +1 to resolve bank conflicts
volatile int warpIdx = threadIdx.x / warpSize;
volatile int warpThread = threadIdx.x % warpSize;
g_idata += 32 * (blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP);
g_odata += 32 * (blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP);
#pragma unroll WU_PER_WARP
for (int wu=0; wu < WU_PER_WARP; ++wu) X[warpIdx][wu][warpThread] = g_idata[32*wu+warpThread];
for (int i = 0; i < loops; i++)
{ xor_salsa8(&X[warpIdx][warpThread][0], &X[warpIdx][warpThread][16]);
xor_salsa8(&X[warpIdx][warpThread][16], &X[warpIdx][warpThread][0]); }
#pragma unroll WU_PER_WARP
for (int wu=0; wu < WU_PER_WARP; ++wu) g_odata[32*wu+warpThread] = X[warpIdx][wu][warpThread];
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple streams test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char **argv)
{
printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaSetDevice(devID));
// get device properties
struct cudaDeviceProp prop;
checkCudaErrors(cudaGetDeviceProperties(&prop, devID));
// initialize with enough blocks to fill the device nicely
int blocks = 16 * prop.multiProcessorCount;
int mem_size = 32 * sizeof(uint32_t) * WU_PER_LAUNCH;
int loops = 2048;
// create two streams to operate on
cudaStream_t streamHandle[2];
checkCudaErrors(cudaStreamCreate(&streamHandle[0]));
checkCudaErrors(cudaStreamCreate(&streamHandle[1]));
// allocate page-locked host memory (separate buffers for streams)
uint32_t *h_idata[2];
checkCudaErrors(cudaMallocHost(&h_idata[0], mem_size));
checkCudaErrors(cudaMallocHost(&h_idata[1], mem_size));
uint32_t *h_odata[2];
checkCudaErrors(cudaMallocHost(&h_odata[0], mem_size));
checkCudaErrors(cudaMallocHost(&h_odata[1], mem_size));
// allocate device memory (separate buffers for streams)
uint32_t *d_idata[2];
checkCudaErrors(cudaMalloc((void **) &d_idata[0], mem_size));
checkCudaErrors(cudaMalloc((void **) &d_idata[1], mem_size));
uint32_t *d_odata[2];
checkCudaErrors(cudaMalloc((void **) &d_odata[0], mem_size));
checkCudaErrors(cudaMalloc((void **) &d_odata[1], mem_size));
// setup execution parameters
dim3 threads = dim3(WU_PER_BLOCK, 1, 1);
dim3 grid = dim3(blocks, 1, 1);
printf("grid.x is %d, threads.x is %d\n", grid.x, threads.x);
// do some timing tests. Find a block configuration where execution time exceeds 20ms
StopWatchInterface *timer = 0;
printf("Tuning kernel run time for 20ms...\n");
for (int i=8; i < 16384; i += 8)
{
loops = i;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
salsaKernel<<< grid, threads, 0>>>(d_idata[0], d_odata[0], loops);
cudaDeviceSynchronize();
// check if kernel execution generated and error
getLastCudaError("Kernel execution failed");
sdkStopTimer(&timer);
double time = sdkGetTimerValue(&timer);
printf("loops=%d, Processing time: %f (ms)\n", loops, time);
sdkDeleteTimer(&timer);
if (time > 20.0) break;
}
printf("\nunning dual-stream operation (hopefully overlapping...or not!?!)\n");
// stream points to current stream, other points to the other stream.
int stream = 0, other = 1;
// simulate the CPU time required for input preparation work
char tmp[50]; sprintf(tmp, "Prep #%d\n", stream); nvtxRangeId_t rng1 = nvtxRangeStartA(tmp);
Sleep(5);
nvtxRangeEnd(rng1);
// copy H2D
checkCudaErrors(cudaMemcpyAsync(d_idata[stream], h_idata[stream], mem_size, cudaMemcpyHostToDevice, streamHandle[stream]));
// execute kernel on the first stream
salsaKernel<<< grid, threads, 0, streamHandle[stream] >>>(d_idata[stream], d_odata[stream], loops);
int iterations = 0;
do
{
// copy D2H on the current stream
checkCudaErrors(cudaMemcpyAsync(h_odata[stream], d_odata[stream], mem_size, cudaMemcpyDeviceToHost, streamHandle[stream]));
// simulate the CPU time required for input preparation work
sprintf(tmp, "Prep #%d\n", other); nvtxRangeId_t rng2 = nvtxRangeStartA(tmp);
Sleep(5);
nvtxRangeEnd(rng2);
// copy H2D on the other stream
checkCudaErrors(cudaMemcpyAsync(d_idata[other], h_idata[other], mem_size, cudaMemcpyHostToDevice, streamHandle[other]));
// execute the kernel on the other stream
salsaKernel<<< grid, threads, 0, streamHandle[other] >>>(d_idata[other], d_odata[other], loops);
// wait for this stream's execution to complete
checkCudaErrors(cudaStreamSynchronize(streamHandle[stream]));
// simulate the CPU time required for result processing work
sprintf(tmp, "Analyze #%d\n", other); nvtxRangeId_t rng3 = nvtxRangeStartA(tmp);
Sleep(5);
nvtxRangeEnd(rng3);
stream = (stream+1)&1;
other = (other +1)&1;
} while (++iterations < 10);
cudaThreadExit();
exit(0);
}