I'm trying to write a program that takes small chunks of data and runs a histogram over each one to find the frequency of symbols. This will eventually be used as part of a GPU compression algorithm.
My approach is to have each block of data processed by a single CUDA block, and to rely on concurrent kernels to maximize the occupancy of the card. This lets me compress shorter files without wasting the rest of the GPU, and keeps the GPU full while it's doing the scatter/gather portions of the algorithm that work best in a single block.
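I'm assuming the card can run kernels concurrently and overlap copies with compute (the build targets compute_20). For reference, a quick standalone check like the one below (not part of the test program) is how I'd confirm what the runtime reports for those capabilities:
[codebox]
// Standalone capability check -- not part of the histogram program below.
// concurrentKernels and asyncEngineCount indicate whether the runtime
// thinks the device can run kernels from different streams concurrently
// and overlap memcpys with kernel execution.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s (sm_%d%d)\n", prop.name, prop.major, prop.minor);
    printf("concurrentKernels = %d\n", prop.concurrentKernels);
    printf("asyncEngineCount  = %d\n", prop.asyncEngineCount);
    return 0;
}
[/codebox]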
Here's the problem: when I split the execution into multiple streams and run the same kernel on different data in different streams (each kernel uses only one block), the kernels serialize behind each other. I would expect them to run concurrently on different multiprocessors, but it doesn't seem to work that way. Even more seriously, it won't overlap communication and processing: the downloads back to main memory wait until all the kernels have completed, not just the kernel in their own stream. What am I missing?
[codebox]
#include <sys/time.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#define MAX_CUDA_SIZE 524288 //512 threads per block * 1024 data per thread
#define THREADS_PER_BLOCK 512
#define BYTES_PER_THREAD 1024
#define NUMBER_OF_STREAMS 4 //should be at least the number of single-block kernels needed to occupy the GPU
#define DATA_BUCKETS 256 //must be power of 2
typedef unsigned char DATA;
typedef unsigned int uint;
typedef unsigned char uchar;
using namespace std;
double etime() {
struct timeval t;
gettimeofday(&t, NULL);
return t.tv_sec + 1.e-6*t.tv_usec;
}
__global__ void gatherFrequencies(DATA *input, uint inputLength, uint *freqTable)
{
for (int i = threadIdx.x; i < DATA_BUCKETS; i += blockDim.x) freqTable[i] = 0;
__syncthreads();
for (int i = threadIdx.x; i < inputLength; i += blockDim.x) {
atomicAdd(freqTable + input[i], 1);
// Artificial busy-work so each kernel runs long enough that any overlap
// between streams would be visible; writing the result back to input[i]
// keeps the compiler from optimizing the loop away.
int waste = 0;
for (waste = 0; waste < 100000; waste++) waste += 1;
input[i] = waste;
//freqTable[input[i]]++;
}
}
int main(int argc, char **argv)
{
cudaSetDevice(0);
cout << "Encoding random" << endl;
cudaStream_t streams[NUMBER_OF_STREAMS];
DATA *Hdata[NUMBER_OF_STREAMS], *Ddata[NUMBER_OF_STREAMS];
uint *Hfreq[NUMBER_OF_STREAMS], *Dfreq[NUMBER_OF_STREAMS];
int stream;
for (stream = 0; stream < NUMBER_OF_STREAMS; stream++)
{
cudaStreamCreate(&streams[stream]);
cudaMallocHost(&Hdata[stream], MAX_CUDA_SIZE * sizeof(DATA));
cudaMalloc(&Ddata[stream], MAX_CUDA_SIZE * sizeof(DATA));
cudaMallocHost(&Hfreq[stream], DATA_BUCKETS * sizeof(int));
cudaMalloc(&Dfreq[stream], DATA_BUCKETS * sizeof(int));
for (int i = 0; i < DATA_BUCKETS; i++)
Hfreq[stream][i] = 43; //sentinel values so a skipped download is obvious
}
stream = 0;
cudaEvent_t preupload[NUMBER_OF_STREAMS];
cudaEvent_t upload[NUMBER_OF_STREAMS];
cudaEvent_t pregather[NUMBER_OF_STREAMS];
cudaEvent_t gather[NUMBER_OF_STREAMS];
cudaEvent_t predownload[NUMBER_OF_STREAMS];
cudaEvent_t download[NUMBER_OF_STREAMS];
for (int i = 0; i < NUMBER_OF_STREAMS; i++) {
cudaEventCreate(&preupload[i]);
cudaEventCreate(&upload[i]);
cudaEventCreate(&pregather[i]);
cudaEventCreate(&gather[i]);
cudaEventCreate(&predownload[i]);
cudaEventCreate(&download[i]);
}
int bytesRead = MAX_CUDA_SIZE*sizeof(DATA);
cout.setf(ios::fixed);
cout << setprecision(3);
cout << setw(8);
cout << "Processing " << bytesRead << " bytes of data.\n";
for (stream = 0; stream < NUMBER_OF_STREAMS; stream++)
for (int i = 0; i < MAX_CUDA_SIZE; i++)
Hdata[stream][i] = rand();
int nStreams = stream;
double s = etime();
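// Issue order below is breadth-first: all four uploads first, then all four
// kernels, then all four downloads, each bracketed by events in its own stream.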
for (stream = 0; stream < nStreams; stream++)
{
double a = etime();
cudaEventRecord(preupload[stream], streams[stream]);
cudaMemcpyAsync(Ddata[stream], Hdata[stream], bytesRead, cudaMemcpyHostToDevice, streams[stream]); //checkError("Copy to device");
cudaEventRecord(upload[stream], streams[stream]);
double b = etime();
cout << "Upload queued " << (b-s)*1000000 << " - " << (a-s)*1000000 << " = " << (b-a)*1000000 << endl;
}
for (stream = 0; stream < nStreams; stream++)
{
double a = etime();
cudaEventRecord(pregather[stream], streams[stream]);
gatherFrequencies<<<1, THREADS_PER_BLOCK, 0, streams[stream]>>>(Ddata[stream], bytesRead/sizeof(DATA), Dfreq[stream]); //checkError("gatherFrequencies kernel");
cudaEventRecord(gather[stream], streams[stream]);
double b = etime();
cout << "Histogram complete " << (b-s)*1000000 << " - " << (a-s)*1000000 << " = " << (b-a)*1000000 << endl;
}
cout << "retrieving counts\n";
for (stream = 0; stream < nStreams; stream++)
{
double a = etime();
cudaEventRecord(predownload[stream], streams[stream]);
cudaMemcpyAsync(Hfreq[stream], Dfreq[stream], DATA_BUCKETS * sizeof(int), cudaMemcpyDeviceToHost, streams[stream]);
cudaEventRecord(download[stream], streams[stream]);
double b = etime();
cout << "download complete " << (b-s)*1000000 << " - " << (a-s)*1000000 << " = " << (b-a)*1000000 << endl;
}
cudaThreadSynchronize();
float fpreupload, fupload, fpregather, fgather, fpredownload, fdownload;
for (stream = 0; stream < nStreams; stream++)
{
cudaEventElapsedTime(&fpreupload, preupload[0], preupload[stream]);
cudaEventElapsedTime(&fupload, preupload[0], upload[stream]);
cudaEventElapsedTime(&fpregather, preupload[0], pregather[stream]);
cudaEventElapsedTime(&fgather, preupload[0], gather[stream]);
cudaEventElapsedTime(&fpredownload, preupload[0], predownload[stream]);
cudaEventElapsedTime(&fdownload, preupload[0], download[stream]);
cout << stream << ": upload " << fupload << " - " << fpreupload << " = " << (fupload-fpreupload) << endl;
cout << stream << ": gather " << fgather << " - " << fpregather << " = " << (fgather-fpregather) << endl;
cout << stream << ": download " << fdownload << " - " << fpredownload << " = " << (fdownload-fpredownload) << endl;
}
for (stream = 0; stream < NUMBER_OF_STREAMS; stream++)
{
cudaFreeHost(Hdata[stream]);
cudaFree(Ddata[stream]);
cudaFreeHost(Hfreq[stream]);
cudaFree(Dfreq[stream]);
}
}
[/codebox]
And the output:
[codebox]
nvcc -o histogram -O3 -arch compute_20 -L/usr/local/cuda/lib64 -lcudart encode.cu
./histogram
Encoding random
Processing 524288 bytes of data.
Upload queued 41.962 - 7.153 = 34.809
Upload queued 300.169 - 282.049 = 18.120
Upload queued 382.185 - 360.966 = 21.219
Upload queued 462.055 - 440.121 = 21.935
Histogram complete 846.148 - 566.006 = 280.142
Histogram complete 931.978 - 907.183 = 24.796
Histogram complete 1017.094 - 991.106 = 25.988
Histogram complete 1099.110 - 1075.983 = 23.127
retrieving counts
download complete 1240.969 - 1219.034 = 21.935
download complete 1309.156 - 1300.097 = 9.060
download complete 1393.080 - 1384.020 = 9.060
download complete 1464.128 - 1456.022 = 8.106
0: upload 0.200 - 0.000 = 0.200
0: gather 3051.659 - 0.822 = 3050.837
0: download 12224.588 - 12224.565 = 0.022
1: upload 0.450 - 0.264 = 0.187
1: gather 6115.826 - 3051.661 = 3064.164
1: download 12224.611 - 12224.590 = 0.021
2: upload 0.635 - 0.452 = 0.182
2: gather 9170.156 - 6115.828 = 3054.329
2: download 12224.634 - 12224.613 = 0.021
3: upload 0.820 - 0.637 = 0.183
3: gather 12224.563 - 9170.158 = 3054.405
3: download 12224.656 - 12224.636 = 0.021
[/codebox]
The most telling part is that the stream 0 download doesn't start until the stream 3 gather has completed. (The last 12 lines are end - start = duration.) Notice how the order of the timestamps follows exactly the order of enqueueing (0: upload, 1: upload, 2: upload, 3: upload, 0: gather, 1: gather, …). Any ideas?
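P.S. For reference, the stripped-down per-stream (depth-first) issue order I would expect to overlap looks roughly like this. It's just a sketch reusing the same buffers and kernel as above, with the timing and cudaEventRecord calls removed in case the events themselves get in the way:
[codebox]
// Sketch only: issue upload, kernel, and download back-to-back in each
// stream, with no event records. Reuses Hdata/Ddata/Dfreq/Hfreq, streams,
// and gatherFrequencies from the full program above.
for (int s = 0; s < NUMBER_OF_STREAMS; s++)
{
    cudaMemcpyAsync(Ddata[s], Hdata[s], MAX_CUDA_SIZE * sizeof(DATA),
                    cudaMemcpyHostToDevice, streams[s]);
    gatherFrequencies<<<1, THREADS_PER_BLOCK, 0, streams[s]>>>
        (Ddata[s], MAX_CUDA_SIZE, Dfreq[s]);
    cudaMemcpyAsync(Hfreq[s], Dfreq[s], DATA_BUCKETS * sizeof(uint),
                    cudaMemcpyDeviceToHost, streams[s]);
}
cudaThreadSynchronize();
[/codebox]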