I am trying to understand an optimization technique presented in the SC07 tutorial: better latency hiding with more work per thread.
My test is simple; it comes from the CUDPP compact source code, where flags are generated with 8 elements per thread instead of one element per thread.
// Returns true if elem is zero.
template <typename T> __device__ bool isNull(const T elem)
{
    return elem == 0;
}

// Writes a 0/1 validity flag for element idx.
template <class T> __device__ void setFlag(unsigned int *oData,
                                           const T *iData,
                                           unsigned int idx)
{
    oData[idx] = isNull(iData[idx]) ? 0 : 1;
}
// Unrolled version: each thread flags 8 elements, strided by blockDim.x
// so that each of the 8 reads/writes stays coalesced.
template <class T>
__global__ void generateValidFlags8(unsigned int *oData,
                                    const T *iData)
{
    // blockDim.x << 3 == blockDim.x * 8: each block covers 8*blockDim.x elements.
    unsigned int iGlobal = blockIdx.x * (blockDim.x << 3) + threadIdx.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
}
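(Side note: as I understand it, the eight explicit calls above are just a manually unrolled loop; the sketch below with #pragma unroll should compile to essentially the same thing. This is my own rewrite, not the CUDPP source.)
template <class T>
__global__ void generateValidFlags8Loop(unsigned int *oData, const T *iData)
{
    // Same indexing scheme as the hand-unrolled kernel above.
    unsigned int iGlobal = blockIdx.x * (blockDim.x << 3) + threadIdx.x;
    #pragma unroll
    for (int i = 0; i < 8; ++i)
    {
        setFlag(oData, iData, iGlobal);
        iGlobal += blockDim.x;
    }
}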
// Baseline version: one element per thread.
template <class T>
__global__ void generateValidFlags(unsigned int *oData, const T *iData)
{
    // __mul24: 24-bit multiply, faster than a full 32-bit multiply on G80-class hardware.
    unsigned int iGlobal = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    setFlag(oData, iData, iGlobal);
}
// Test parameters: 4M elements, 128 threads per block, 10000 timing iterations.
int len = 1 << 22;
int num_threads = 128;
int nIters = 10000;
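(With these parameters the baseline kernel launches 1<<22 / 128 = 32768 blocks, and the unrolled one 32768 / 8 = 4096 blocks.)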
void testMultiread(float* d_idata, unsigned int* d_odata, int len, int nRandom)
{
    int bytes = len * sizeof(float);
    float *h_idata = (float*) malloc(bytes);
    cudaMalloc((void**)&d_odata, bytes);    // d_idata is allocated by the caller

    // Initialize to zero, then set nRandom random positions to 1.
    for (int i = 0; i < len; i++)
        h_idata[i] = 0.f;
    for (int i = 0; i < nRandom; i++)
    {
        // 64-bit arithmetic avoids int overflow in rand() * len;
        // dividing by RAND_MAX + 1 keeps pos in [0, len).
        int pos = (int)((long long)rand() * len / ((long long)RAND_MAX + 1));
        h_idata[pos] = 1.f;
    }
    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);

    cudaEvent_t start, end;
    float runTime;
    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&end) );

#define MULTI_READ 1
#if (!MULTI_READ)
    fprintf(stderr, "Call normal version\n");
    dim3 threads(num_threads);
    dim3 grid(len / num_threads);
    generateValidFlags<<<grid, threads>>>(d_odata, d_idata);    // warm-up launch
    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    // execute the kernel
    for (int i = 0; i < nIters; ++i)
        generateValidFlags<<<grid, threads>>>(d_odata, d_idata);
#else
    fprintf(stderr, "Call unrolled version\n");
    dim3 threads(num_threads);
    dim3 grid(len / 8 / num_threads);    // 8 elements per thread -> 1/8 the blocks
    generateValidFlags8<<<grid, threads>>>(d_odata, d_idata);   // warm-up launch
    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    // execute the kernel
    for (int i = 0; i < nIters; ++i)
        generateValidFlags8<<<grid, threads>>>(d_odata, d_idata);
#endif

    CUDA_SAFE_CALL( cudaEventRecord(end, 0) );
    CUDA_SAFE_CALL( cudaEventSynchronize(end) );
    CUDA_SAFE_CALL( cudaEventElapsedTime(&runTime, start, end) );
    runTime /= float(nIters);

    printf("Average time: %f ms\n", runTime);
    // len floats read + len unsigned ints written per launch.
    printf("Bandwidth: %f GiB/s\n\n",
           (2.0 * len * sizeof(float)) / (runTime * 1.0e-3 * 1024 * 1024 * 1024));

    free(h_idata);
}
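(For context, the harness that calls this looks roughly like the following; the names and the nRandom value here are my own, not from the real test file.)
int main()
{
    float *d_idata = 0;
    unsigned int *d_odata = 0;    // testMultiread allocates the output itself
    cudaMalloc((void**)&d_idata, len * sizeof(float));
    testMultiread(d_idata, d_odata, len, len / 4);    // nRandom = len/4 is arbitrary
    cudaFree(d_idata);
    return 0;
}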
I expected the MULTI_READ version to be faster, but it is not: the bandwidth of the normal one-element-per-thread kernel is 60.783174 GiB/s, while the 8-elements-per-thread version reaches only 58.9 GiB/s.
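(As a sanity check on the formula: each launch moves 2 * 2^22 * 4 bytes = 32 MiB, so 60.78 GiB/s corresponds to roughly 0.03125 / 60.78 s ≈ 0.51 ms per launch.)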
The two versions perform the same number of reads and writes and have the same access pattern, but the multi-read/write version does more work per thread, so it should hide latency better and hence be faster. Is that reasoning right? If so, why is it slower here? And if it really is slower, why does CUDPP use this pattern? Can anyone from NVIDIA answer this?
More generally, it is not clear to me when increasing the work per thread improves efficiency. Any ideas are appreciated.