How to determine the right workload per thread (performance technique)

I am trying to understand an optimization technique presented in the SC07 tutorial: better latency hiding by doing more work per thread.

My test is simple and comes from the CUDPP compact source code: generate flags with 8 elements per thread instead of one element per thread.

template<typename T>
__device__ bool isNull(const T elem)
{
    return (elem == 0) ? true : false;
}

template <class T>
__device__ void setFlag(unsigned int *oData,
                        const T *iData,
                        unsigned int idx)
{
    oData[idx] = (isNull(iData[idx])) ? 0 : 1;
}

template <class T>
__global__ void generateValidFlags8(unsigned int *oData,
                                    const T *iData)
{
    unsigned int iGlobal = blockIdx.x * (blockDim.x << 3) + threadIdx.x;

    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
    iGlobal += blockDim.x;
    setFlag(oData, iData, iGlobal);
}
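
(For what it's worth, I believe the manual unrolling above is equivalent to the loop-based sketch below, assuming the compiler fully unrolls the fixed trip count via #pragma unroll; generateValidFlags8Loop is just a name I made up.)

// Sketch only: same indexing as generateValidFlags8, written as a loop.
template <class T>
__global__ void generateValidFlags8Loop(unsigned int *oData,
                                        const T *iData)
{
    unsigned int iGlobal = blockIdx.x * (blockDim.x << 3) + threadIdx.x;

#pragma unroll
    for (int i = 0; i < 8; ++i)
    {
        setFlag(oData, iData, iGlobal);
        iGlobal += blockDim.x;
    }
}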

template <class T>
__global__ void generateValidFlags(unsigned int *oData, const T *iData)
{
    unsigned int iGlobal = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    setFlag(oData, iData, iGlobal);
}

int len = 1 << 22;
int num_threads = 128;
int nIters = 10000;

void testMultiread(float* d_idata, unsigned int* d_odata, int len, int nRandom)
{
    int bytes = len * sizeof(float);
    float *h_idata = (float*) malloc(bytes);
    cudaMalloc((void**)&d_odata, bytes);

    // initialize zero data
    for (int i = 0; i < len; i++)
        h_idata[i] = 0.f;

    // scatter nRandom non-zero elements at random positions
    for (int i = 0; i < nRandom; i++)
    {
        int pos = rand() % len;   // rand() * len overflows int, so use modulo
        h_idata[pos] = 1.f;
    }

    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);

    cudaEvent_t start, end;
    float runTime;
    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&end) );

#define MULTI_READ 1

#if (!MULTI_READ)
    fprintf(stderr, "Call normal version \n");
    dim3 threads(num_threads);
    dim3 grid(len / num_threads);
    generateValidFlags<<<grid, threads>>>(d_odata, d_idata);   // warm-up launch
    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    // execute the kernel
    for (int i = 0; i < nIters; ++i)
        generateValidFlags<<<grid, threads>>>(d_odata, d_idata);
#else
    fprintf(stderr, "Call unrolling version \n");
    dim3 threads(num_threads);
    dim3 grid(len / 8 / num_threads);
    generateValidFlags8<<<grid, threads>>>(d_odata, d_idata);  // warm-up launch
    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    // execute the kernel
    for (int i = 0; i < nIters; ++i)
        generateValidFlags8<<<grid, threads>>>(d_odata, d_idata);
#endif

    CUDA_SAFE_CALL( cudaEventRecord(end, 0) );
    CUDA_SAFE_CALL( cudaEventSynchronize(end) );
    CUDA_SAFE_CALL( cudaEventElapsedTime(&runTime, start, end) );

    runTime /= float(nIters);
    printf("Average time: %f ms\n", runTime);
    printf("Bandwidth:    %f GiB/s\n\n",
           (2 * len * sizeof(float)) / (runTime * 1.0e-3 * 1024 * 1024 * 1024));

    CUDA_SAFE_CALL( cudaEventDestroy(start) );
    CUDA_SAFE_CALL( cudaEventDestroy(end) );
    cudaFree(d_odata);
    free(h_idata);
}
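
(For completeness, I drive the test roughly like this; the main below is just a sketch with an arbitrary nRandom value, not exactly my harness.)

// Sketch of a driver: d_idata is allocated by the caller, while
// testMultiread allocates its own output buffer internally.
int main()
{
    float *d_idata = 0;
    unsigned int *d_odata = 0;
    cudaMalloc((void**)&d_idata, len * sizeof(float));

    testMultiread(d_idata, d_odata, len, len / 2);

    cudaFree(d_idata);
    return 0;
}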

I thought the MULTI_READ version would be faster, but in fact it is not:

the bandwidth with the normal one-element-per-thread version is 60.783174 GiB/s,

while the bandwidth with 8 elements per thread is only 58.9 GiB/s.

The two versions have the same number of reads/writes and the same access pattern. The multi-read/write version does more work per thread, so it should hide latency better and therefore be faster. Is that right? If so, why is it slower? And if it is slower, why is it used in CUDPP? Can anyone from NVIDIA answer this question?

It is not clear to me when we should increase the work per thread to improve efficiency. Any ideas are appreciated.
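
For example, I was thinking of parameterizing the work per thread so I can sweep it with the same harness and see where the crossover is. This is only a sketch; ELEMS_PER_THREAD and generateValidFlagsN are names I made up, not from CUDPP:

// Sketch: templated elements-per-thread so different workloads can be
// compared with the same test harness. Names here are hypothetical.
template <class T, int ELEMS_PER_THREAD>
__global__ void generateValidFlagsN(unsigned int *oData, const T *iData)
{
    unsigned int iGlobal =
        blockIdx.x * (blockDim.x * ELEMS_PER_THREAD) + threadIdx.x;

#pragma unroll
    for (int i = 0; i < ELEMS_PER_THREAD; ++i)
    {
        setFlag(oData, iData, iGlobal);
        iGlobal += blockDim.x;
    }
}

// Launch, e.g. for 8 elements per thread:
//   dim3 threads(num_threads);
//   dim3 grid(len / 8 / num_threads);
//   generateValidFlagsN<float, 8><<<grid, threads>>>(d_odata, d_idata);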