SDK Scan example

Hello!

I’m currently looking for best method to implement a scan. I took at look at the SDK example as well as the algorithm given in GPU Gems 3.

http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html

From what I can tell both approaches differ. Does anyone know a more detailed description of the SDK example? Why was the design given in GPU Gems not chosen? I also noted that the warp synched version (in the SDK) does not use the volatile keyword. From what I learned so far this is necessary to prevent data caching?! Also, when the kernel is called, a normal uint array point is cast to uint4. Can anyone please shed some light on this? Beginner style?!

Thanks!

I think that you mention following code in scan.cu

inline __device__ int warpScanInclusive(int idata, int id, int *s_Data)

{

    int pos = 2 * id - (id &31);

    s_Data[pos] = 0 ;

    pos += 32 ;

    s_Data[pos] = idata ;

for( int offset = 1 ; offset < 32 ; offset <<= 1 ){

        s_Data[pos] += s_Data[pos - offset];

    }

    return s_Data[pos];

}

in fact, compiler (cuda 3.2) does not unroll the loop and

the assembly code (from cuobjdump on sm1.3) looks like

inline __device__ int warpScanInclusive(int idata, int id, int *s_Data)

{

    int pos = 2 * id - (id &31);

    s_Data[pos] = 0 ;

    pos += 32 ;

    s_Data[pos] = idata ;

    int R0 = idata ;

    for( int offset = 1 ; offset < 32 ; offset <<= 1 ){

        R0 += s_Data[pos - offset];

        s_Data[pos] = R0 ;

    }

    return R0;

}

The above code is correct because “s_Data[pos] = R0” is executed.

in fact you can unroll the loop manually

inline __device__ int warpScanInclusive(int idata, int id, volatile int *s_Data)

{

    int pos = 2 * id - (id &31);

    s_Data[pos] = 0 ;

    pos += 32 ;

    s_Data[pos] = idata ;

s_Data[pos] += s_Data[pos - 1];

    s_Data[pos] += s_Data[pos - 2];

    s_Data[pos] += s_Data[pos - 4];

    s_Data[pos] += s_Data[pos - 8];

    s_Data[pos] += s_Data[pos - 16];

return s_Data[pos];

}

and assembly code is

inline __device__ int warpScanInclusive(int idata, int id, volatile int *s_Data)

{

    int pos = 2 * id - (id &31);

    s_Data[pos] = 0 ;

    pos += 32 ;

    s_Data[pos] = idata ;

R2 = s_Data[pos - 1];

    R1 = s_Data[pos] ;

    R1 = R1 + R2 ;

    s_Data[pos] = R1 ;

R2 = s_Data[pos - 2];

    R1 = s_Data[pos] ;

    R1 = R1 + R2 ;

    s_Data[pos] = R1 ;

R2 = s_Data[pos - 4];

    R1 = s_Data[pos] ;

    R1 = R1 + R2 ;

    s_Data[pos] = R1 ;

R2 = s_Data[pos - 8];

    R1 = s_Data[pos] ;

    R1 = R1 + R2 ;

    s_Data[pos] = R1 ;

R2 = s_Data[pos - 16];

    R1 = s_Data[pos] ;

    R1 = R1 + R2 ;

    s_Data[pos] = R1 ;

return s_Data[pos];

}

This is not efficient because instruction “R1 = s_Data[pos] ;” is redudant.

You can try following code, compiler will keep “s_Data[pos] = acc ;” even s_Data is not declared volatile.

inline __device__ int warpScanInclusive(int idata, int id, int *s_Data)

{

    int pos = 2 * id - (id &31);

    s_Data[pos] = 0 ;

    pos += 32 ;

    s_Data[pos] = idata ;

int acc = idata + s_Data[pos - 1];

    s_Data[pos] = acc ;

    acc += s_Data[pos - 2];

    s_Data[pos] = acc ;

    acc += s_Data[pos - 4];

    s_Data[pos] = acc ;

    acc += s_Data[pos - 8];

    s_Data[pos] = acc;

    acc += s_Data[pos - 16];

    s_Data[pos] = acc ;

    return s_Data[pos];

}

I think that volatile keyword guarantees that “s_Data[pos] += s_Data[pos] + s_Data[pos-1]”

becomes four-steps

R1 = s_Data[pos];

    R2 = s_Data[pos-1];

    R1 = R1 + R2;

    s_Data[pos] = R1;

But this may not be an efficient way.

If fact, if compiler can always write-back L-value, then we don’t need volatile keyword.