I think that you mention following code in scan.cu
inline __device__ int warpScanInclusive(int idata, int id, int *s_Data)
{
int pos = 2 * id - (id &31);
s_Data[pos] = 0 ;
pos += 32 ;
s_Data[pos] = idata ;
for( int offset = 1 ; offset < 32 ; offset <<= 1 ){
s_Data[pos] += s_Data[pos - offset];
}
return s_Data[pos];
}
in fact, compiler (cuda 3.2) does not unroll the loop and
the assembly code (from cuobjdump on sm1.3) looks like
inline __device__ int warpScanInclusive(int idata, int id, int *s_Data)
{
int pos = 2 * id - (id &31);
s_Data[pos] = 0 ;
pos += 32 ;
s_Data[pos] = idata ;
int R0 = idata ;
for( int offset = 1 ; offset < 32 ; offset <<= 1 ){
R0 += s_Data[pos - offset];
s_Data[pos] = R0 ;
}
return R0;
}
The above code is correct because “s_Data[pos] = R0” is executed.
in fact you can unroll the loop manually
inline __device__ int warpScanInclusive(int idata, int id, volatile int *s_Data)
{
int pos = 2 * id - (id &31);
s_Data[pos] = 0 ;
pos += 32 ;
s_Data[pos] = idata ;
s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
s_Data[pos] += s_Data[pos - 8];
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
}
and assembly code is
inline __device__ int warpScanInclusive(int idata, int id, volatile int *s_Data)
{
int pos = 2 * id - (id &31);
s_Data[pos] = 0 ;
pos += 32 ;
s_Data[pos] = idata ;
R2 = s_Data[pos - 1];
R1 = s_Data[pos] ;
R1 = R1 + R2 ;
s_Data[pos] = R1 ;
R2 = s_Data[pos - 2];
R1 = s_Data[pos] ;
R1 = R1 + R2 ;
s_Data[pos] = R1 ;
R2 = s_Data[pos - 4];
R1 = s_Data[pos] ;
R1 = R1 + R2 ;
s_Data[pos] = R1 ;
R2 = s_Data[pos - 8];
R1 = s_Data[pos] ;
R1 = R1 + R2 ;
s_Data[pos] = R1 ;
R2 = s_Data[pos - 16];
R1 = s_Data[pos] ;
R1 = R1 + R2 ;
s_Data[pos] = R1 ;
return s_Data[pos];
}
This is not efficient because instruction “R1 = s_Data[pos] ;” is redudant.
You can try following code, compiler will keep “s_Data[pos] = acc ;” even s_Data is not declared volatile.
inline __device__ int warpScanInclusive(int idata, int id, int *s_Data)
{
int pos = 2 * id - (id &31);
s_Data[pos] = 0 ;
pos += 32 ;
s_Data[pos] = idata ;
int acc = idata + s_Data[pos - 1];
s_Data[pos] = acc ;
acc += s_Data[pos - 2];
s_Data[pos] = acc ;
acc += s_Data[pos - 4];
s_Data[pos] = acc ;
acc += s_Data[pos - 8];
s_Data[pos] = acc;
acc += s_Data[pos - 16];
s_Data[pos] = acc ;
return s_Data[pos];
}
I think that volatile keyword guarantees that “s_Data[pos] += s_Data[pos] + s_Data[pos-1]”
becomes four-steps
R1 = s_Data[pos];
R2 = s_Data[pos-1];
R1 = R1 + R2;
s_Data[pos] = R1;
But this may not be an efficient way.
If fact, if compiler can always write-back L-value, then we don’t need volatile keyword.