*** nevermind - found the bug ***
*** it was in the for loop, left the updated version for reference ***
I wrote a kernel that calculates the per-line maximum and minimum of an array of floats using a per-line reduction, i.e. one block per line.
The kernel runs on a 512x512 array with 128 threads per block (blockDim.x = 128) and 512 blocks in the grid (gridDim.y = 512).
It is mostly a rip-off of the SDK reduction sample, and it runs in 6.5 ms on a GT240, which works out to about 150 MB/s.
Looking at the Visual Profiler also seems to show that it is doing way, way too many global memory accesses.
Any chance that someone can tell me where the extra memory accesses are happening?
Thanks
The kernel:
[codebox]
template <class T, int blockSz>
__global__ void MinMax_kernel(const T *data, size_t stride,
                              T *maxVec, T *minVec, int width, int height)
{
    __shared__ T shmax[blockSz];
    __shared__ T shmin[blockSz];

    int ell  = threadIdx.x;
    int line = blockIdx.y;
    T m;
    T M;

    // point at the start of this block's line (stride is in bytes)
    data = (const T *)((const char *)data + line * stride);

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    m = M = data[ell];
    for (int i = blockDim.x + ell; i < width; i += blockDim.x)
    {
        T val = data[i];
        M = max(M, val);
        m = min(m, val);
    }
    shmin[ell] = m;
    shmax[ell] = M;
    __syncthreads();
    // do reduction in shared mem
    if (blockSz >= 512)
    {
        if (ell >= 256) return;
        shmax[ell] = M = max(M, shmax[ell + 256]);
        shmin[ell] = m = min(m, shmin[ell + 256]);
        __syncthreads();
    }
    if (blockSz >= 256)
    {
        if (ell >= 128) return;
        shmax[ell] = M = max(M, shmax[ell + 128]);
        shmin[ell] = m = min(m, shmin[ell + 128]);
        __syncthreads();
    }
    if (blockSz >= 128)
    {
        if (ell >= 64) return;
        shmax[ell] = M = max(M, shmax[ell + 64]);
        shmin[ell] = m = min(m, shmin[ell + 64]);
        __syncthreads();
    }
    if (ell >= 32)
        return;

    // now that we are using warp-synchronous programming (below)
    // we need to declare our shared memory volatile so that the compiler
    // doesn't reorder stores to it and induce incorrect behavior.
    volatile T *vshmax = shmax;
    volatile T *vshmin = shmin;
    if (blockSz >= 64)
    {
        vshmax[ell] = M = max(M, vshmax[ell + 32]);
        vshmin[ell] = m = min(m, vshmin[ell + 32]);
    }
    if (blockSz >= 32)
    {
        vshmax[ell] = M = max(M, vshmax[ell + 16]);
        vshmin[ell] = m = min(m, vshmin[ell + 16]);
    }
    if (blockSz >= 16)
    {
        vshmax[ell] = M = max(M, vshmax[ell + 8]);
        vshmin[ell] = m = min(m, vshmin[ell + 8]);
    }
    if (blockSz >= 8)
    {
        vshmax[ell] = M = max(M, vshmax[ell + 4]);
        vshmin[ell] = m = min(m, vshmin[ell + 4]);
    }
    if (blockSz >= 4)
    {
        vshmax[ell] = M = max(M, vshmax[ell + 2]);
        vshmin[ell] = m = min(m, vshmin[ell + 2]);
    }
    if (blockSz >= 2)
    {
        M = max(M, vshmax[ell + 1]);
        m = min(m, vshmin[ell + 1]);
    }

    // write result for this block's line to global mem
    if (ell)
        return;
    maxVec[line] = M;
    minVec[line] = m;
}
[/codebox]
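For completeness, here is a minimal host-side sketch of how the kernel gets launched (the buffer names, the pitched allocation, and the missing data upload/readback are just placeholders here, not my exact host code):
[codebox]
#include <cuda_runtime.h>

// minimal launch sketch: 128 threads per block, one block per line (gridDim.y = 512)
int main()
{
    const int width   = 512;
    const int height  = 512;
    const int blockSz = 128;

    float *d_data, *d_max, *d_min;
    size_t stride;
    cudaMallocPitch((void **)&d_data, &stride, width * sizeof(float), height);
    cudaMalloc((void **)&d_max, height * sizeof(float));
    cudaMalloc((void **)&d_min, height * sizeof(float));

    dim3 block(blockSz, 1);   // blockDim.x = 128
    dim3 grid(1, height);     // gridDim.y = 512, i.e. one block per line
    MinMax_kernel<float, blockSz><<<grid, block>>>(d_data, stride,
                                                   d_max, d_min,
                                                   width, height);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    cudaFree(d_max);
    cudaFree(d_min);
    return 0;
}
[/codebox]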
The Visual Profiler reports the following (the main thing that looks weird is the gld counts):
grid size x       1
grid size y       512
block size x      128
block size y      1
shared mem        1064
registers         7
occupancy         1
branch            62706
divergent branch  5250
instructions      522168
warp serialize    0
gld 32b           158720
gld 64b           166668
gld 128b          154800
gst 32b           258
gst 64b           0
gst 128b          0
Considering the array is only 512*512*4 = 1048576 bytes (262144 words), where are all the loads coming from?
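For reference, the back-of-the-envelope arithmetic behind that question (my own rough numbers, nothing taken from the profiler docs):
[codebox]
// rough check of expected vs. reported global load transactions
#include <stdio.h>

int main()
{
    const long bytes = 512L * 512 * 4;          // 1048576 bytes = 262144 words
    printf("ideal 128-byte transactions: %ld\n", bytes / 128);  // 8192
    printf("ideal  32-byte transactions: %ld\n", bytes / 32);   // 32768

    // sum of the gld 32b/64b/128b counters above
    const long gld = 158720L + 166668 + 154800;                 // 480188
    printf("reported gld transactions:   %ld\n", gld);
    return 0;
}
[/codebox]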