why is this max/min reduction kernel way too slow?


*** nevermind - found the bug ***

*** it was in the for loop, left the updated version for reference ***


I wrote a kernel that calculates the line maximum and minimum of an array of floats using per line reduction. That is one block per line

The kernel runs on a 512x512 array with 128 threads per block (blockDim.x = 128) and 512 blocks per grid (gridDim.y = 512)

This is mostly a ripoff from the SDK reduction and it runs at 6.5 ms on a gt240, which is about 150MB/s

looking at the visual profiler also seems to show that it is doing way way way too many global memory accesses

Any chance that someone can tell me where the extra memory accesses are happening?

Thanks

The kernel:

[codebox]

// Per-row min/max reduction: one block per row.
// Launch geometry: gridDim.y == height (one block per line), blockDim.x == blockSz.
// blockSz must be a power of two (the template unrolling below assumes it).
// `stride` is the row pitch in BYTES (cudaMallocPitch-style); `width` is the
// row length in elements. Writes the row maximum to maxVec[line] and the row
// minimum to minVec[line].
template <class T, int blockSz>
__global__ void MinMax_kernel(const T * data, size_t stride,
                              T *maxVec, T *minVec, int width, int height)
{
    __shared__ T shmax[blockSz];
    __shared__ T shmin[blockSz];

    int ell  = threadIdx.x;  // thread's slot within the block
    int line = blockIdx.y;   // row handled by this block

    // Step to this block's row; byte arithmetic honours the pitch.
    // Keep const on the pointer -- the original cast silently dropped it.
    data = (const T *)((const char *)data + line * stride);

    // First level of reduction: each thread strides over the row with step
    // blockDim.x, keeping running min/max in registers. (This is the loop
    // that was originally `i++`, which made every thread re-read almost the
    // whole row -- the source of the excess global loads in the profile.)
    T m = data[ell];
    T M = m;
    for (int i = blockDim.x + ell; i < width; i += blockDim.x)
    {
        T val = data[i];
        M = max(M, val);
        m = min(m, val);
    }

    shmin[ell] = m;
    shmax[ell] = M;
    __syncthreads();

    // Tree reduction in shared memory. Guard with `if (ell < n)` instead of
    // early-returning: a thread exiting before __syncthreads() is undefined
    // behaviour -- every thread in the block must reach each barrier.
    if (blockSz >= 512)
    {
        if (ell < 256)
        {
            shmax[ell] = M = max(M, shmax[ell + 256]);
            shmin[ell] = m = min(m, shmin[ell + 256]);
        }
        __syncthreads();
    }
    if (blockSz >= 256)
    {
        if (ell < 128)
        {
            shmax[ell] = M = max(M, shmax[ell + 128]);
            shmin[ell] = m = min(m, shmin[ell + 128]);
        }
        __syncthreads();
    }
    if (blockSz >= 128)
    {
        if (ell < 64)
        {
            shmax[ell] = M = max(M, shmax[ell + 64]);
            shmin[ell] = m = min(m, shmin[ell + 64]);
        }
        __syncthreads();
    }

    // Final warp: classic warp-synchronous unrolled reduction. The volatile
    // pointers stop the compiler from caching/reordering the shared stores.
    // NOTE(review): the implicit lockstep this relies on is only valid
    // pre-Volta; on SM70+ insert __syncwarp() between steps (or rewrite with
    // __shfl_down_sync).
    if (ell < 32)
    {
        volatile T * vshmax = shmax;
        volatile T * vshmin = shmin;
        if (blockSz >= 64)
        {
            vshmax[ell] = M = max(M, vshmax[ell + 32]);
            vshmin[ell] = m = min(m, vshmin[ell + 32]);
        }
        if (blockSz >= 32)
        {
            vshmax[ell] = M = max(M, vshmax[ell + 16]);
            vshmin[ell] = m = min(m, vshmin[ell + 16]);
        }
        if (blockSz >= 16)
        {
            vshmax[ell] = M = max(M, vshmax[ell + 8]);
            vshmin[ell] = m = min(m, vshmin[ell + 8]);
        }
        if (blockSz >= 8)
        {
            vshmax[ell] = M = max(M, vshmax[ell + 4]);
            vshmin[ell] = m = min(m, vshmin[ell + 4]);
        }
        if (blockSz >= 4)
        {
            vshmax[ell] = M = max(M, vshmax[ell + 2]);
            vshmin[ell] = m = min(m, vshmin[ell + 2]);
        }
        if (blockSz >= 2)
        {
            M = max(M, vshmax[ell + 1]);
            m = min(m, vshmin[ell + 1]);
        }

        // Thread 0 holds the row result; write it out.
        if (ell == 0)
        {
            maxVec[line] = M;
            minVec[line] = m;
        }
    }
}

[/codebox]

Visual profiler says (main thing that looks weird is gld)

gridSize x 1

grid size y 512

block size x 128

block size y 1

shared mem 1064

registers 7

occupancy 1

branch 62706

divergent branch 5250

instructions 522168

warp serialize 0

gld 32b 158720

gld 64b 166668

gld 128b 154800

gst 32b 258

gst 64b 0

gst 128b 0

considering there are 512\*512\*4 = 1048576 bytes (262144 words), where are all the loads coming from?


*** nevermind - found the bug ***

*** it was in the for loop, left the updated version for reference ***


Change
[font=“Courier New”] for (int i = blockDim.x + ell; i < width ; i++)[/font]
into
[font=“Courier New”] for (int i = blockDim.x + ell; i < width ; i+=blockDim.x)[/font]
to avoid reprocessing data multiple times…

EDIT: seems I’m too late.