WARP Level Sorting Efficient?

I motified avi’s code to choose the largest element of an array

// Use first warp of block to compute parallel reduction on the

		// partial sum in shared memory.

		if (threadIdx.x < 32) {

			#pragma unroll 

			for(int i=32; i<blockDim.x; i+=32)

			{ 

				if (shared_buff[threadIdx.x] < shared_buff[threadIdx.x+i])

					shared_buff[threadIdx.x] = shared_buff[threadIdx.x+i];

			}

		}

		

		if ((threadIdx.x < 16)&&(shared_buff[threadIdx.x] < shared_buff[threadIdx.x+16])) { shared_buff[threadIdx.x] = shared_buff[threadIdx.x+16]; }

		if ((threadIdx.x < 8)&&(shared_buff[threadIdx.x] < shared_buff[threadIdx.x+8]))  { shared_buff[threadIdx.x] = shared_buff[threadIdx.x+8]; }

		if ((threadIdx.x < 4)&&(shared_buff[threadIdx.x] < shared_buff[threadIdx.x+4]))  { shared_buff[threadIdx.x] = shared_buff[threadIdx.x+4]; }

		if ((threadIdx.x < 2)&&(shared_buff[threadIdx.x] < shared_buff[threadIdx.x+2]))  { shared_buff[threadIdx.x] = shared_buff[threadIdx.x+2]; }

				 if (threadIdx.x == 0)

		{

					   if (shared_buff[threadIdx.x] < shared_buff[threadIdx.x + 1]) { shared_buff[threadIdx.x] = shared_buff[threadIdx.x + 1]; 

				}

due to all the comparison and if conditions is this code efficient of cuda?

There are max and min functions in the CUDA math library, and if you don’t know what the C ternary operator is, now is the time to learn. And just to blow you mind a little, if you aren’t going to reuse the contents of the shared memory array after the reduction, those threadIdx.x < {16,8,4,2} tests aren’t strictly necessary either.

A stupid question but isn’t the C ternary operator just an aesthetic difference or does it improve code efficiency?

Take a look at the following two trivial kernels, which are functionally the same - they both find the row wise maximum of three input arrays and compute the square root of it:

__global__ void branchTest0(float *a, float *b, float *c, float *d)

{

	float z0, z1;

	unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

	float aval = a[tidx], bval = b[tidx], cval = c[tidx];

	z0 = (aval > bval) ? aval : bval;

	z1 = (z0 > cval) ? z0 : cval;

	d[tidx] = sqrt(z1);

}

__global__ void branchTest1(float *a, float *b, float *c, float *d)

{

	float z0;

	unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

	float aval = a[tidx], bval = b[tidx], cval = c[tidx];

	if ( (aval > bval) && (aval > cval) ) {

		z0 = aval;

	} else if ( (aval < cval) && (bval < cval) ) {

		z0 = cval;

	} else {

		z0 = bval;

	}

	d[tidx] = sqrt(z0);

The only difference between them is one uses a pair of ternary operators and the other uses a pair of if{} statements to find the maximum. One is clearly a bit more elegant looking than the other, but you might not imagine there would be much difference in the code produced. So bring on the compiler…

The logical and computation part of branchTest0 compiles to this PTX:

max.f32 	%f4, %f2, %f3;

	max.f32 	%f5, %f1, %f4;

	sqrt.approx.f32 	%f6, %f5;

The logic and computation part of brachTest1 compiles to this PTX:

set.gt.u32.f32 	%r4, %f1, %f2;

	neg.s32 	%r5, %r4;

	set.gt.u32.f32 	%r6, %f1, %f3;

	neg.s32 	%r7, %r6;

	and.b32 	%r8, %r5, %r7;

	mov.u32 	%r9, 0;

	setp.eq.s32 	%p1, %r8, %r9;

	@%p1 bra 	$Lt_1_2562;

	.loc	16	22	0

	mov.f32 	%f4, %f1;

	bra.uni 	$Lt_1_2306;

$Lt_1_2562:

	.loc	16	23	0

	set.lt.u32.f32 	%r10, %f1, %f2;

	neg.s32 	%r11, %r10;

	set.lt.u32.f32 	%r12, %f2, %f3;

	neg.s32 	%r13, %r12;

	and.b32 	%r14, %r11, %r13;

	neg.s32 	%r15, %r14;

	slct.f32.s32 	%f4, %f2, %f3, %r15;

$Lt_1_2306:

	.loc	16	29	0

	sqrt.approx.f32 	%f5, %f4;

I will leave it as an exercise for the reader to decide which code might be more efficient.

I thought of it as an elegant operator that performed a if-else chain. Having seen your example, thank you avidday, it seems that the back-forth osrting bit of my LDPC decoder is rather divergent as opposed to slight divergent.

Well, you don’t expect nvcc to solve an undecidable problem, do you? ;)

BTW, they are not so equivalent (once compiled)…

#include <stdio.h>

#include <math.h>

__global__ void branchTest0(float *a, float *b, float *c, float *d)

{

	float z0, z1;

	unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

	float aval = a[tidx], bval = b[tidx], cval = c[tidx];

	z0 = (aval > bval) ? aval : bval;

	z1 = (z0 > cval) ? z0 : cval;

	d[tidx] = sqrt(z1);

}

__global__ void branchTest1(float *a, float *b, float *c, float *d)

{

	float z0;

	unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

	float aval = a[tidx], bval = b[tidx], cval = c[tidx];

	if ( (aval > bval) && (aval > cval) ) {

		z0 = aval;

	} else if ( (aval < cval) && (bval < cval) ) {

		z0 = cval;

	} else {

		z0 = bval;

	}

	d[tidx] = sqrt(z0);

}

__global__ void branchTest2(float *a, float *b, float *c, float *d)

{

	float z0, z1;

	unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

	float aval = a[tidx], bval = b[tidx], cval = c[tidx];

	if(aval > bval)

		z0 = aval;

	else

		z0 = bval;

	if(z0 > cval)

		z1 = z0;

	else

		z1 = cval;

	d[tidx] = sqrt(z1);

}

int main()

{

	float * da, *db, *dc, *dd;

	cudaMalloc((void**)(&da), sizeof(float));

	cudaMalloc((void**)(&db), sizeof(float));

	cudaMalloc((void**)(&dc), sizeof(float));

	cudaMalloc((void**)(&dd), sizeof(float));

	

	float a = 0;

	float b = nanf("");

	float c = 0;

	float d;

	cudaMemcpy(da, &a, sizeof(float), cudaMemcpyHostToDevice);

	cudaMemcpy(db, &b, sizeof(float), cudaMemcpyHostToDevice);

	cudaMemcpy(dc, &c, sizeof(float), cudaMemcpyHostToDevice);

	branchTest0<<<1, 1>>>(da, db, dc, dd);

	cudaMemcpy(&d, dd, sizeof(float), cudaMemcpyDeviceToHost);

	printf("branchTest0(%f, %f, %f) = %f\n", a, b, c, d);

	branchTest1<<<1, 1>>>(da, db, dc, dd);

	cudaMemcpy(&d, dd, sizeof(float), cudaMemcpyDeviceToHost);

	printf("branchTest1(%f, %f, %f) = %f\n", a, b, c, d);

	

	branchTest2<<<1, 1>>>(da, db, dc, dd);

	cudaMemcpy(&d, dd, sizeof(float), cudaMemcpyDeviceToHost);

	printf("branchTest2(%f, %f, %f) = %f\n", a, b, c, d);

	return 0;

}

outputs:

branchTest0(0.000000, nan, 0.000000) = 0.000000

branchTest1(0.000000, nan, 0.000000) = nan

branchTest2(0.000000, nan, 0.000000) = 0.000000

Bottom line: if you want to compute a maximum, just use the (f)max(f) function and don’t rely on unsafe compiler optimizations, which might disappear the day anyone starts complaining about IEEE-754 (non-)compliance… :/

[quote name=‘avidday’ post=‘1035457’ date=‘Apr 7 2010, 06:06 PM’]

Take a look at the following two trivial kernels, which are functionally the same - they both find the row wise maximum of three input arrays and compute the square root of it:

[codebox]__global__ void branchTest0(float *a, float *b, float *c, float *d)

{

float z0, z1;

unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

float aval = a[tidx], bval = b[tidx], cval = c[tidx];

z0 = (aval > bval) ? aval : bval;

z1 = (z0 > cval) ? z0 : cval;

d[tidx] = sqrt(z1);

}

global void branchTest1(float *a, float *b, float *c, float *d)

{

float z0, z1;

unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

float aval = a[tidx], bval = b[tidx], cval = c[tidx];

if (aval > bval)

	z0 = aval;

else

	z0 = bval;

	

if (z0 > cval)

	z1 = z0;

else

	z1 = cval;

d[tidx] = sqrt(z1);

}[/codebox]

Which now compiles to

[codebox]

max.f32 	%f4, %f2, %f3;

max.f32 	%f5, %f1, %f4;

sqrt.approx.f32 	%f6, %f5;

[/codebox]

vs

[codebox]

setp.gt.f32 	%p1, %f1, %f2;

selp.f32 	%f4, %f1, %f2, %p1;

setp.gt.f32 	%p2, %f4, %f3;

selp.f32 	%f5, %f4, %f3, %p2;

sqrt.approx.f32 	%f6, %f5;

[/codebox]

The most interesting part of is that the compiler recognizes the max operation and substitutes it for the ?: operator.

If I change the source so the substitution doesn’t work anymore by doing

[codebox]

z0 = (aval == bval) ? aval : bval;

z1 = (z0 == cval) ? z0 : cval;

[/codebox]

(replace the > with the == operator, the same for the if case) they both compile to exactly the same ptx code

[codebox]

setp.eq.f32 	%p1, %f1, %f2;

selp.f32 	%f4, %f1, %f2, %p1;

setp.eq.f32 	%p2, %f4, %f3;

selp.f32 	%f5, %f4, %f3, %p2;

sqrt.approx.f32 	%f6, %f5;

[/codebox]

[quote name=‘avidday’ post=‘1035457’ date=‘Apr 7 2010, 06:06 PM’]

Take a look at the following two trivial kernels, which are functionally the same - they both find the row wise maximum of three input arrays and compute the square root of it:

[codebox]__global__ void branchTest0(float *a, float *b, float *c, float *d)

{

float z0, z1;

unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

float aval = a[tidx], bval = b[tidx], cval = c[tidx];

z0 = (aval > bval) ? aval : bval;

z1 = (z0 > cval) ? z0 : cval;

d[tidx] = sqrt(z1);

}

global void branchTest1(float *a, float *b, float *c, float *d)

{

float z0, z1;

unsigned int tidx = threadIdx.x + blockDim.x*blockIdx.x;

float aval = a[tidx], bval = b[tidx], cval = c[tidx];

if (aval > bval)

	z0 = aval;

else

	z0 = bval;

	

if (z0 > cval)

	z1 = z0;

else

	z1 = cval;

d[tidx] = sqrt(z1);

}[/codebox]

Which now compiles to

[codebox]

max.f32 	%f4, %f2, %f3;

max.f32 	%f5, %f1, %f4;

sqrt.approx.f32 	%f6, %f5;

[/codebox]

vs

[codebox]

setp.gt.f32 	%p1, %f1, %f2;

selp.f32 	%f4, %f1, %f2, %p1;

setp.gt.f32 	%p2, %f4, %f3;

selp.f32 	%f5, %f4, %f3, %p2;

sqrt.approx.f32 	%f6, %f5;

[/codebox]

The most interesting part of is that the compiler recognizes the max operation and substitutes it for the ?: operator.

If I change the source so the substitution doesn’t work anymore by doing

[codebox]

z0 = (aval == bval) ? aval : bval;

z1 = (z0 == cval) ? z0 : cval;

[/codebox]

(replace the > with the == operator, the same for the if case) they both compile to exactly the same ptx code

[codebox]

setp.eq.f32 	%p1, %f1, %f2;

selp.f32 	%f4, %f1, %f2, %p1;

setp.eq.f32 	%p2, %f4, %f3;

selp.f32 	%f5, %f4, %f3, %p2;

sqrt.approx.f32 	%f6, %f5;

[/codebox]

:-)… But sadly, the compiler expects the poor programmer to solve it… Isn’t it?