Reduction from SDK

Hello there,

does anyone have a snippet that will utilize one of the reduction kernels from the SDK.

I cannot get anything simple to work, Its probably because ive stared at it for 2 days straight.

im generating a simple array using

int size=1<<5

	int maxThreads = 128;  // number of threads per block

	int whichKernel = 6;

	int maxBlocks = 64;	

	T gpu_result = 0;

	.

	.

	.

	for(int i=0; i<size; i++) 

	{

		h_idata[i] = 1;

	}

	T cpu_result = reduceCPU<T>(h_idata, size);

	printf("CPU result = %d\n", cpu_result);

	printf("Size = %10d\n", size);

	int numBlocks = 0;

	int numThreads = 0;

	getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

	// allocate mem for the result on host side

	T* h_odata = (T*) malloc(numBlocks*sizeof(T));

	printf("%d blocks\n", numBlocks);

	// allocate device memory and data

	T* d_idata = NULL;

	T* d_odata = NULL;

	cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );

	cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

	// copy data directly to device memory

	cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );

	cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

	// choose kernel

	if (datatype == REDUCE_DOUBLE)

		reduce_sm13<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

	else

		reduce_sm10<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

	

	cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );

		printf("RESULT = %d\n",gpu_result);

my idea being it that the sum of the array will be equal to the size.

but I always end up with something weird.

If anyone has any help or pointer please let me know while i go nurse my headache >.<

Thanks

OK, i got rid of my headache and i have it sort of working,

but now the problem is that it will only come back with a result if the size of the array is 256 or more.

this is what i have… anyone have any ideas?

int main( int argc, char** argv) 

{

	char *typeChoice;

	cutGetCmdLineArgumentstr( argc, (const char**) argv, "type", &typeChoice);

	if (0 == typeChoice)

	{

		typeChoice = (char*)malloc(4 * sizeof(char));

		strcpy(typeChoice, "int");

	}

	ReduceType datatype = REDUCE_INT;

	if (!strcasecmp(typeChoice, "float"))

		datatype = REDUCE_FLOAT;

	else if (!strcasecmp(typeChoice, "double"))

		datatype = REDUCE_DOUBLE;

	else

		datatype = REDUCE_INT;

	printf("Reducing array of type %s.\n", typeChoice);

	cudaDeviceProp deviceProp;

	deviceProp.major = 1;

	deviceProp.minor = 0;

	int desiredMinorRevision = 0;

	if (datatype == REDUCE_DOUBLE)

	{

		deviceProp.minor = 3;

		desiredMinorRevision = 3;

	}

	int dev;

	cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp));

	cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));

	if(deviceProp.major > 1 || deviceProp.minor >= desiredMinorRevision)

	{

		printf("Using Device %d: \"%s\"\n", dev, deviceProp.name);

		cutilSafeCallNoSync(cudaSetDevice(dev));

	}

	else if (desiredMinorRevision == 3)

	{

		printf("There is no device supporting compute capability %d.%d.\n\n",

			1, desiredMinorRevision);

		printf("TEST PASSED");

		cudaThreadExit();

		cutilExit(argc, argv);

	}

	switch (datatype)

	{

	default:

	case REDUCE_INT:

		runTest<int>( argc, argv, datatype);

		break;

	case REDUCE_FLOAT:

		runTest<float>( argc, argv, datatype);

		break;

	case REDUCE_DOUBLE:

		runTest<double>( argc, argv, datatype);

		break;

	}

	cudaThreadExit();

	cutilExit(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Compute sum reduction on CPU

//! We use Kahan summation for an accurate sum of large arrays.

//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm

//! 

//! @param data	   pointer to input data

//! @param size	   number of input data elements

////////////////////////////////////////////////////////////////////////////////

template<class T>

T reduceCPU(T *data, int size)

{

	T sum = data[0];

	T c = (T)0.0;			  

	for (int i = 1; i < size; i++)

	{

		T y = data[i] - c;  

		T t = sum + y;	  

		c = (t - sum) - y;  

		sum = t;			

	}

	return sum;

}

////////////////////////////////////////////////////////////////////////////////

// Compute the number of threads and blocks to use for the given reduction kernel

// For the kernels >= 3, we set threads / block to the minimum of maxThreads and

// n/2. For kernels < 3, we set to the minimum of maxThreads and n.  For kernel 

// 6, we observe the maximum specified number of blocks, because each thread in 

// that kernel can process a variable number of elements.

////////////////////////////////////////////////////////////////////////////////

void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)

{

	if (whichKernel < 3)

	{

		threads = (n < maxThreads) ? n : maxThreads;

		blocks = n / threads;

		if(blocks<1)

		{

			blocks=1;

		}

	}

	else

	{

		if (n == 1) 

			threads = 1;

		else

			threads = (n < maxThreads*2) ? n / 2 : maxThreads;

		blocks = n / (threads * 2);

		if (whichKernel == 6)

			blocks = min(maxBlocks, blocks);

	}

}

template <class T> void runTest( int argc, char** argv, ReduceType datatype) 

{

	int size = 256;	// number of elements to reduce (MIN=16)

	//int size = 1<<4;

	int maxThreads = 128;  // number of threads per block

	int whichKernel = 6;

	int maxBlocks = 64;

	int numBlocks = 0;

	int numThreads = 0;

	T gpu_result = 0;

	bool needReadBack = true;

	bool useSM13 = (datatype == REDUCE_DOUBLE); 

	printf("%d elements\n", size);

	printf("%d threads (max)\n", maxThreads);

	// create pointer and array

	unsigned int bytes = size * sizeof(T);

	T *h_idata = (T *) malloc(bytes);

	T sum=0;

	// Generate Data

	for(int i=0; i<size; i++) 

	{

		h_idata[i] = (T)i;//(T)i;

		sum=sum+h_idata[i];

	}

	// Calc CPU Result

	T cpu_result = reduceCPU<T>(h_idata, size);

	//Quick summary

	printf("CPU result = %d\n", cpu_result);

	printf("Size = %10d\n", size);

	printf("SUM = %10d\n", sum);

	// get number of blocks and theads

	getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

	// allocate mem for the result on host side

	T* h_odata = (T*) malloc(numBlocks*sizeof(T));

	printf("%d blocks\n", numBlocks);

	// allocate device memory and data

	T* d_idata = NULL;

	T* d_odata = NULL;

	cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );

	cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

	// copy data directly to device memory

	cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );

	cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

	// Choose Kernal to run

	if (datatype == REDUCE_DOUBLE)

		reduce_sm13<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

	else

		reduce_sm10<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

	// copy result back from Device

	cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, bytes, cudaMemcpyDeviceToHost) );

	//printf("RESULT = %d\n",gpu_result);

	

	// Add up all instanciated blocks

	// - This needs to be optimized for larger reductions requiring more blocks

	sum=0;

	printf("Number of Blocks: %d\n",numBlocks);

	if(numBlocks==1)

	{

		cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );

	}

	else

	{

		for(int i=0; i<numBlocks; i++) 

		{

			sum=sum+h_odata[i];

			printf("Block Results: %d\n",h_odata[i]);

		}

		gpu_result=sum;

	}

	

	// Print Result

	printf("FINAL RESULT = %d\n", gpu_result);

	printf("  CPU RESULT = %d\n", cpu_result);

	// cleanup

	//free(h_idata);

	//free(h_odata);

	cutilSafeCallNoSync(cudaFree(d_idata));

	cutilSafeCallNoSync(cudaFree(d_odata));

	system("pause");

}

this is all in my Reduction project, which includes the SDK’s reduction_kernel.cu, reduction_kernel_sm10.cu and reduction_kernel_sm13.cu

Maybe it’s because your maxthreads is set to 128? If the array size is less, maybe some of the threads are reading out-of-bounds memory or something?

In any case, I don’t think that it’s worth doing a reduction on the GPU with fewer than a few thousand elements (other than just to experiment with CUDA, of course).