OK, I got rid of my headache and I have it sort of working, but now the problem is that it will only come back with a result if the size of the array is 256 or more.
This is what I have… anyone have any ideas?
int main(int argc, char** argv)
{
    // Pick the reduction element type from the command line (defaults to int).
    char *typeChoice;
    cutGetCmdLineArgumentstr(argc, (const char**) argv, "type", &typeChoice);
    if (0 == typeChoice)
    {
        typeChoice = (char*) malloc(4 * sizeof(char));
        strcpy(typeChoice, "int");
    }

    ReduceType datatype = REDUCE_INT;
    if (!strcasecmp(typeChoice, "float"))
        datatype = REDUCE_FLOAT;
    else if (!strcasecmp(typeChoice, "double"))
        datatype = REDUCE_DOUBLE;
    else
        datatype = REDUCE_INT;

    printf("Reducing array of type %s.\n", typeChoice);

    // Doubles need a compute capability 1.3 device, so choose a device accordingly.
    cudaDeviceProp deviceProp;
    deviceProp.major = 1;
    deviceProp.minor = 0;
    int desiredMinorRevision = 0;
    if (datatype == REDUCE_DOUBLE)
    {
        deviceProp.minor = 3;
        desiredMinorRevision = 3;
    }

    int dev;
    cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp));
    cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
    if (deviceProp.major > 1 || deviceProp.minor >= desiredMinorRevision)
    {
        printf("Using Device %d: \"%s\"\n", dev, deviceProp.name);
        cutilSafeCallNoSync(cudaSetDevice(dev));
    }
    else if (desiredMinorRevision == 3)
    {
        printf("There is no device supporting compute capability %d.%d.\n\n",
               1, desiredMinorRevision);
        printf("TEST PASSED");
        cudaThreadExit();
        cutilExit(argc, argv);
    }

    // Run the templated test for the chosen element type.
    switch (datatype)
    {
    default:
    case REDUCE_INT:
        runTest<int>(argc, argv, datatype);
        break;
    case REDUCE_FLOAT:
        runTest<float>(argc, argv, datatype);
        break;
    case REDUCE_DOUBLE:
        runTest<double>(argc, argv, datatype);
        break;
    }

    cudaThreadExit();
    cutilExit(argc, argv);
}
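For reference, the type switch is picked up by cutGetCmdLineArgumentstr, so (assuming the usual CUTIL --name=value syntax, and with "reduction" standing in for whatever the built executable is actually called) the program gets launched like:

reduction --type=int
reduction --type=float
reduction --type=double

With no --type argument it falls through to the "int" default above.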
////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data pointer to input data
//! @param size number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
    T sum = data[0];
    T c = (T) 0.0;
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    return sum;
}
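As a side note on the Kahan summation used above, here is a minimal, self-contained host-only sketch (separate from the project; the 0.1f test value and the loop count are just for illustration) showing why the compensation term c is kept around when summing many floats:

#include <cstdio>

int main()
{
    const int n = 10000000;          // ten million terms
    float naive = 0.0f;
    float sum = 0.0f, c = 0.0f;      // Kahan running sum and compensation
    for (int i = 0; i < n; i++)
    {
        naive += 0.1f;               // plain accumulation: rounding error piles up
        float y = 0.1f - c;          // same steps as reduceCPU above
        float t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    // the naive float sum drifts well away from 1,000,000, while the
    // compensated sum stays close to it
    printf("naive = %f, kahan = %f\n", naive, sum);
    return 0;
}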
////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the given reduction kernel
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel
// 6, we observe the maximum specified number of blocks, because each thread in
// that kernel can process a variable number of elements.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3)
    {
        threads = (n < maxThreads) ? n : maxThreads;
        blocks = n / threads;
        if (blocks < 1)
        {
            blocks = 1;
        }
    }
    else
    {
        if (n == 1)
            threads = 1;
        else
            threads = (n < maxThreads*2) ? n / 2 : maxThreads;
        blocks = n / (threads * 2);
        if (whichKernel == 6)
            blocks = min(maxBlocks, blocks);
    }
}
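To see what this actually returns for the parameters used below (kernel 6, maxThreads = 128, maxBlocks = 64), a throwaway loop like this sketch (just dropped into main temporarily) prints the launch configuration for a few power-of-two sizes:

    // quick sanity check of the launch configuration for a few input sizes
    for (int n = 16; n <= 1024; n *= 2)
    {
        int blocks = 0, threads = 0;
        getNumBlocksAndThreads(6, n, 64, 128, blocks, threads);
        printf("n = %4d -> %d block(s) of %d thread(s)\n", n, blocks, threads);
    }

For these parameters it comes out as 1 block of n/2 threads for n up to 256, and n/256 blocks of 128 threads beyond that.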
template <class T>
void runTest(int argc, char** argv, ReduceType datatype)
{
    int size = 256;          // number of elements to reduce (MIN=16)
    //int size = 1<<4;
    int maxThreads = 128;    // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    int numBlocks = 0;
    int numThreads = 0;
    T gpu_result = 0;
    bool needReadBack = true;
    bool useSM13 = (datatype == REDUCE_DOUBLE);

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    // create pointer and array
    unsigned int bytes = size * sizeof(T);
    T *h_idata = (T *) malloc(bytes);
    T sum = 0;

    // Generate Data
    for (int i = 0; i < size; i++)
    {
        h_idata[i] = (T) i;
        sum = sum + h_idata[i];
    }

    // Calc CPU Result
    T cpu_result = reduceCPU<T>(h_idata, size);

    // Quick summary (note: the %d format specifiers below only print correctly for the int case)
    printf("CPU result = %d\n", cpu_result);
    printf("Size = %10d\n", size);
    printf("SUM = %10d\n", sum);

    // get number of blocks and threads
    getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(numBlocks * sizeof(T));
    printf("%d blocks\n", numBlocks);

    // allocate device memory and data
    T* d_idata = NULL;
    T* d_odata = NULL;
    cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
    cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks * sizeof(T)) );

    // copy data directly to device memory
    cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
    cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), cudaMemcpyHostToDevice) );

    // Choose kernel to run
    if (datatype == REDUCE_DOUBLE)
        reduce_sm13<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
    else
        reduce_sm10<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

    // copy result back from device
    // (h_odata and d_odata only hold numBlocks elements, so copy that many bytes, not `bytes`)
    cutilSafeCallNoSync( cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(T), cudaMemcpyDeviceToHost) );
    //printf("RESULT = %d\n",gpu_result);

    // Add up all instantiated blocks
    // - This needs to be optimized for larger reductions requiring more blocks
    sum = 0;
    printf("Number of Blocks: %d\n", numBlocks);
    if (numBlocks == 1)
    {
        cutilSafeCallNoSync( cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
    }
    else
    {
        for (int i = 0; i < numBlocks; i++)
        {
            sum = sum + h_odata[i];
            printf("Block Results: %d\n", h_odata[i]);
        }
        gpu_result = sum;
    }

    // Print Result
    printf("FINAL RESULT = %d\n", gpu_result);
    printf(" CPU RESULT = %d\n", cpu_result);

    // cleanup
    //free(h_idata);
    //free(h_odata);
    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));
    system("pause");
}
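One thing runTest never does is check whether the reduce kernel actually launched and ran. A minimal sketch of such a check, using only standard CUDA runtime calls and meant to sit right after the reduce_sm13/reduce_sm10 call, would be something like:

    // check for launch errors (e.g. an invalid block/thread configuration)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    // wait for the kernel and surface any execution errors
    err = cudaThreadSynchronize();
    if (err != cudaSuccess)
        printf("kernel execution failed: %s\n", cudaGetErrorString(err));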
This is all in my Reduction project, which includes the SDK’s reduction_kernel.cu, reduction_kernel_sm10.cu and reduction_kernel_sm13.cu.