Sorry if this comes off as a total n00b question but I am new at CUDA and I have spent the past day at work googling around for this to no avail. i’m trying to write my own reduction algorithm based on the one in the SDK. Everything works fine for matrices smaller than 4096 but anything bigger gives me a “invalid configuration argument” error. From going through these forums I believe the problem is that I’m putting too much data in the shared memory; yet i’ve used the “copying chunks of data” from global into shared described in other threads but still have the same error. Does anyone have any ideas? Thanks in advance
[codebox]#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
// includes, kernels
#include <VectorSum_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void normalize(float *weights, unsigned int size);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
unsigned int array_size = 4096;
unsigned int mem_size = sizeof( float) * array_size;
float* data = (float*) malloc( mem_size);
for( unsigned int i = 0; i < array_size; ++i)
{
data[i] = float(i);
}
normalize(data, array_size);
cutilExit(argc, argv);
}
void normalize(float *weights, unsigned int size)
{
printf("Adding Sums \n");
cudaSetDevice(cutGetMaxGflopsDeviceId());
unsigned int timer = 0;
cutilCheckError( cutCreateTimer( &timer));
cutilCheckError( cutStartTimer( timer));
unsigned int num_threads = 256;
unsigned int full_blocks = size/num_threads;
unsigned int mem_size = sizeof(float)*size;
// Declare Device Global memory & Copy Host Memory To It
float* d_idata;
cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size));
cutilSafeCall( cudaMemcpy( d_idata, weights, mem_size, cudaMemcpyHostToDevice));
// Declare and Allocate Device Memory for Result
float* d_odata;
cutilSafeCall( cudaMalloc( (void**) &d_odata, sizeof(float)*full_blocks));
// setup execution parameters
dim3 grid( full_blocks, 1, 1);
dim3 threads( num_threads, 1, 1);
// execute the kernel
VectorSumKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// allocate mem for the result on host side
float* h_odata = (float*) malloc( sizeof(float)*full_blocks);
// copy result from device to host
cutilSafeCall( cudaMemcpy( h_odata, d_odata, sizeof( float) * full_blocks, cudaMemcpyDeviceToHost));
float* d_odata2;
cutilSafeCall( cudaMalloc( (void**) &d_odata2, sizeof(float)*mem_size));
normalizeSumKernel <<< grid, threads, mem_size >>> (d_idata, d_odata2, total);
float* h_odata2 = (float*) malloc( sizeof(float)*mem_size);
cutilSafeCall( cudaMemcpy( h_odata2, d_odata2, sizeof( float) * mem_size, cudaMemcpyDeviceToHost));
printf (“Total = %f”, exp(total));
// cleanup memory
free( weights);
free( h_odata);
free( h_odata2);
cutilSafeCall(cudaFree(d_idata));
cutilSafeCall(cudaFree(d_odata));
cutilSafeCall(cudaFree(d_odata2));
cudaThreadExit();
}[/codebox]
This is the kernel file[codebox]#ifndef TEMPLATE_KERNEL_H
#define TEMPLATE_KERNEL_H
#include <stdio.h>
#define SDATA( index) cutilBankChecker(sdata, index)
global void
VectorSumKernel( float* g_idata, float* g_odata)
{
// shared memory
// the size is determined by the host application
extern shared float sdata;
// access thread id
const unsigned int tid = threadIdx.x;
const unsigned int bid = 256*blockIdx.x;
// read in input data from global memory
sdata[tid] = g_idata[bid+tid];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid==0) g_odata[blockIdx.x] = sdata[0];
}
global void
normalizeSumKernel (float* g_idata, float* g_odata, float total)
{
extern __shared__ float sdata[];
//access thread id
const unsigned int tid = threadIdx.x;
const unsigned int bid = 256*blockIdx.x;
sdata[tid] = __logf(g_idata[bid + tid]) - total;
g_odata[bid+tid] = sdata[tid];
__syncthreads();
}
#endif // #ifndef TEMPLATE_KERNEL_H[/codebox]