Problems with larger arrays

Sorry if this comes off as a total n00b question but I am new at CUDA and I have spent the past day at work googling around for this to no avail. i’m trying to write my own reduction algorithm based on the one in the SDK. Everything works fine for matrices smaller than 4096 but anything bigger gives me a “invalid configuration argument” error. From going through these forums I believe the problem is that I’m putting too much data in the shared memory; yet i’ve used the “copying chunks of data” from global into shared described in other threads but still have the same error. Does anyone have any ideas? Thanks in advance

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <VectorSum_kernel.cu>

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void normalize(float *weights, unsigned int size);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main( int argc, char** argv)

{

unsigned int array_size = 4096;

unsigned int mem_size = sizeof( float) * array_size;

float* data = (float*) malloc( mem_size);

for( unsigned int i = 0; i < array_size; ++i)

{

    data[i] = float(i);

}

normalize(data, array_size);

cutilExit(argc, argv);

}

void normalize(float *weights, unsigned int size)

{

printf("Adding Sums \n");

cudaSetDevice(cutGetMaxGflopsDeviceId());

unsigned int timer = 0;

cutilCheckError( cutCreateTimer( &timer));

cutilCheckError( cutStartTimer( timer));

unsigned int num_threads = 256;

unsigned int full_blocks = size/num_threads;

unsigned int mem_size = sizeof(float)*size;

// Declare Device Global memory & Copy Host Memory To It

float* d_idata;

cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size));

cutilSafeCall( cudaMemcpy( d_idata, weights, mem_size, cudaMemcpyHostToDevice));

// Declare and Allocate Device Memory for Result

float* d_odata;

cutilSafeCall( cudaMalloc( (void**) &d_odata, sizeof(float)*full_blocks));

// setup execution parameters

dim3  grid( full_blocks, 1, 1);

dim3  threads( num_threads, 1, 1);

// execute the kernel

VectorSumKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);

// check if kernel execution generated and error

cutilCheckMsg("Kernel execution failed");

// allocate mem for the result on host side

float* h_odata = (float*) malloc( sizeof(float)*full_blocks);

// copy result from device to host

cutilSafeCall( cudaMemcpy( h_odata, d_odata, sizeof( float) * full_blocks, cudaMemcpyDeviceToHost));

float* d_odata2;

cutilSafeCall( cudaMalloc( (void**) &d_odata2, sizeof(float)*mem_size));

normalizeSumKernel <<< grid, threads, mem_size >>> (d_idata, d_odata2, total);

float* h_odata2 = (float*) malloc( sizeof(float)*mem_size);

cutilSafeCall( cudaMemcpy( h_odata2, d_odata2, sizeof( float) * mem_size, cudaMemcpyDeviceToHost));

printf (“Total = %f”, exp(total));

// cleanup memory

free( weights);

free( h_odata);

free( h_odata2);

cutilSafeCall(cudaFree(d_idata));

cutilSafeCall(cudaFree(d_odata));

cutilSafeCall(cudaFree(d_odata2));

cudaThreadExit();

}[/codebox]

This is the kernel file[codebox]#ifndef TEMPLATE_KERNEL_H

#define TEMPLATE_KERNEL_H

#include <stdio.h>

#define SDATA( index) cutilBankChecker(sdata, index)

global void

VectorSumKernel( float* g_idata, float* g_odata)

{

// shared memory

// the size is determined by the host application

extern shared float sdata;

// access thread id

const unsigned int tid = threadIdx.x;

const unsigned int bid = 256*blockIdx.x;

// read in input data from global memory

sdata[tid] = g_idata[bid+tid];

__syncthreads();

// do reduction in shared mem

for(unsigned int s=blockDim.x/2; s>0; s>>=1)

{

  if (tid < s)

  {

      sdata[tid] += sdata[tid + s];

  }

  __syncthreads();

}

if (tid==0) g_odata[blockIdx.x] = sdata[0];

}

global void

normalizeSumKernel (float* g_idata, float* g_odata, float total)

{

extern __shared__ float sdata[];

//access thread id

const unsigned int tid = threadIdx.x;

const unsigned int bid = 256*blockIdx.x;

sdata[tid] = __logf(g_idata[bid + tid]) - total;

g_odata[bid+tid] = sdata[tid];

__syncthreads();

}

#endif // #ifndef TEMPLATE_KERNEL_H[/codebox]

The problem is your mem_size definition:

unsigned int mem_size = sizeof(float)*size;

that will result in an illegal configuration:

VectorSumKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);

Shared memory is limited to 16K.

Since you are using num_threads threads, mem_size should be:
unsigned int mem_size = sizeof(float)* num_threads;