Sequential execution from within the kernel fails

I am seeing a really strange behavior during execution of a CUDA code. I have given an example code below to illustrate the problem.
The kernel does summation of two arrays by each thread doing the sum of each element of two arrays ‘a’ and ‘b’. If I try to add the elements of the array ‘c’ also within the same kernel by using only one thread, it works fine till about N=2048. When I increase the value to, say N=3200, the cuPrintf is not executed or its value is not displayed.
NOTE: In my actual production code, if I try to do the final computation within the kernel by using only a small subset of the total threads (i.e. let other threads return early) I see this behavior. There the problem is more serious as the final computed result is always zero or ‘nan’ for values exceeding certain threshold number. I will post info about my production code after I receive a reply.

Here is the code in question:
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cutil_inline_runtime.h>
#include <sys/time.h>
#include “”

#define N 3200

global void
add(int* a, int* b, int* c)
int tid = blockIdx.x;
if (tid < N)
c[tid] = a[tid] + b[tid];
/* Alternate way of computing final result */
if (tid !=0)
int sum = 0;
for (int count = 0; count < N; ++count)
sum = sum + c[count];
cuPrintf(“Dev sum: %d\n”, sum);

int main()
int* a;
int* b;
int* c;
int* dev_a;
int* dev_b;
int* dev_c;
cudaEvent_t start, stop;
float time;
struct timeval tv1, tv2;
a = (int*)malloc(Nsizeof(int));
b = (int
c = (int

for (int i = 0; i < N; ++i)
a[i] = i;
b[i] = i * 2;
c[i] = 0;

cutilSafeCall(cudaMalloc( (void**)&dev_a, Nsizeof(int) ));
cutilSafeCall(cudaMalloc( (void**)&dev_b, N
sizeof(int) ));
cutilSafeCall(cudaMalloc( (void**)&dev_c, N*sizeof(int) ));

gettimeofday(&tv1, NULL);
cutilSafeCall(cudaMemcpy(dev_a, a, Nsizeof(int), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dev_b, b, N
sizeof(int), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dev_c, c, N*sizeof(int), cudaMemcpyHostToDevice));

gettimeofday(&tv2, NULL);
printf(“Time taken in copying to DEvice: %f\n”, (float)(tv2.tv_usec) - (float)(tv1.tv_usec));
cudaEventRecord(start, 0);

add<<<N/16,16>>>(dev_a, dev_b, dev_c);
cudaPrintfDisplay(stdout, true);

cudaEventRecord(stop, 0);
cudaEventElapsedTime(&time, start, stop);

printf(“Time taken by kernel: %f\n”, time);

cutilSafeCall(cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost));
unsigned int sum = 0;
for(int i = 0; i < N; ++i)
sum = sum + c[i];
printf(“Sum is: %d\n”, sum);

return 0;

I am using:
CentOS 5.6
Tesla M2070
CUDA release 4.0