Device Spitting out garbage

Hi All,

I need some help here. I am brand new to CUDA and still trying to figure it out. I wrote this very basic program to help me understant how CUDA works. I am using visual Studio 2005 and Cuda version 2.0. My graphics card is a geForce 9600GT

My program has three parts, each part calulating the same thing. The first part increments the values of an array by 1 within the host. The second part does the same thing but using CUDA but with multiple blocks and threads. The third part also uses CUDA but I use only 1 block.<<<1,1>>>. My problem is that when the number of elements in my array go past 1023 floats the second cuda call seem to spit out just grabage. Below 1024 it seems to work fine. The other two function (host and the first cuda call) work perfectly fine no matter how big the array size.

Any help will be appreciated.


#include <stdio.h>

#include <assert.h>

#include <cuda.h>

#include <stdlib.h>

#include <math.h>

#include <cuda_runtime.h>

#include <cutil.h>

void incrementArrayOnHost(float *a, int N)


int i;

for (i=0; i < N; i++) a[i] = a[i]+1.f;


global void incrementArrayOnDevice(float *a, int N)


int idx = blockIdx.x*blockDim.x + threadIdx.x;

if (idx<N) a[idx] = a[idx]+1.f;


global void incrementArrayOnDevice2(float *a, int N)


int i;

for (i=0; i < (sizeof(float))*N; i++)

  a[i] = a[i]+1.f;


int main(void)


float *a_h, *b_h, *c_h;           // pointers to host memory

float *a_out, *b_out;

float *a_d, *b_d;                 // pointer to device memory

int i, N =1024;

char ans;

unsigned int timer = 0;

unsigned int timer2 = 0;

unsigned int timer3 = 0;

size_t size = N*sizeof(float);

// allocate arrays on host

a_h = (float *)malloc(size);

b_h = (float *)malloc(size);

c_h = (float *)malloc(size);

a_out = (float *)malloc(size);

b_out = (float *)malloc(size);

// initialization of host data

for (i=0; i<N; i++) 


	a_h[i] = (float) i;

	b_h[i] = (float) i;

	c_h[i] = (float) i;


// do calculation on host


cutStartTimer( timer);

incrementArrayOnHost(c_h, N);

cutStopTimer( timer);

//printf("\nCPU TIME: %f\n", cutGetTimerValue( timer));

//Threaded Device Calculation

// allocate array on device 

cudaMalloc((void **) &a_d, size);

// copy data from host to device

cudaMemcpy(a_d, a_h, sizeof(float)*N, cudaMemcpyHostToDevice);

// do calculation on device:

int blockSize = 4;

int nBlocks = N/blockSize + (N%blockSize == 0?0:1);


cutStartTimer( timer2);

incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);

cutStopTimer( timer2);

// Retrieve result from device and store in a_out

cudaMemcpy(a_out, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);


//Non Threaded Calculation

cudaMalloc((void **) &b_d, size);

cudaMemcpy(b_d, b_h, sizeof(float)*N, cudaMemcpyHostToDevice);


cutStartTimer( timer3);

incrementArrayOnDevice2 <<< 1,1 >>> (b_d, N);

cutStopTimer( timer3);

// Retrieve result from device and store in b_out

cudaMemcpy(b_out, b_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

// check results

for (i=0; i<N; i++) {

	//assert(a_h[i] == b_h[i]);

	if(i%100 == 0)


	printf(" compare results  %f   %f	%f\n", c_h[i], a_out[i], b_out[i]);



printf("\nCPU TIME: %f\n", cutGetTimerValue( timer));

printf("\nGPU TIME Multi_thread: %f\n", cutGetTimerValue( timer2));

printf("\nGPU TIME: %f\n", cutGetTimerValue( timer3));

// cleanup








printf("\nHit enter to quit\n");

scanf("%c", &ans);

//return 0;