Maximum size for a vector addition program using structs

Hello,

I am using a GeForce GT 220 card with CUDA 3.0 under Ubuntu 9.10. One of the things I wanted to settle before starting serious CUDA programming is a simple question: when adding two vectors whose elements are not of a basic type, what is the maximum vector size such a program can handle? With the 8-byte struct below, the three arrays together take about 3*N*8 bytes, i.e. roughly 670 MB for the current N = 27903512 and roughly 442 MB for the commented-out 18435512.

I presume that Linux is also using some of the card's memory for rendering, so the entire 512 MB will not be available for transferring the vectors from the host to the device. The maximum size also oscillates in a weird way (e.g. if I open a 3D app - Nexuiz or Stellarium - the maximum size might increase or decrease after I close that app).
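
As a rough check, I can query how much device memory the runtime actually sees as free before allocating anything. Here is a minimal sketch of that (assuming cudaMemGetInfo and cudaGetErrorString from the runtime API, which I believe are available in CUDA 3.0; not part of my program below):

[codebox]
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
	size_t free_bytes = 0, total_bytes = 0;

	// ask the runtime how much device memory is currently free vs. total
	cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
	if (err != cudaSuccess)
	{
		printf("cudaMemGetInfo failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	printf("free: %lu bytes, total: %lu bytes\n",
	       (unsigned long)free_bytes, (unsigned long)total_bytes);
	return 0;
}
[/codebox]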

Here is the code (very very basic):

[codebox]

#include <stdio.h>
#include <stdlib.h>

#define block_size 512

struct complex
{
	float x, y;
};

unsigned long long int N = 27903512; //18435512;

// element-wise addition of two vectors of complex numbers
__global__ void VecAdd(complex *a, complex *b, complex *c, unsigned long long int N)
{
	unsigned long long int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N)
	{
		c[i].x = a[i].x + b[i].x;
		c[i].y = a[i].y + b[i].y;
	}
}

// add 2 vectors wrapper
void vec_add(complex *h_a, complex *h_b, complex *&h_c, unsigned long long int N)
{
	// allocate memory on the device
	size_t size = N * sizeof(complex);
	complex *d_a, *d_b, *d_c;
	cudaMalloc((void**)&d_a, size);
	cudaMalloc((void**)&d_b, size);
	cudaMalloc((void**)&d_c, size);

	// copy from host to device
	cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

	// invoke kernel: one thread per element, rounding the grid size up
	dim3 dimBlock(block_size);
	dim3 dimGrid((N / dimBlock.x) + (!(N % dimBlock.x) ? 0 : 1));
	VecAdd<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);

	// copy results from device memory to host memory
	cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

	// free device memory
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
}

int main(void)
{
	// host variables
	complex *h_a, *h_b, *h_c;
	h_a = (complex *) malloc(N * sizeof(complex));
	h_b = (complex *) malloc(N * sizeof(complex));
	h_c = (complex *) malloc(N * sizeof(complex));

	printf("%zu %f %f ", sizeof(complex), 1.0 * N, 3.0 * N * sizeof(complex));

	for (unsigned long long int i = 0; i < N; i++)
	{
		h_a[i].x = 1.0f;
		h_a[i].y = 2.0f;
		h_b[i].x = 1.0f;
		h_b[i].y = 2.0f;
		//h_a[i].z = h_b[i].z = 1.0;
	}

	// add the vectors
	vec_add(h_a, h_b, h_c, N);

	// find the last index whose imaginary part is non-zero
	unsigned long long int k = 0;
	for (unsigned long long int i = 0; i < N && h_c[i].y != 0; i++)
	{
		k = i;
	}
	printf("\n Last i= %f \n", 1.0 * k);
	printf("%f+i%f", h_c[N - 1].x, h_c[N - 1].y);

	free(h_a);
	free(h_b);
	free(h_c);

	printf("Don't know!");

	return 0;
}[/codebox]
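
One thing I notice is that I never check any CUDA return codes, so a failed cudaMalloc would go unnoticed and the kernel would simply work on bad pointers. Below is a minimal sketch of the kind of check I could add (the CHECK macro name and the deliberately oversized 1.5 GB request are just my own illustration; cudaMalloc, cudaGetErrorString and cudaError_t are the standard runtime API):

[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// hypothetical helper: abort with a message if a CUDA call fails
#define CHECK(call)                                                          \
	do {                                                                 \
		cudaError_t err = (call);                                    \
		if (err != cudaSuccess) {                                    \
			printf("CUDA error at %s:%d: %s\n",                  \
			       __FILE__, __LINE__, cudaGetErrorString(err)); \
			exit(1);                                             \
		}                                                            \
	} while (0)

int main(void)
{
	// deliberately request more memory than the card has,
	// so the failure is reported instead of silently ignored
	void *p = NULL;
	CHECK(cudaMalloc(&p, (size_t)1536 * 1024 * 1024)); // 1.5 GB
	cudaFree(p);
	return 0;
}
[/codebox]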

What is happening? Is it that the memory on the graphics card is not being properly managed, or am I missing some other detail?

Thank you and regards

Teodor