Hello,
I am using a GeForce GT 220 card with CUDA 3.0 under Ubuntu 9.10. One of the things I wanted to answer before beginning serious CUDA programming is a simple question: when adding two vectors whose elements are not of a basic type (here a struct of two floats), what is the maximum vector size such a program can handle?
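For concreteness, sizeof(complex) is 8 bytes here (two floats), so with N = 27903512 elements the three vectors together need roughly 3 × 27903512 × 8 ≈ 670 MB of device memory.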
I presume that Linux also uses some of the card's memory for rendering, so the full 512 MB will not be available for transferring the vectors from the host to the device. The maximum workable size also fluctuates in a weird way (e.g. if I open a 3D application such as Nexuiz or Stellarium, the maximum size might increase or decrease after I close it).
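To see how much device memory is actually free at a given moment, I could query it before allocating. This is only a minimal sketch, assuming cudaMemGetInfo is available in my runtime version:
[codebox]
#include <stdio.h>

int main(void)
{
    size_t free_bytes = 0, total_bytes = 0;
    // cudaMemGetInfo reports the free and total device memory in bytes
    cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMemGetInfo failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("free: %zu MB, total: %zu MB\n",
           free_bytes / (1024 * 1024), total_bytes / (1024 * 1024));
    return 0;
}
[/codebox]
Running this before and after opening one of the 3D applications should show how much memory the desktop and those programs are holding.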
Here is the code (very very basic):
[codebox]
#include <stdio.h>
#include <stdlib.h>

#define block_size 512

struct complex
{
    float x, y;
};

unsigned long long int N = 27903512; //18435512;

// element-wise addition of two vectors of complex numbers
__global__ void VecAdd(complex *a, complex *b, complex *c, unsigned long long int N)
{
    unsigned long long int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
    {
        c[i].x = a[i].x + b[i].x;
        c[i].y = a[i].y + b[i].y;
    }
}

//add 2 vectors wrapper
void vec_add(complex *h_a, complex *h_b, complex *&h_c, unsigned long long int N)
{
    //allocate memory on the device
    size_t size = N * sizeof(complex);
    complex *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    //copy from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    //invoke kernel: one thread per element, rounding the block count up
    dim3 dimBlock(block_size);
    dim3 dimGrid((N / dimBlock.x) + (!(N % dimBlock.x) ? 0 : 1));
    VecAdd<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    //copy results from device memory to host memory
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    //free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

int main(void)
{
    //host variables (h_c is zero-initialised so the check below is meaningful)
    complex *h_a, *h_b, *h_c;
    h_a = (complex *) malloc(N * sizeof(complex));
    h_b = (complex *) malloc(N * sizeof(complex));
    h_c = (complex *) calloc(N, sizeof(complex));
    printf("%zu %f %f ", sizeof(complex), 1.0 * N, 3.0 * N * sizeof(complex));
    for (unsigned long long int i = 0; i < N; i++)
    {
        h_a[i].x = (float)1;
        h_a[i].y = (float)2;
        h_b[i].x = (float)1;
        h_b[i].y = (float)2;
        //h_a[i].z=h_b[i].z=1.0;
    }
    //add the vectors
    vec_add(h_a, h_b, h_c, N);
    //find the last index that actually received a non-zero result
    unsigned long long int k = 0;
    for (unsigned long long int i = 0; i < N && h_c[i].y != 0; i++)
    {
        k = i;
    }
    printf("\n Last i= %f \n", 1.0 * k);
    printf("%f+i%f", h_c[N - 1].x, h_c[N - 1].y);
    free(h_a);
    free(h_b);
    free(h_c);
    printf("Don't know!");
    return 0;
}
}[/codebox]
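One thing the code does not do is check any return codes, so a failed cudaMalloc or cudaMemcpy would go unnoticed. Here is a minimal sketch of the kind of check I could add around each call (the CHECK macro name is just my own invention; the runtime calls are standard):
[codebox]
#include <stdio.h>
#include <stdlib.h>

// hypothetical helper macro: abort with a message if a CUDA runtime call fails
#define CHECK(call)                                                      \
    do {                                                                 \
        cudaError_t err = (call);                                        \
        if (err != cudaSuccess)                                          \
        {                                                                \
            printf("CUDA error at %s:%d: %s\n",                          \
                   __FILE__, __LINE__, cudaGetErrorString(err));         \
            exit(1);                                                     \
        }                                                                \
    } while (0)

int main(void)
{
    // try one allocation of the same size as in the program above
    size_t size = 27903512ULL * 8ULL;
    void *d_a = NULL;
    CHECK(cudaMalloc(&d_a, size)); // would report an out-of-memory error here
    printf("allocation of %zu bytes succeeded\n", size);
    CHECK(cudaFree(d_a));
    return 0;
}
[/codebox]
With checks like this inside vec_add it would at least be clear which call fails first as N grows.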
What is happening? Is it that the memory on the GFX card is not properly managed, or am I missing some other detail?
Thank you and regards
Teodor