Need help with a very simple program

Hello!

I’m trying to write a program to display the cubes of the first 10000 natural numbers and also the execution time using CUDA Events. Also, just to make it clear, I’ve stored all elements in a matrix for convenience. Thus, 10000 elements are stored in a 100x100 matrix. I’ve commented my code where necessary, and i hope they make it clear. Here it is:

[codebox]#include<stdio.h>

#include<cuda.h>

#include<conio.h>

global void cube(int *a)

{

int row=blockIdx.y*blockDim.y + threadIdx.y;	         //To access elements in a row

int col=blockIdx.x*blockDim.x + threadIdx.x;	            // To access elements in a column



a[row*100+col]=a[row*100+col]*a[row*100+col]*a[row*100+col];              //Replacing each element by its cube

}

void main()

{

cudaEvent_t start,stop;                   //Appropriate declarations for time measurement

float time;

cudaEventCreate(&start);

cudaEventCreate(&stop);



int *a_h,*a_d,i,j,n=10000,c=1;             //a_h and a_d point to Host and Device arrays respectively. 

size_t size=n*sizeof(int);               // Size of the array = 100x100 = 10,000 integer elements

a_h=(int*)malloc(size);

cudaMalloc((void**)&a_d,size);



for(i=0;i<100;i++)					// Initialize the 100x100 matrix to contain the first 10,000 natural numbers.

{

	for(j=0;j<100;j++)

	{

	a_h[i*100+j]=c;

	c++;

	}

}

cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);         //Copy a_h to a_d

cudaEventRecord(start,0);                             //Start timer

dim3 dimBlock(10,10);                                  // To access 10,000 elements, i've used blocks of size 10x10. Thus each block accesses 100 elements at a time. 

dim3 dimGrid(10,10);					  // To access all 10,000 elements, i've used 10 such blocks in the x-direction, and 10 blocks in the y-direction. Thus, total no. of blocks=10x10=100. 

								  // And, (100 blocks)x(100 elements per block) = 10,000 elements

cube<<<dimGrid,dimBlock>>>(a_d);            //Only a_d to be passed.

cudaEventRecord(stop,0);                           // Stop timer

cudaEventSynchronize(stop);

cudaEventElapsedTime(&time,start,stop);

cudaMemcpy(a_h,a_d,size,cudaMemcpyDeviceToHost);           //Retrieve results from device.

printf("Cubes of the first 10000 natural numbers:\n");                //Print results.

for(i=0;i<100;i++)

{

	for(j=0;j<100;j++)

	{

	printf("%d ",a_h[i*100+j]);

	}

printf("\n");

}

printf("Execution Time: %f ms\n",time);             //Print execution time.

getch(); 

free(a_h);

cudaFree(a_d);

cudaEventDestroy(start);

cudaEventDestroy(stop);

}[/codebox]

The problem with the output of this code is that most values are garbage values. Just to verify whether the logic was right or not, I’ve simplified the code for 100 elements instead. They’re stored in a 10x10 matrix, and here’s the code:

[codebox]#include<stdio.h>

#include<cuda.h>

#include<conio.h>

global void cube(int *a)

{

int row=blockIdx.y*blockDim.y + threadIdx.y;	             //To access elements in a row

int col=blockIdx.x*blockDim.x + threadIdx.x;	            // To access elements in a column



a[row*10+col]=a[row*10+col]*a[row*10+col]*a[row*10+col];              //Replacing each element by its cube

}

void main()

{

cudaEvent_t start,stop;                   //Appropriate declarations for time measurement

float time;

cudaEventCreate(&start);

cudaEventCreate(&stop);



int *a_h,*a_d,i,j,n=100,c=1;             //a_h and a_d point to Host and Device arrays respectively. 

size_t size=n*sizeof(int);                  // Size of the array = 100x100 = 10,000 int elements

a_h=(int*)malloc(size);

cudaMalloc((void**)&a_d,size);



for(i=0;i<10;i++)					// Initialize the 10x10 matrix to contain the first 100 natural numbers.

{

	for(j=0;j<10;j++)

	{

	a_h[i*10+j]=c;

	c++;

	}

}

cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);         //Copy a_h to a_d

cudaEventRecord(start,0);                             //Start timer

dim3 dimBlock(10,10);                                  // To access 100 elements, i've used 1 block of size 10x10 to accesses all 100 elements.

dim3 dimGrid(1,1);					  // 1 block per grid.

cube<<<dimGrid,dimBlock>>>(a_d);            //Only a_d to be passed.

cudaEventRecord(stop,0);                           // Stop timer

cudaEventSynchronize(stop);

cudaEventElapsedTime(&time,start,stop);

cudaMemcpy(a_h,a_d,size,cudaMemcpyDeviceToHost);           //Retrieve results from device.

printf("Cubes of the first 100 natural numbers:\n");                //Print results.

for(i=0;i<10;i++)                                             

{

	for(j=0;j<10;j++)

	{

	printf("%d ",a_h[i*10+j]);

	}

printf("\n");

}

printf("Execution Time: %f ms\n",time);             //Print execution time.

getch(); 

free(a_h);

cudaFree(a_d);

cudaEventDestroy(start);

cudaEventDestroy(stop);

}[/codebox]

There’s no problem with the output for 100 elements, however when extended to 10,000 elements, I get garbage values. Please let me know where the mistake lies. Thanks in advance!

:wave:

Theres nothing wrong with your program. You are printing the values which is beyond the capacity of int datatype. try using long int. If u replace the printing statements by this
for(i=0;i<10;i++)
{
for(j=0;j<100;j++)
{
printf("%ld “,a_h[i*10+j]);
}
printf(”\n");
}

You will notice that your output is actually correct but you cant see all 10000 numbers