Hello!
I’m trying to write a program to display the cubes of the first 10000 natural numbers and also the execution time using CUDA Events. Also, just to make it clear, I’ve stored all elements in a matrix for convenience. Thus, 10000 elements are stored in a 100x100 matrix. I’ve commented my code where necessary, and i hope they make it clear. Here it is:
[codebox]#include<stdio.h>
#include<cuda.h>
#include<conio.h>
global void cube(int *a)
{
int row=blockIdx.y*blockDim.y + threadIdx.y; //To access elements in a row
int col=blockIdx.x*blockDim.x + threadIdx.x; // To access elements in a column
a[row*100+col]=a[row*100+col]*a[row*100+col]*a[row*100+col]; //Replacing each element by its cube
}
void main()
{
cudaEvent_t start,stop; //Appropriate declarations for time measurement
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int *a_h,*a_d,i,j,n=10000,c=1; //a_h and a_d point to Host and Device arrays respectively.
size_t size=n*sizeof(int); // Size of the array = 100x100 = 10,000 integer elements
a_h=(int*)malloc(size);
cudaMalloc((void**)&a_d,size);
for(i=0;i<100;i++) // Initialize the 100x100 matrix to contain the first 10,000 natural numbers.
{
for(j=0;j<100;j++)
{
a_h[i*100+j]=c;
c++;
}
}
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice); //Copy a_h to a_d
cudaEventRecord(start,0); //Start timer
dim3 dimBlock(10,10); // To access 10,000 elements, i've used blocks of size 10x10. Thus each block accesses 100 elements at a time.
dim3 dimGrid(10,10); // To access all 10,000 elements, i've used 10 such blocks in the x-direction, and 10 blocks in the y-direction. Thus, total no. of blocks=10x10=100.
// And, (100 blocks)x(100 elements per block) = 10,000 elements
cube<<<dimGrid,dimBlock>>>(a_d); //Only a_d to be passed.
cudaEventRecord(stop,0); // Stop timer
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
cudaMemcpy(a_h,a_d,size,cudaMemcpyDeviceToHost); //Retrieve results from device.
printf("Cubes of the first 10000 natural numbers:\n"); //Print results.
for(i=0;i<100;i++)
{
for(j=0;j<100;j++)
{
printf("%d ",a_h[i*100+j]);
}
printf("\n");
}
printf("Execution Time: %f ms\n",time); //Print execution time.
getch();
free(a_h);
cudaFree(a_d);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}[/codebox]
The problem with the output of this code is that most values are garbage values. Just to verify whether the logic was right or not, I’ve simplified the code for 100 elements instead. They’re stored in a 10x10 matrix, and here’s the code:
[codebox]#include<stdio.h>
#include<cuda.h>
#include<conio.h>
global void cube(int *a)
{
int row=blockIdx.y*blockDim.y + threadIdx.y; //To access elements in a row
int col=blockIdx.x*blockDim.x + threadIdx.x; // To access elements in a column
a[row*10+col]=a[row*10+col]*a[row*10+col]*a[row*10+col]; //Replacing each element by its cube
}
void main()
{
cudaEvent_t start,stop; //Appropriate declarations for time measurement
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int *a_h,*a_d,i,j,n=100,c=1; //a_h and a_d point to Host and Device arrays respectively.
size_t size=n*sizeof(int); // Size of the array = 100x100 = 10,000 int elements
a_h=(int*)malloc(size);
cudaMalloc((void**)&a_d,size);
for(i=0;i<10;i++) // Initialize the 10x10 matrix to contain the first 100 natural numbers.
{
for(j=0;j<10;j++)
{
a_h[i*10+j]=c;
c++;
}
}
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice); //Copy a_h to a_d
cudaEventRecord(start,0); //Start timer
dim3 dimBlock(10,10); // To access 100 elements, i've used 1 block of size 10x10 to accesses all 100 elements.
dim3 dimGrid(1,1); // 1 block per grid.
cube<<<dimGrid,dimBlock>>>(a_d); //Only a_d to be passed.
cudaEventRecord(stop,0); // Stop timer
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
cudaMemcpy(a_h,a_d,size,cudaMemcpyDeviceToHost); //Retrieve results from device.
printf("Cubes of the first 100 natural numbers:\n"); //Print results.
for(i=0;i<10;i++)
{
for(j=0;j<10;j++)
{
printf("%d ",a_h[i*10+j]);
}
printf("\n");
}
printf("Execution Time: %f ms\n",time); //Print execution time.
getch();
free(a_h);
cudaFree(a_d);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}[/codebox]
There’s no problem with the output for 100 elements, however when extended to 10,000 elements, I get garbage values. Please let me know where the mistake lies. Thanks in advance!