The following code is described as a simple CUDA matrix multiplication program. The host allocates two matrices, a_h and b_h, in host memory and two matrices, a_d and b_d, in device memory.
All the code is doing is multiplying every element of A by 2 and putting the result in B.
Note that the code which allocates memory on the device is itself running on the host.
Can A be allocated and populated directly on the device? Why allocate matrix A on the host, populate it, move it to the device, multiply each element by 2, and put the result in B on the device? Allocating A on the host as a_h and then copying it to a_d seems like an unneeded step; why not just start with a_d? I can see the point of having both b_h and b_d, but I see no reason for a_h.
Is it possible this was done for demonstration purposes, or was the programmer perhaps unaware of a shorter way? In a lot of example code, people allocate on the host, copy to the device, use the data there, and never use the host copy again. Why not just start out on the device and save a step?
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda.h>
#include "sys/time.h"
// Device kernel: writes B[k] = 2 * A[k] for every k in [0, N).
//
// Launch layout: one-dimensional grid of one-dimensional blocks; each thread
// handles at most one element, so gridDim.x * blockDim.x must be >= N for the
// whole array to be processed. No shared memory is used.
//
//   A  input array, readable from device code, at least N ints
//   B  output array, writable from device code, at least N ints
//   N  number of elements to process
__global__ void vecMult_d(int *A, int *B, int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) {
        return;  // surplus threads past the end of the data do nothing
    }
    B[idx] = 2 * A[idx];
}
// Host (CPU) reference version of the doubling operation: sequentially writes
// B[k] = 2 * A[k] for every k in [0, N). Does nothing when N <= 0.
//
//   A  input array of at least N ints
//   B  output array of at least N ints
//   N  number of elements to process
void vecMult_h(int *A, int *B, int N)
{
    const int *src = A;
    int *dst = B;
    for (int remaining = N; remaining > 0; --remaining) {
        *dst = *src * 2;
        ++src;
        ++dst;
    }
}
// Terminates the program with a diagnostic on stderr if a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Microseconds elapsed between two gettimeofday() samples.
static double elapsedMicros(const struct timeval &start, const struct timeval &end)
{
    return (end.tv_sec - start.tv_sec) * 1000000.0 + (end.tv_usec - start.tv_usec);
}

// Benchmark driver: doubles an n-element integer array once on the GPU
// (vecMult_d) and once on the CPU (vecMult_h), then prints
// "<n> <gpu_microseconds> <cpu_microseconds>" on stdout.
//
// The GPU timing brackets only the kernel launch and the following device
// synchronisation; host<->device copies are outside the timed region.
// Returns 0 on success, 1 if a host allocation fails or the GPU result does
// not match the expected values; CUDA API failures exit via checkCuda().
int main() {
    const int n = 501;
    const int blocksize = 512;
    const size_t bytes = (size_t)n * sizeof(int);

    int *a_h = NULL, *b_h = NULL; // pointers to host memory; a.k.a. CPU
    int *a_d = NULL, *b_d = NULL; // pointers to device memory; a.k.a. GPU
    struct timeval t1_start, t1_end, t2_start, t2_end;

    // allocate arrays on host
    a_h = (int *)malloc(bytes);
    b_h = (int *)malloc(bytes);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a_h);
        free(b_h);
        return 1;
    }

    // allocate arrays on device
    checkCuda(cudaMalloc((void **)&a_d, bytes), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void **)&b_d, bytes), "cudaMalloc(b_d)");

    // Integer ceil-division: enough blocks to give every element a thread.
    const dim3 dimBlock(blocksize);
    const dim3 dimGrid((n + blocksize - 1) / blocksize);

    for (int j = 0; j < n; j++) a_h[j] = j;

    // GPU
    checkCuda(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "copy a_h -> a_d");
    gettimeofday(&t1_start, 0);
    vecMult_d<<<dimGrid, dimBlock>>>(a_d, b_d, n);
    // Capture the statuses inside the timed region but report them afterwards,
    // so error formatting never pollutes the measurement.
    const cudaError_t launchStatus = cudaGetLastError();     // bad launch configuration
    const cudaError_t syncStatus = cudaDeviceSynchronize();  // faults during execution
    gettimeofday(&t1_end, 0);
    checkCuda(launchStatus, "vecMult_d launch");
    checkCuda(syncStatus, "vecMult_d execution");
    checkCuda(cudaMemcpy(b_h, b_d, bytes, cudaMemcpyDeviceToHost), "copy b_d -> b_h");

    // Verify the GPU result now: the CPU pass below overwrites b_h.
    int mismatches = 0;
    for (int j = 0; j < n; j++) {
        if (b_h[j] != a_h[j] * 2) mismatches++;
    }
    if (mismatches != 0) {
        fprintf(stderr, "GPU result mismatch in %d of %d elements\n", mismatches, n);
    }

    // CPU
    gettimeofday(&t2_start, 0);
    vecMult_h(a_h, b_h, n);
    gettimeofday(&t2_end, 0);

    const double time_d = elapsedMicros(t1_start, t1_end);
    const double time_h = elapsedMicros(t2_start, t2_end);
    printf("%d %lf %lf\n", n, time_d, time_h);

    free(a_h);
    free(b_h);
    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    return mismatches == 0 ? 0 : 1;
}