This is my first CUDA program. I used VS2010 to create it. I have NVIDIA GeForce GT 430 (96 cores, 1GB ram). When I run this program, everything executes and I get back expected results, but it takes a long time – about 5 seconds. I would expect 500000 additions to take no time at all.
Obviously, I am doing something wrong – why is it running slow?
Thank you for your help.
PS. The multi-second delay is on the line: cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
#define N 500000
#include <stdlib.h>
#include <cuda_runtime.h>
global void add(int *a, int *b, int *c) {
int tid = blockIdx.x;
while(tid < N) {
c[tid] = a[tid] + b[tid];
tid += blockDim.x;
}
}
int main(void) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
a = (int*)malloc(N * sizeof(int));
b = (int*)malloc(N * sizeof(int));
c = (int*)malloc(N * sizeof(int));
for(int i =0; i < N; i++){
a[i] = i+1;
b[i] = (i+1) * 2;
}
cudaMalloc((void**)&dev_a, N * sizeof(int));
cudaMalloc((void**)&dev_b, N * sizeof(int));
cudaMalloc((void**)&dev_c, N * sizeof(int));
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
add<<<1024, 1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
free(a);
free(b);
free(c);
}