I’ve been playing with CUDA running on a RHEL box for the past day or so. After getting 2.2 installed, I tried to implement the first snippet described by the NVIDIA programming manual, a simple 1-d vector addition. To make sure the answer makes sense, I wrote the routine the “old way” and then tried the implementation below. As the output shows, the CUDA call doesn’t work (and I think I lifted it directly from the user manual.
Any suggestions on what I screwed up?
Code:
[codebox]
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
// Kernel definition
global void
VecAdd (float *A, float *B, float *C)
{
int i = threadIdx.x;
C[i] = A[i] + B[i];
}
void add_vectors (int N, float A, float B, float C);
void
add_vectors (int N, float A, float B, float C)
{
int j;
for (j = 0; j < N; j++)
{
C[j] = A[j] + B[j];
}
return;
}
int
main (void)
{
int i, N;
N = 10;
float A[N], B[N], C[N];
for (i = 0; i < N; i++)
{
A[i] = 1.0 * i;
B[i] = 0.01 * i;
}
add_vectors (N, A, B, C);
printf (“adding the old-fashioned way:\n”);
for (i = 0; i < N; i++)
{
printf ("%7.2f +%7.2f =%7.2f\n", A[i], B[i], C[i]);
}
for (i = 0; i < N; i++)
{
C[i] = acos (-1.0);
}
// Kernel invocation
VecAdd <<< 1, N >>> (A, B, C);
printf (“\nAdding with CUDA\n”);
for (i = 0; i < N; i++)
{
printf ("%7.2f +%7.2f =%7.2f\n", A[i], B[i], C[i]);
}
return 1;
}
[/codebox]
Output:
[codebox]
[nmoore@buff cuda_play]$ nvcc dot_prod.cu
[nmoore@buff cuda_play]$ ./a.out
adding the old-fashioned way:
0.00 + 0.00 = 0.00
1.00 + 0.01 = 1.01
2.00 + 0.02 = 2.02
3.00 + 0.03 = 3.03
4.00 + 0.04 = 4.04
5.00 + 0.05 = 5.05
6.00 + 0.06 = 6.06
7.00 + 0.07 = 7.07
8.00 + 0.08 = 8.08
9.00 + 0.09 = 9.09
Adding with CUDA
0.00 + 0.00 = 3.14
1.00 + 0.01 = 3.14
2.00 + 0.02 = 3.14
3.00 + 0.03 = 3.14
4.00 + 0.04 = 3.14
5.00 + 0.05 = 3.14
6.00 + 0.06 = 3.14
7.00 + 0.07 = 3.14
8.00 + 0.08 = 3.14
9.00 + 0.09 = 3.14
[/codebox]