Hi all,
This is my first hello World program! Its running greatly on Tesla GPU platform. I found some problem with the N=1000. Can any one clarify whats wrong with the code.
[codebox]/*
Program to compute the sum of two arrays of size N using GPU
@author Sajan Kumar.S
@email: nospam+ammasajan[A.T]gmail
*/
#include <stdio.h>
#include <stdlib.h>
#define N 20 // 20 elements
global void vecAdd(int *A, int *B, int *C){
int i=threadIdx.x;
shared int s_A[N],s_B[N],s_C[N];
// copy the values to shared mem and attack! :D
s_A[i]=A[i];
s_B[i]=B[i];
__syncthreads();
// C[i]=A[i]+B[i];
// s_C[i]=s_A[i]+s_B[i]; // to calucate the sume of elements
s_C[i]=s_A[i]*s_B[i]; // to caluclate the sume of elements
__syncthreads();
C[i]=s_C[i];
}
int main(){
int *h_a=0,*h_b=0,*h_c=0;
int *d_a=0,*d_b=0,*d_c=0;
int memSize=N*sizeof(int);
// allocate host memory size of N
h_a=(int *)malloc(memSize);
h_b=(int *)malloc(memSize);
h_c=(int *)malloc(memSize);
// allocate GPU memory size of N
cudaMalloc((void **)&d_a,memSize);
cudaMalloc((void **)&d_b,memSize);
cudaMalloc((void **)&d_c,memSize);
// Init values to A and B arrays(clearing C array)
for(int i=0;i<N;i++){
h_a[i]=i+2;
h_b[i]=i+3;
h_c[i]=0;
}
// Copied the values to GPU arrays A and B
cudaMemcpy(d_a,h_a,memSize,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b,memSize,cudaMemcpyHostToDevice);
// printing the A array and B array on CPU
printf("\n Array A : \n");
for(int i=0;i<N;i++)
printf("%d\t",h_a[i]);
printf("\n Array B : \n");
for(int i=0;i<N;i++)
printf("%d\t",h_b[i]);
printf("\ncalucalting Sum : ");
vecAdd<<<1, N>>>(d_a,d_b,d_c);
// copying the output C from GPU to mem
cudaMemcpy(h_c,d_c,memSize,cudaMemcpyDeviceToHost);
printf(“\nSum of Arrays: \n”);
for(int i=0;i<N;i++)
printf("%d\t",h_c[i]);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
return 1;
}
[/codebox]