I wrote a simple program that should add two vectors, 1 character per 1 thread. But the value returned from kernel differs from i expected ((. Could someone explain why?
[codebox]#include <stdio.h>
#include <stdlib.h>
#include “cuda.h”
#define MAX 16 //size of arrays and number of threads
global void addVec(int *a, int *b, int *c);
int main()
{
int i;
int a_h[MAX];
int b_h[MAX];
int c_h[MAX];
int *a_d;
int *b_d;
int *c_d;
int size = MAX * sizeof(char);
//initialisation of an array
for(i = 0; i < MAX; i++){
a_h[i] = i;
b_h[i] = i;
}
cudaMalloc((void**)&a_d, size);
cudaMalloc((void**)&b_d, size);
cudaMalloc((void**)&c_d, size);
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice);
addVec<<<1, MAX>>>(a_d, b_d, c_d);
cudaMemcpy(c_h, c_d, size, cudaMemcpyDeviceToHost);
for(i = 0; i < MAX; i++)
printf("%d \n",*(c_h + i));
return 0;
}
global void addVec(int *a,int *b,int *c)
{
int i = threadIdx.x;
*(c + i) = *(a + i) + *(b + i);
}[/codebox]
right values are just 1-4 of resulted array. If i increase array size, number of right values increases too.
It depends from array size so that – right values = array size / 4.
This code is like example in CUDA Programming Guide.