I’m following a course for CUDA Programming and I wanted to try the square program myself. I re-wrote the code from the course but I’m not getting the same output. Here is my code
#include <stdio.h>
__global__ void square (int *d_out, int *d_in){
int idx = threadIdx.x;
int f = d_in[idx];
d_out[idx] = f * f;
}
int main(int argc, char ** argv){
const int ARRAY_SIZE = 64;
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
int h_in[ARRAY_SIZE];
for(int i = 0; i < ARRAY_SIZE; i ++){
h_in[i] = i;
// printf("%d ", h_in[i]);
}
int h_out[ARRAY_SIZE];
int *d_in;
int *d_out;
cudaMalloc(&d_in, ARRAY_BYTES);
cudaMalloc(&d_out, ARRAY_BYTES);
cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
square<<<1, ARRAY_SIZE>>>(d_out, d_in);
cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
printf("\n\n");
for(int i = 0; i < ARRAY_SIZE; i ++){
printf("%d", h_out[i]);
printf(((i % 4) != 3) ? "\t" : "\n");
}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
The course shows a square of floats but I changed it and do it with integers. Here’s my output and every time I compile it it’s difference. Considering the fact of different outputs, I think that the mistake is in the h_out array. Here’s what I got as an output my last time:
-1042321192 22092 627381103 32615
0 0 0 0
0 0 -808220160 -700231102
-1042321200 22092 635949264 32615
-339029120 32764 635950901 32615
644673632 32615 0 0
-1064290840 22092 635949431 32615
-1064280056 22092 -1066442772 22092
644678000 32615 -1066442772 22092
-1066488425 22092 630040104 32615
-1066489136 22092 626185353 32615
21 0 4 0
-1066743629 22092 -1064290840 22092
1 0 -1066464195 22092
642480544 32615 0 0
-1066464272 22092 -1066819952 22092