Different results when altering the number of threads and the number of blocks

I have a Jetson Nano and the following piece of code:

  8 __global__ void vec_min(float * a, float * b, float * c)
  9 {
 10     int i = threadIdx.x;
 11     c[i] = min(a[i], b[i]);
 12 }
 13
 14 void print(float src[4])
 15 {
 16     for (int i=0; i < 4; ++i) {
 17         std::cout << src[i] << ' ';
 18     }
 19     std::cout << std::endl;
 20 }
 21
    // Host driver: uploads two 4-element float vectors to the device, launches
    // vec_min on them, copies the 4 results back and prints them.
    // NOTE(review): none of the CUDA runtime return codes below are checked, and
    // there is no cudaGetLastError() after the launch, so failures go unnoticed.
 22 int main()
 23 {
        // Host inputs; element 1 of `a` is a quiet NaN.
 24     std::vector<float> a = { 10, std::numeric_limits<float>::quiet_NaN(), 3, 4 };
 25     std::vector<float> b = { 1, 20, 3, 40 };
 26
        // Device buffers for the two inputs (4 floats each).
 27     float * d_a;
 28     cudaMalloc(&d_a, 4*sizeof(float));
 29
 30     float * d_b;
 31     cudaMalloc(&d_b, 4*sizeof(float));
 32
        // Upload both inputs.
 33     cudaMemcpy(d_a, a.data(), 4*sizeof(float), cudaMemcpyHostToDevice);
 34     cudaMemcpy(d_b, b.data(), 4*sizeof(float), cudaMemcpyHostToDevice);
 35
        // Output buffer. cudaMalloc does not clear the allocation, so any element
        // the kernel does not write keeps unspecified contents.
 36     float * d_mins;
 37     cudaMalloc(&d_mins, 4*sizeof(float));
 38
        // 4 blocks of 1 thread each = 4 threads in total. With this split
        // threadIdx.x is 0 in every thread and only blockIdx.x (0..3) differs.
 39     int blocks_per_grid = 4;
 40     int threads_per_block = 1;
 41
 42     vec_min<<<blocks_per_grid, threads_per_block>>>(d_a, d_b, d_mins);
 43
        // Blocking copy back to the host; issued on the default stream, so it is
        // ordered after the kernel launch above.
 44     float mins[4];
 45     cudaMemcpy(mins, d_mins, 4*sizeof(float), cudaMemcpyDeviceToHost);
 46
 47     print(mins);
        // NOTE(review): listing lines 48-53 are elided in the post; whether the
        // device buffers are released with cudaFree there is not visible.
...
 54     return 0;
 55 }

The code above is compiled with `nvcc main.cu -o main` and prints `1 0 0 0`. This output is wrong.
However, if I swap the values of `blocks_per_grid` and `threads_per_block`, the printed values are correct: `1 20 3 4`.

Why does changing the number of threads and blocks lead to different results?