Hi All!

I’m new to CUDA, and to get warmed up I have been playing with some example code that uses multi-dimensional arrays. The original code used integers; I changed these to floats so that I could play around with basic maths functions. If I run the code below, I get some weird rounding. Each number in the array should have a value that ends in .55 and, when printed to 1 decimal place, should round up to .6. However, when I actually run the code, the first group of numbers rounds to .6, the following three groups round to .5, and the remaining groups round to .6.

Can anyone explain why this is? Am I doing something wrong?

The program is running on a Quadro FX 570 (compute capability 1.1) under Windows Vista (32-bit), and was compiled with MSVC++ 2010 Express.

Here is the code:

#include <stdlib.h>

#include <stdio.h>

/*
 * Fills `array` so that every element holds the linear index of the block
 * that wrote it, plus 0.55.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread writes exactly one
 * element, addressed row-major with a row width of gridDim.x * blockDim.x.
 *
 * Precondition: `array` must hold at least
 *   (gridDim.x * blockDim.x) * (gridDim.y * blockDim.y)
 * floats. There is no bounds guard, so the launch must cover the buffer
 * exactly and never exceed it.
 */
__global__ void kernel(float *array)
{
    // global 2D coordinates of this thread
    const int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int index_y = blockIdx.y * blockDim.y + threadIdx.y;

    // map the two 2D indices to a single linear, 1D index
    const int grid_width = gridDim.x * blockDim.x;
    const int index = index_y * grid_width + index_x;

    // map the two 2D block indices to a single linear, 1D block index
    const unsigned int block_index = blockIdx.y * gridDim.x + blockIdx.x;

    // Add 0.55 in single precision (note the `f` suffix: a bare 0.55 is a
    // double literal). The stored value is only the nearest representable
    // float to "block_index + 0.55"; it is NOT exactly x.55. For example the
    // nearest floats to 1.55, 2.55 and 3.55 are slightly below x.55, while
    // those for 0.55 and 4.55 are slightly above, so printing with one
    // decimal place shows a mix of x.5 and x.6.
    const float result = static_cast<float>(block_index) + 0.55f;

    // write out the result
    array[index] = result;
}

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Allocates a 16x16 float array on host and device, launches `kernel` over a
 * 2D grid of 4x4 thread blocks, copies the result back and prints it row by
 * row with one decimal place.
 *
 * Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
 */
int main(void)
{
    const int num_elements_x = 16;
    const int num_elements_y = 16;

    // the buffers hold floats, so size them with sizeof(float)
    const size_t num_bytes =
        (size_t)num_elements_x * (size_t)num_elements_y * sizeof(float);

    float *device_array = 0;
    float *host_array = 0;

    // create two dimensional 4x4 thread blocks
    dim3 block_size;
    block_size.x = 4;
    block_size.y = 4;

    // configure a two dimensional grid as well
    dim3 grid_size;
    grid_size.x = num_elements_x / block_size.x;
    grid_size.y = num_elements_y / block_size.y;

    bool ok = true;

    // The kernel has no bounds guard, so the grid must tile the array exactly.
    if (num_elements_x % block_size.x != 0 || num_elements_y % block_size.y != 0)
    {
        fprintf(stderr, "array dimensions must be multiples of the block dimensions\n");
        ok = false;
    }

    // allocate memory in either space
    if (ok)
    {
        host_array = (float*)malloc(num_bytes);
        if (host_array == 0)
        {
            fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)num_bytes);
            ok = false;
        }
    }

    if (ok)
    {
        ok = cuda_ok(cudaMalloc((void**)&device_array, num_bytes), "cudaMalloc");
    }

    if (ok)
    {
        // grid_size & block_size are passed as arguments to the triple chevrons as usual
        kernel<<<grid_size, block_size>>>(device_array);

        // a launch does not return a status itself; query it explicitly
        ok = cuda_ok(cudaGetLastError(), "kernel launch");
    }

    if (ok)
    {
        // download and inspect the result on the host; this blocking copy
        // also reports errors raised while the kernel was executing
        ok = cuda_ok(cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy (device to host)");
    }

    if (ok)
    {
        // print out the result element by element
        for (int row = 0; row < num_elements_y; ++row)
        {
            for (int col = 0; col < num_elements_x; ++col)
            {
                printf("%2.1f ", host_array[row * num_elements_x + col]);
            }
            printf("\n");
        }
        printf("\n");
    }

    // deallocate memory (both calls accept a null pointer)
    free(host_array);
    cudaFree(device_array);

    // wait for a key press before exiting (presumably to keep the console window open)
    getchar();

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}