Reordering a vector. kernel working only for single precision

I have a program, which basically amounts to rearranging an array of size num_elements.

The code below works fine for single precision, but gets the wrong values sometimes with double precision.

card: Tesla M1060

cuda 4.0

#include <stdlib.h>

#include <stdio.h>

__global__ void rearrange(float *dst, float *src,

                          int *map, const size_t n)

{

  // compute the global element index this thread should process

  unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;

// avoid accessing out of bounds elements

  if(i < n) {

    dst[i] = src[map[i]];

  }

}

int main(void)

{

  const int num_elements = 19*23520;

// compute the size of the arrays in bytes

  const int num_bytes = num_elements * sizeof(float);

  const int num_bytes_d = num_elements * sizeof(int);

int *device_map    = 0;

  float *device_dst = 0;

  float *device_src = 0;

int *host_map    = 0;

  float *host_dst = 0;

  float *host_src = 0;

  float *host_out = 0;

// malloc the host arrays

  host_dst = (float*)malloc(num_bytes);

  host_src = (float*)malloc(num_bytes);

  host_out = (float*)malloc(num_bytes);

  host_map = (int*)malloc(num_bytes_d);

// cudaMalloc the device arrays

  cudaMalloc((void**)&device_dst, num_bytes);

  cudaMalloc((void**)&device_src, num_bytes);

  cudaMalloc((void**)&device_map, num_bytes_d);

// if any memory allocation failed, report an error message

  if(host_dst == 0 || host_src == 0 || host_out == 0 || host_map == 0 ||

     device_dst == 0 || device_src == 0 || device_map == 0) {

        printf("couldn't allocate memory\n");

        return 1;

  }

// define a mapping which reverses the order

  for(int i = 0, j = num_elements-1; i < num_elements; ++i, --j) {

        host_map[i] = j;

        host_src[i] = i;

  }

cudaMemcpy(device_src, host_src, num_bytes, cudaMemcpyHostToDevice);

  cudaMemcpy(device_map, host_map, num_bytes_d, cudaMemcpyHostToDevice);

const size_t block_size = 128;

  size_t grid_size = num_elements / block_size;

// deal with a possible partial final block

  if(num_elements % block_size) 

      ++grid_size;

// launch the kernel

  rearrange<<<grid_size, block_size>>>(device_dst, device_src, device_map, num_elements);

// copy the result back to the host memory space

  cudaMemcpy(host_out, device_dst, num_bytes, cudaMemcpyDeviceToHost);

// check result

  for(int i = 0; i < num_elements; ++i) {

      if( host_out[i] != host_src[host_map[i]] ) {

          fprintf(stderr, "%g vs %g\n", host_out[i], host_src[host_map[i]]);

          return 1;

      }

  }

  // deallocate memory

  free(host_map); 

  free(host_dst);

  free(host_src);

  free(host_out);

cudaFree(device_map);

  cudaFree(device_dst);

  cudaFree(device_src);

}