Reordering a vector. kernel working only for single precision

h0h0 · July 30, 2011, 1:19pm

I have a program that basically amounts to rearranging an array of size num_elements.

The code below works fine in single precision, but sometimes produces wrong values in double precision.

card: Tesla M1060

cuda 4.0

#include <stdlib.h>

#include <stdio.h>

// Gathers elements of `src` into `dst` through an index table:
//   dst[i] = src[map[i]]   for every i in [0, n)
//
//   dst - output array; elements 0 .. n-1 are written
//   src - input array; read at the positions listed in `map`
//   map - n source indices; they are NOT range-checked here, so every entry
//         must be a valid index into `src`
//   n   - number of elements to produce
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Each thread starts at its global index and then advances by the total
// number of launched threads, so the result is the same whether or not the
// grid covers all n elements, and the index arithmetic is done in size_t so
// it does not wrap for very large n.
//
// NOTE(review): the accompanying post reports wrong results once `float` is
// replaced by `double`. Presumably the build targeted an architecture without
// double-precision support, where nvcc demotes double to float in device code
// while the host still copies 8-byte elements — confirm by compiling with
// -arch=sm_13 for the Tesla M1060.
__global__ void rearrange(float *dst, const float *src,
                          const int *map, const size_t n)
{
  // total number of threads in the launch = distance between two elements
  // handled by the same thread
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  // the loop condition doubles as the out-of-bounds guard
  for(size_t i = (size_t)blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride) {
    dst[i] = src[map[i]];
  }
}

// Prints a diagnostic for a failed CUDA runtime call.
//   status - value returned by the runtime call (or by cudaGetLastError())
//   what   - short description of the call, used in the message
// Returns true when `status` is an error, false on cudaSuccess.
static bool reportCudaError(cudaError_t status, const char *what)
{
  if(status == cudaSuccess) {
    return false;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return true;
}

// Self-test for the rearrange kernel: builds a mapping that reverses an array,
// runs the kernel on the device, and compares the result against the same
// gather computed on the host.
// Returns 0 when every element matches, 1 on any allocation failure, CUDA
// error, or mismatch. All buffers are released on every path.
int main(void)
{
  const int num_elements = 19*23520;

  // compute the size of the arrays in bytes (size_t so large counts cannot
  // overflow an int)
  const size_t num_bytes   = num_elements * sizeof(float);
  const size_t num_bytes_d = num_elements * sizeof(int);

  int   *device_map = 0;
  float *device_dst = 0;
  float *device_src = 0;

  int   *host_map = 0;
  float *host_src = 0;
  float *host_out = 0;

  int exit_code = 1;

  // single-pass block: `break` jumps to the shared cleanup code below
  do {
    // malloc the host arrays
    host_src = (float*)malloc(num_bytes);
    host_out = (float*)malloc(num_bytes);
    host_map = (int*)malloc(num_bytes_d);

    if(host_src == 0 || host_out == 0 || host_map == 0) {
      printf("couldn't allocate memory\n");
      break;
    }

    // cudaMalloc the device arrays
    if(reportCudaError(cudaMalloc((void**)&device_dst, num_bytes),   "cudaMalloc(device_dst)") ||
       reportCudaError(cudaMalloc((void**)&device_src, num_bytes),   "cudaMalloc(device_src)") ||
       reportCudaError(cudaMalloc((void**)&device_map, num_bytes_d), "cudaMalloc(device_map)")) {
      printf("couldn't allocate memory\n");
      break;
    }

    // define a mapping which reverses the order
    for(int i = 0, j = num_elements-1; i < num_elements; ++i, --j) {
      host_map[i] = j;
      host_src[i] = (float)i;
    }

    if(reportCudaError(cudaMemcpy(device_src, host_src, num_bytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(device_src)") ||
       reportCudaError(cudaMemcpy(device_map, host_map, num_bytes_d, cudaMemcpyHostToDevice),
                       "cudaMemcpy(device_map)")) {
      break;
    }

    // one thread per element; the ceiling division accounts for a possible
    // partial final block
    const unsigned int block_size = 128;
    const unsigned int grid_size  = (num_elements + block_size - 1) / block_size;

    // launch the kernel
    rearrange<<<grid_size, block_size>>>(device_dst, device_src, device_map, num_elements);

    // a launch does not return a status itself; query it explicitly
    if(reportCudaError(cudaGetLastError(), "rearrange launch")) {
      break;
    }

    // copy the result back to the host memory space; checking this call's
    // status also reports a failure of the kernel itself
    if(reportCudaError(cudaMemcpy(host_out, device_dst, num_bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(host_out)")) {
      break;
    }

    // check result against the same gather done on the host; stop at the
    // first mismatch
    bool mismatch = false;
    for(int i = 0; i < num_elements && !mismatch; ++i) {
      if( host_out[i] != host_src[host_map[i]] ) {
        fprintf(stderr, "%g vs %g\n", host_out[i], host_src[host_map[i]]);
        mismatch = true;
      }
    }
    if(mismatch) {
      break;
    }

    exit_code = 0;
  } while(0);

  // deallocate memory (null pointers are accepted by both free and cudaFree)
  free(host_map);
  free(host_src);
  free(host_out);

  cudaFree(device_map);
  cudaFree(device_dst);
  cudaFree(device_src);

  return exit_code;
}

Topic		Replies	Views
What's wrong with this extremely simple program? CUDA Programming and Performance	2	764	July 16, 2010
N-dimensional array reordering Strange errors for large array reordering on GPUs CUDA Programming and Performance	4	743	February 11, 2012
Help with cuda 2d array CUDA Programming and Performance	6	7460	September 29, 2014
having problem with simpe CUDA code Code debug CUDA Programming and Performance	4	1630	November 7, 2009
Problem in passing an integer array to global function and device function CUDA Programming and Performance	2	1571	June 11, 2014
2d array not properly working CUDA Programming and Performance	3	1512	October 2, 2008
Problem with getting data from blocks CUDA Programming and Performance	3	2654	December 10, 2007
CUDA reduction precision issues CUDA Programming and Performance cuda	1	419	December 13, 2022
I can access to only the first 8 elements of the array cannot acces to every element of the array CUDA Programming and Performance	5	5459	October 27, 2009
My first program with CUDA need some help CUDA Programming and Performance	3	2566	August 10, 2009

Reordering a vector. kernel working only for single precision

Related topics