I’m trying to use the code provided by Rob Farber in the Dr. Dobb’s article “CUDA, Supercomputing for the Masses: Part 3” (May 13, 2008). The code works well with types int and float, but does not work (it does not return the same numbers) with type double for host/device memory. I’m using Windows XP 64, MS Visual Studio 2005, and CUDA 2.1; my video card is a 9800 GX2.
[codebox]/* test Dr.Dobb’s reverseArray_multiblock.cu code
-
use FLOAT or DOUBLE array,
-
submit to nvidia cuda forum
*/
// includes, system
#include <stdio.h>
#include <math.h>
#include <assert.h>
// includes, project
#include <cutil_inline.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part3: implement the kernel
// Kernel: write the elements of d_in into d_out in reversed order.
//
// Each block reads one contiguous chunk of blockDim.x elements from d_in and
// writes it, reversed, into the mirrored chunk position of d_out.
//
// Launch layout: 1D grid, 1D block (only the .x components are used).
// Preconditions:
//   - d_in and d_out each hold at least gridDim.x * blockDim.x elements;
//     there is no bounds guard, so the element count must be an exact
//     multiple of the block size.
//   - d_in and d_out must not overlap (each thread reads one slot and writes
//     a different one without any synchronization).
//   - Native double arithmetic requires a device of compute capability 1.3+
//     and a matching compile target (e.g. nvcc -arch=sm_13).
__global__ void reverseArrayBlock(double *d_out, const double *d_in)
{
    // First element of the chunk this block reads.
    int inOffset = blockDim.x * blockIdx.x;
    // First element of the mirrored chunk this block writes.
    int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);

    // Thread t reads slot t of the input chunk ...
    int in = inOffset + threadIdx.x;
    // ... and writes slot (blockDim.x - 1 - t) of the output chunk.
    int out = outOffset + (blockDim.x - 1 - threadIdx.x);

    d_out[out] = d_in[in];
}
// Host driver: fills an array of doubles with random values, reverses it on
// the device with reverseArrayBlock, copies it back and verifies the result.
//
// Returns 0 when the reversed data matches the input, EXIT_FAILURE otherwise.
int main( int argc, char** argv)
{
    int status = 0;

    // Double precision needs hardware of compute capability 1.3 or higher AND
    // a matching compile target (e.g. nvcc -arch=sm_13). For lower targets
    // nvcc demotes double to float in device code, so the kernel would then
    // reinterpret the 8-byte host doubles as 4-byte floats and return garbage.
    int device = 0;
    cudaGetDevice(&device);
    checkCUDAError("cudaGetDevice");
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    checkCUDAError("cudaGetDeviceProperties");
    if (prop.major < 1 || (prop.major == 1 && prop.minor < 3))
    {
        fprintf(stderr,
                "Device \"%s\" has compute capability %d.%d; double precision "
                "requires 1.3 or higher.\n",
                prop.name, prop.major, prop.minor);
        return EXIT_FAILURE;
    }

    // define number of elements, grid, and block size
    int dimA = 256 * 1024; // 256K elements (2MB of doubles)
    int numThreadsPerBlock = 256;

    // Part 1: compute number of blocks needed based on
    // array size and desired block size. The kernel has no bounds guard,
    // so the array size must be an exact multiple of the block size.
    if (dimA % numThreadsPerBlock != 0)
    {
        fprintf(stderr, "dimA (%d) must be a multiple of the block size (%d).\n",
                dimA, numThreadsPerBlock);
        return EXIT_FAILURE;
    }
    int numBlocks = dimA / numThreadsPerBlock;

    // allocate host memory: h_a is the working buffer, h_ref keeps a copy of
    // the original input so the reversal can be verified afterwards
    size_t memSize = (size_t)dimA * sizeof(double);
    double *h_a = (double *) malloc(memSize);
    double *h_ref = (double *) malloc(memSize);
    if (h_a == NULL || h_ref == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n",
                (unsigned long)memSize);
        free(h_a);
        free(h_ref);
        return EXIT_FAILURE;
    }

    // allocate device memory
    double *d_a = NULL;
    double *d_b = NULL;
    cudaMalloc( (void **) &d_a, memSize );
    checkCUDAError("cudaMalloc d_a");
    cudaMalloc( (void **) &d_b, memSize );
    checkCUDAError("cudaMalloc d_b");

    // Initialize input array on host
    for (int i = 0; i < dimA; i++)
    {
        h_a[i] = rand() / (double)RAND_MAX;
        h_ref[i] = h_a[i];
        if (i == 0 || i == dimA-1)
            printf("h_a[%d] %4.4f \n",i,h_a[i]);
    }

    // Copy host array to device array
    cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
    checkCUDAError("memcpy host to device");

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a );
    // launch-configuration errors are reported immediately
    checkCUDAError("kernel launch");

    // block until the device has completed; execution errors surface here
    cudaThreadSynchronize();
    checkCUDAError("kernel execution");

    // device to host copy
    cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
    checkCUDAError("memcpy");

    // verify the data returned to the host is correct: element i of the
    // result must equal element (dimA - 1 - i) of the input. The kernel only
    // copies values, so exact equality is expected.
    int mismatches = 0;
    for (int i = 0; i < dimA; i++)
    {
        if (h_a[i] != h_ref[dimA - 1 - i])
        {
            if (mismatches == 0)
                fprintf(stderr, "Mismatch at index %d: got %g, expected %g\n",
                        i, h_a[i], h_ref[dimA - 1 - i]);
            mismatches++;
        }
        if (i == 0 || i == dimA-1)
            printf("h_a[%d] %4.4f \n",i,h_a[i]);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);
    free(h_ref);

    if (mismatches == 0)
    {
        printf("Correct!\n");
    }
    else
    {
        fprintf(stderr, "FAILED: %d of %d elements are wrong.\n",
                mismatches, dimA);
        status = EXIT_FAILURE;
    }

    cudaThreadExit();
    cutilExit(argc, argv);
    return status;
}
// Query the CUDA runtime for its last recorded error. If there is none,
// return normally; otherwise print the caller-supplied label `msg` together
// with the runtime's error description to stderr and terminate the process.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();

    // Nothing pending: hand control straight back to the caller.
    if (status == cudaSuccess)
        return;

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
[/codebox]
I need to use double precision variables for a research project at school (physics department).
Thank you.