Use_fast_math Difference

Hi: I think this is a bug, but I’m just starting with CUDA so it’s far more likely to be an error on my part. I would appreciate it if someone else would take a look at this.

Here is a matrix of my results using various compiler options for the code shown at the end (the forum wouldn’t let me attach a .cu file):

               Use Fast Math     No Use Fast Math

Debug              OK                   Incorrect

No Debug           OK                   Incorrect

Emulation          OK                   OK

The expected results are:       The incorrect results are:

0.866025 0.866025 0.866025      0.866025 0.866025 0.866025

0.866025 0.866025 -0.866025     0.866025 0.866025 0.866025 <- wrong

0.52449 2.34138 2.34138         0.52449 2.34138 2.34138

0.52449 2.34138 -2.34138        0.52449 2.34138 2.34138 <- wrong

I’m using the NVIDIA common.mk, so Debug (dbg=1 in the Makefile) results in the command line:

nvcc -o obj/debug/redux.cu_o -c redux.cu -D_DEBUG -keep -I. -I/usr/local/cuda/include -I/home/allen/NVIDIA_CUDA_SDK/common/…/common/inc -DUNIX -g

g++ -fPIC -o /home/allen/NVIDIA_CUDA_SDK/common/…/bin/linux/debug/redux obj/debug/redux.cu_o -L/usr/local/cuda/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/common/lib -lcuda -lcudart -lGL -lGLU -lcutilD

and dbg=0 yields:

nvcc -o obj/release/redux.cu_o -c redux.cu -keep -I. -I/usr/local/cuda/include -I/home/allen/NVIDIA_CUDA_SDK/common/…/common/inc -DUNIX -O3

g++ -fPIC -o /home/allen/NVIDIA_CUDA_SDK/common/…/bin/linux/release/redux obj/release/redux.cu_o -L/usr/local/cuda/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/common/lib -lcuda -lcudart -lGL -lGLU -lcutil

I’m running RHEL 4 x86_64 with a 8800ultra, on a Tyan S2927 MB with dual Opteron 2216, 4 GiB mem. Using CUDA SDK 1.0, compiler version

nvcc: NVIDIA ® Cuda compiler driver

Copyright © 2005-2006 NVIDIA Corporation

Built on Thu_Jun_28_09:57:19_PDT_2007

Cuda compilation tools, release 1.0, V0.2.1221

#include <cstdio>

#include <memory.h>

#include "cutil.h"

// Problem dimensions: NX*NY*NZ cells, NQDPT quadrature points per cell.
const int NX = 2;

const int NY = 1;

const int NZ = 1;

const int NQDPT = 2;

// NOTE: despite the name, this is (1/sqrt(3)) / 2, i.e. half of the
// 1D Gauss-Legendre 2-point abscissa 1/sqrt(3) -- the offset of a
// quadrature point from the cell center for a unit cell.
const float M_1_SQRT3 = 0.577350269f / 2.f;

// Half-cell offset from the integer cell index to the cell center.
const float M_HALF    = 0.5f;

// Some quadrature points relative to the center of the cell.

// Quadrature-point offsets relative to the cell's integer corner
// (cell-center 0.5 plus/minus the Gauss abscissa).  Element [1] has a
// NEGATIVE z component -- this sign is exactly what the incorrect
// (non-fast-math) builds lose, so the reported bug may be in how the
// compiler materializes these __constant__ initializer expressions.
__constant__ float3 gauss2[NQDPT] = {

  { M_1_SQRT3 + M_HALF, M_1_SQRT3 + M_HALF, M_1_SQRT3 + M_HALF },

  { M_1_SQRT3 + M_HALF, M_1_SQRT3 + M_HALF, -M_1_SQRT3 + M_HALF },

};

// Borrowed from CudaMath.h

// Component-wise vector addition (borrowed from CudaMath.h).
inline __device__ __host__ float3 operator +(float3 a, float3 b)
{
    float3 sum;
    sum.x = a.x + b.x;
    sum.y = a.y + b.y;
    sum.z = a.z + b.z;
    return sum;
}

// Component-wise vector subtraction (borrowed from CudaMath.h).
inline __device__ __host__ float3 operator -(float3 a, float3 b)
{
    float3 diff;
    diff.x = a.x - b.x;
    diff.y = a.y - b.y;
    diff.z = a.z - b.z;
    return diff;
}

// Euclidean inner product.  Terms are accumulated left to right so the
// floating-point evaluation order matches the original expression.
inline __device__ __host__ float dot(float3 a, float3 b)
{
    float acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    return acc;
}

// Scale a vector by 1/l.  The reciprocal is computed once and each
// component multiplied by it, exactly as the original did (one division,
// three multiplies), so rounding behavior is unchanged.
inline __device__ __host__ float3 operator /(float3 a, float l)
{
  const float recip = 1.f / l;
  float3 scaled;
  scaled.x = recip * a.x;
  scaled.y = recip * a.y;
  scaled.z = recip * a.z;
  return scaled;
}

// For every (quadrature point, cell) pair, compute the per-axis step
// sizes from `start` along the unit direction toward that quadrature
// point, and write them into p_bucket.
//
// Launch layout (see main): grid = dim3(NY, NZ), block = dim3(NQDPT, NX),
// so q in [0,NQDPT), i in [0,NX), j in [0,NY), k in [0,NZ) and ipt1
// below is a unique flat index -- no bounds check is needed because the
// launch exactly tiles the problem.
//
// NOTE(review): this kernel is a minimal reproducer for a suspected
// compiler/ptxas bug (the sign of gauss2[1].z is lost in non-fast-math
// device builds), so the code is deliberately left byte-for-byte as
// reported.
__global__ void hereToThere ( const float3 start, float3* p_bucket )

{

  int q = threadIdx.x;

  int i = threadIdx.y;

  int j = blockIdx.x;

  int k = blockIdx.y;

  // Flat output index: q fastest, then i, j, k.
  int ipt1 = q + i * NQDPT + j * NQDPT * NX + k * NQDPT * NX * NY;

  // Locate our cell and our Gauss point.

  float3 p1    = make_float3( i, j, k ) +  gauss2[q];

 // Compute the normalized direction vector from the start point.

  float3 dp    = p1 - start;

  // Distance from start to the Gauss point (assumed nonzero here --
  // if a Gauss point ever coincided with `start`, t would be 0 and
  // omega would be NaN/Inf; TODO confirm inputs exclude that case).
  float  t     = sqrtf( dot( dp, dp ) );

  float3 omega = dp / t;

  // Initial step size in the omega direction.

  float3 p0    = start;

  // Per-axis parametric distances p0 / omega.  Any zero omega
  // component yields Inf here; the test inputs avoid that.
  float dx = p0.x / omega.x;

  float dy = p0.y / omega.y;

  float dz = p0.z / omega.z;

  // Return the distances we just calculated.

  p_bucket[ipt1] = make_float3( dx, dy, dz );

}

// Driver: allocates the result buffer, launches hereToThere over all
// (quadrature point, cell) pairs, copies the results back, and prints
// one "dx dy dz" line per pair.  Returns 0 on success, 1 on any error.
//
// Fixes over the original: the host malloc result is now checked, and
// both the device and host buffers are released on every exit path
// (the original leaked both and never checked malloc).
int main ( int argc, char* argv[] )
{
  CUT_DEVICE_INIT();

  // Ray origin: the center of cell (0,0,0).
  const float3 start = { 0.5f, 0.5f, 0.5f };

  // One float3 result per (quadrature point, cell).
  const size_t bucketBytes = NQDPT * NX * NY * NZ * sizeof(float3);

  cudaError_t err;

  float3* p_bucket = 0;
  CUDA_SAFE_CALL( cudaMalloc( (void**)&p_bucket, bucketBytes ) );
  err = cudaThreadSynchronize();
  if ( err != 0 ) {
    printf( "malloc error: %d\n", err );
    return 1;
  }

  CUDA_SAFE_CALL( cudaMemset( (void*)p_bucket, 0, bucketBytes ) );
  err = cudaThreadSynchronize();
  if ( err != 0 ) {
    printf( "memset error: %d\n", err );
    cudaFree( p_bucket );
    return 1;
  }

  // Grid covers (j,k) = (NY,NZ); block covers (q,i) = (NQDPT,NX).
  hereToThere<<<dim3(NY,NZ), dim3(NQDPT,NX)>>>( start, p_bucket );
  err = cudaThreadSynchronize();
  if ( err != 0 ) {
    printf( "Kernel error: %d\n", err );
    cudaFree( p_bucket );
    return 1;
  }

  // Host-side copy of the results.
  float3* g_p_bucket = (float3*)malloc( bucketBytes );
  if ( g_p_bucket == NULL ) {
    printf( "host malloc failed\n" );
    cudaFree( p_bucket );
    return 1;
  }

  CUDA_SAFE_CALL( cudaMemcpy( g_p_bucket, p_bucket, bucketBytes,
         cudaMemcpyDeviceToHost ) );
  err = cudaThreadSynchronize();
  if ( err != 0 ) {
    printf( "memcpy error: %d\n", err );
    free( g_p_bucket );
    cudaFree( p_bucket );
    return 1;
  }

  for ( int qijk = 0; qijk < NQDPT * NX * NY * NZ; qijk++ ) {
    printf( "%g %g %g\n",
     g_p_bucket[qijk].x, g_p_bucket[qijk].y, g_p_bucket[qijk].z );
  }

  free( g_p_bucket );
  CUDA_SAFE_CALL( cudaFree( p_bucket ) );
  return 0;
}

That looks like the bug I found earlier.
http://forums.nvidia.com/index.php?showtopic=40674
Possibly a ptxas issue.
But you never assigned anything twice, so maybe you found a new bug.
I hope someone from NVIDIA will investigate.