Hi: I think this is a bug, but I’m just starting with CUDA so it’s far more likely to be an error on my part. I would appreciate it if someone else would take a look at this.
Here is a matrix of my results using various compiler options for the code shown at the end (the forum wouldn’t let me attach a .cu file):
Use Fast Math No Use Fast Math
Debug OK Incorrect
No Debug OK Incorrect
Emulation OK OK
The expected results are: The incorrect results are:
0.866025 0.866025 0.866025 0.866025 0.866025 0.866025
0.866025 0.866025 -0.866025 0.866025 0.866025 0.866025 <- wrong
0.52449 2.34138 2.34138 0.52449 2.34138 2.34138
0.52449 2.34138 -2.34138 0.52449 2.34138 2.34138 <- wrong
I’m using the NVIDIA common.mk, so Debug (dbg=1 in the Makefile) results in the command line:
nvcc -o obj/debug/redux.cu_o -c redux.cu -D_DEBUG -keep -I. -I/usr/local/cuda/include -I/home/allen/NVIDIA_CUDA_SDK/common/…/common/inc -DUNIX -g
g++ -fPIC -o /home/allen/NVIDIA_CUDA_SDK/common/…/bin/linux/debug/redux obj/debug/redux.cu_o -L/usr/local/cuda/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/common/lib -lcuda -lcudart -lGL -lGLU -lcutilD
and dbg=0 yields:
nvcc -o obj/release/redux.cu_o -c redux.cu -keep -I. -I/usr/local/cuda/include -I/home/allen/NVIDIA_CUDA_SDK/common/…/common/inc -DUNIX -O3
g++ -fPIC -o /home/allen/NVIDIA_CUDA_SDK/common/…/bin/linux/release/redux obj/release/redux.cu_o -L/usr/local/cuda/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/lib -L/home/allen/NVIDIA_CUDA_SDK/common/…/common/lib -lcuda -lcudart -lGL -lGLU -lcutil
I’m running RHEL 4 x86_64 with a 8800ultra, on a Tyan S2927 MB with dual Opteron 2216, 4 GiB mem. Using CUDA SDK 1.0, compiler version
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2006 NVIDIA Corporation
Built on Thu_Jun_28_09:57:19_PDT_2007
Cuda compilation tools, release 1.0, V0.2.1221
#include <cstdio>
#include <memory.h>
#include "cutil.h"
const int NX = 2;
const int NY = 1;
const int NZ = 1;
const int NQDPT = 2;
const float M_1_SQRT3 = 0.577350269f / 2.f;
const float M_HALF = 0.5f;
// Some quadrature points relative to the center of the cell.
__constant__ float3 gauss2[NQDPT] = {
{ M_1_SQRT3 + M_HALF, M_1_SQRT3 + M_HALF, M_1_SQRT3 + M_HALF },
{ M_1_SQRT3 + M_HALF, M_1_SQRT3 + M_HALF, -M_1_SQRT3 + M_HALF },
};
// Borrowed from CudaMath.h
inline __device__ __host__ float3 operator +(float3 a, float3 b)
{
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
}
inline __device__ __host__ float3 operator -(float3 a, float3 b)
{
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
}
inline __device__ __host__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline __device__ __host__ float3 operator /(float3 a, float l)
{
float l_inv = 1.f / l;
return make_float3( l_inv * a.x, l_inv * a.y, l_inv * a.z );
}
__global__ void hereToThere ( const float3 start, float3* p_bucket )
{
int q = threadIdx.x;
int i = threadIdx.y;
int j = blockIdx.x;
int k = blockIdx.y;
int ipt1 = q + i * NQDPT + j * NQDPT * NX + k * NQDPT * NX * NY;
// Locate our cell and our Gauss point.
float3 p1 = make_float3( i, j, k ) + gauss2[q];
// Compute the normalized direction vector from the start point.
float3 dp = p1 - start;
float t = sqrtf( dot( dp, dp ) );
float3 omega = dp / t;
// Initial step size in the omega direction.
float3 p0 = start;
float dx = p0.x / omega.x;
float dy = p0.y / omega.y;
float dz = p0.z / omega.z;
// Return the distances we just calculated.
p_bucket[ipt1] = make_float3( dx, dy, dz );
}
int main ( int argc, char* argv[] )
{
CUT_DEVICE_INIT();
const float3 start = { 0.5f, 0.5f, 0.5f };
cudaError_t err;
float3* p_bucket;
CUDA_SAFE_CALL( cudaMalloc( (void**)&p_bucket,
NQDPT * NX * NY * NZ * sizeof(float3) ) );
err = cudaThreadSynchronize();
if ( err != 0 ) {
printf( "malloc error: %d\n", err );
return 1;
}
CUDA_SAFE_CALL( cudaMemset( (void*)p_bucket, 0,
NQDPT * NX * NY * NZ * sizeof(float3) ) );
err = cudaThreadSynchronize();
if ( err != 0 ) {
printf( "memset error: %d\n", err );
return 1;
}
hereToThere<<<dim3(NY,NZ), dim3(NQDPT,NX)>>>( start, p_bucket );
err = cudaThreadSynchronize();
if ( err != 0 ) {
printf( "Kernel error: %d\n", err );
return 1;
}
float3* g_p_bucket = (float3*)malloc( NQDPT * NX * NY * NZ * sizeof(float3) );
CUDA_SAFE_CALL( cudaMemcpy( g_p_bucket, p_bucket,
NQDPT * NX * NY * NZ * sizeof(float3),
cudaMemcpyDeviceToHost ) );
err = cudaThreadSynchronize();
if ( err != 0 ) {
printf( "memcpy error: %d\n", err );
return 1;
}
for ( int qijk = 0; qijk < NQDPT* NX * NY * NZ; qijk++ ) {
printf( "%g %g %g\n",
g_p_bucket[qijk].x, g_p_bucket[qijk].y, g_p_bucket[qijk].z );
}
return 0;
}